diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000000..c40783bc1b --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,16 @@ +# The binaries are really slow, if you compile them in 'dev' mode with the defaults. +# Enable some optimizations even in 'dev' mode, to make tests faster. The basic +# optimizations enabled by "opt-level=1" don't affect debuggability too much. +# +# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/ +# +[profile.dev.package."*"] +# Set the default for dependencies in Development mode. +opt-level = 3 + +[profile.dev] +# Turn on a small amount of optimization in Development mode. +opt-level = 1 + +[alias] +build_testing = ["build", "--features", "testing"] diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 73c487c301..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,682 +0,0 @@ -version: 2.1 - -executors: - zenith-xlarge-executor: - resource_class: xlarge - docker: - # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: zimg/rust:1.56 - zenith-executor: - docker: - - image: zimg/rust:1.56 - -jobs: - check-codestyle-rust: - executor: zenith-xlarge-executor - steps: - - checkout - - run: - name: rustfmt - when: always - command: cargo fmt --all -- --check - - # A job to build postgres - build-postgres: - executor: zenith-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (circleci doesn't have a flag to enable submodules here) - - checkout - - # Grab the postgres git revision to build a cache key. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - # Build postgres if the restore_cache didn't find a build. - # `make` can't figure out whether the cache is valid, since - # it only compares file timestamps. - - run: - name: build postgres - command: | - if [ ! -e tmp_install/bin/postgres ]; then - # "depth 1" saves some time by not cloning the whole repo - git submodule update --init --depth 1 - # bail out on any warnings - COPT='-Werror' mold -run make postgres -j$(nproc) - fi - - - save_cache: - name: Save postgres cache - key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - paths: - - tmp_install - - # A job to build zenith rust code - build-zenith: - executor: zenith-xlarge-executor - parameters: - build_type: - type: enum - enum: ["debug", "release"] - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - # Checkout the git repo (without submodules) - - checkout - - # Grab the postgres git revision to build a cache key. - # Note this works even though the submodule hasn't been checkout out yet. - - run: - name: Get postgres cache key - command: | - git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres - - - restore_cache: - name: Restore postgres cache - keys: - # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. 
While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. - - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - - # Build the rust code, including test binaries - - run: - name: Rust build << parameters.build_type >> - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - CARGO_FLAGS= - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - CARGO_FLAGS=--release - fi - - export CARGO_INCREMENTAL=0 - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests - - - save_cache: - name: Save rust cache - key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} - paths: - - ~/.cargo/registry - - ~/.cargo/git - - target - - # Run style checks - # has to run separately from cargo fmt section - # since needs to run with dependencies - - run: - name: cargo clippy - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - "${cov_prefix[@]}" ./run_clippy.sh - - # Run rust unit tests - - run: - name: cargo test - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - "${cov_prefix[@]}" cargo test - - # Install the rust binaries, for use by test jobs - - run: - name: Install rust binaries - command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | - jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' - ) - - test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - - mkdir -p /tmp/zenith/bin - mkdir -p /tmp/zenith/test_bin - mkdir -p /tmp/zenith/etc - - # Install target binaries - for bin in $binaries; do - SRC=target/$BUILD_TYPE/$bin - DST=/tmp/zenith/bin/$bin - cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list - done - - # Install test executables (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/zenith/test_bin/$(basename $bin) - cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list - done - fi - - # Install the postgres binaries, for use by test jobs - - run: - name: Install postgres binaries - command: | - cp -a tmp_install /tmp/zenith/pg_install - - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - - # Save the rust binaries and coverage data for other jobs in this workflow. 
- - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - - check-codestyle-python: - executor: zenith-executor - steps: - - checkout - - restore_cache: - keys: - - v1-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v1-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Run yapf to ensure code format - when: always - command: poetry run yapf --recursive --diff . - - run: - name: Run mypy to check types - when: always - command: poetry run mypy . - - run-pytest: - executor: zenith-executor - parameters: - # pytest args to specify the tests to run. - # - # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory, - # or '-k foobar' to run tests containing string 'foobar'. See pytest man page - # section SPECIFYING TESTS / SELECTING TESTS for details. - # - # Select the type of Rust build. Must be "release" or "debug". - build_type: - type: string - default: "debug" - # This parameter is required, to prevent the mistake of running all tests in one job. - test_selection: - type: string - default: "" - # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr - extra_params: - type: string - default: "" - needs_postgres_source: - type: boolean - default: false - run_in_parallel: - type: boolean - default: true - save_perf_report: - type: boolean - default: false - environment: - BUILD_TYPE: << parameters.build_type >> - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - when: - condition: << parameters.needs_postgres_source >> - steps: - - run: git submodule update --init --depth 1 - - restore_cache: - keys: - - v1-python-deps-{{ checksum "poetry.lock" }} - - run: - name: Install deps - command: ./scripts/pysync - - save_cache: - key: v1-python-deps-{{ checksum "poetry.lock" }} - paths: - - /home/circleci/.cache/pypoetry/virtualenvs - - run: - name: Run pytest - # pytest doesn't output test logs in real time, so CI job may fail with - # `Too long with no output` error, if a test is running for a long time. - # In that case, tests should have internal timeouts that are less than - # no_output_timeout, specified here. - no_output_timeout: 10m - environment: - - ZENITH_BIN: /tmp/zenith/bin - - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - - TEST_OUTPUT: /tmp/test_output - # this variable will be embedded in perf test report - # and is needed to distinguish different environments - - PLATFORM: zenith-local-ci - command: | - PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" - rm -rf $PERF_REPORT_DIR - - TEST_SELECTION="test_runner/<< parameters.test_selection >>" - EXTRA_PARAMS="<< parameters.extra_params >>" - if [ -z "$TEST_SELECTION" ]; then - echo "test_selection must be set" - exit 1 - fi - if << parameters.run_in_parallel >>; then - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" - fi - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - mkdir -p "$PERF_REPORT_DIR" - EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" - fi - fi - - export GITHUB_SHA=$CIRCLE_SHA1 - - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - - # Run the tests. - # - # The junit.xml file allows CircleCI to display more fine-grained test information - # in its "Tests" tab in the results page. 
- # --verbose prints name of each test (helpful when there are - # multiple tests in one file) - # -rA prints summary in the end - # -n4 uses four processes to run tests via pytest-xdist - # -s is not used to prevent pytest from capturing output, because tests are running - # in parallel and logs are mixed between different tests - "${cov_prefix[@]}" ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "not remote_cluster" \ - -rA $TEST_SELECTION $EXTRA_PARAMS - - if << parameters.save_perf_report >>; then - if [[ $CIRCLE_BRANCH == "main" ]]; then - export REPORT_FROM="$PERF_REPORT_DIR" - export REPORT_TO=local - scripts/generate_and_push_perf_report.sh - fi - fi - - run: - # CircleCI artifacts are preserved one file at a time, so skipping - # this step isn't a good idea. If you want to extract the - # pageserver state, perhaps a tarball would be a better idea. - name: Delete all data but logs - when: always - command: | - du -sh /tmp/test_output/* - find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete - du -sh /tmp/test_output/* - - store_artifacts: - path: /tmp/test_output - # The store_test_results step tells CircleCI where to find the junit.xml file. - - store_test_results: - path: /tmp/test_output - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - # Save coverage data (if any) - - persist_to_workspace: - root: /tmp/zenith - paths: - - "*" - - coverage-report: - executor: zenith-xlarge-executor - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. 
- - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }} - - run: - name: Build coverage report - command: | - COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1 - - scripts/coverage \ - --dir=/tmp/zenith/coverage report \ - --input-objects=/tmp/zenith/etc/binaries.list \ - --commit-url=$COMMIT_URL \ - --format=github - - run: - name: Upload coverage report - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1 - COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1 - - scripts/git-upload \ - --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \ - --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE - - # Add link to the coverage report to the commit - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"success\", - \"context\": \"zenith-coverage\", - \"description\": \"Coverage report is ready\", - \"target_url\": \"$REPORT_URL\" - }" - - # Build zenithdb/zenith:latest image and push it to Docker hub - docker-image: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push Docker image - command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest - docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} - - # Build zenithdb/compute-node:latest image and push it to Docker hub - docker-image-compute: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - # Build zenithdb/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - - run: - name: Build and push compute-tools Docker image - command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools . - docker push zenithdb/compute-tools:latest - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push compute-node Docker image - command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest - docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} - - deploy-staging: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Get Zenith binaries - command: | - rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz - mkdir zenith_install - DOCKER_TAG=$(git log --oneline|wc -l) - docker pull --quiet zenithdb/zenith:${DOCKER_TAG} - ID=$(docker create zenithdb/zenith:${DOCKER_TAG}) - docker cp $ID:/data/postgres_install.tar.gz . 
- tar -xzf postgres_install.tar.gz -C zenith_install && rm postgres_install.tar.gz - docker cp $ID:/usr/local/bin/pageserver zenith_install/bin/ - docker cp $ID:/usr/local/bin/safekeeper zenith_install/bin/ - docker cp $ID:/usr/local/bin/proxy zenith_install/bin/ - docker cp $ID:/usr/local/bin/postgres zenith_install/bin/ - docker rm -v $ID - echo ${DOCKER_TAG} | tee zenith_install/.zenith_current_version - tar -czf zenith_install.tar.gz -C zenith_install . - ls -la zenith_install.tar.gz - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - ansible-galaxy collection install amazon.aws - - run: - name: Apply re-deploy playbook - environment: - ANSIBLE_HOST_KEY_CHECKING: false - command: | - echo "${STAGING_SSH_KEY}" | base64 --decode | ssh-add - - export AWS_REGION=${STAGING_AWS_REGION} - export AWS_ACCESS_KEY_ID=${STAGING_AWS_ACCESS_KEY_ID} - export AWS_SECRET_ACCESS_KEY=${STAGING_AWS_SECRET_ACCESS_KEY} - ansible-playbook .circleci/storage-redeploy.playbook.yml - rm -f zenith_install.tar.gz - - deploy-staging-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add zenithdb https://zenithdb.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/proxy.staging.yaml --set image.tag=${DOCKER_TAG} --wait - - # Trigger a new remote CI job - remote-ci-trigger: - docker: - - image: cimg/base:2021.04 - parameters: - remote_repo: - type: string - environment: - REMOTE_REPO: << parameters.remote_repo >> - steps: - - run: - name: Set PR's status to pending - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"zenith-remote-ci\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - run: - name: Request a remote CI test - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"zenith-remote-ci\", - \"commit_hash\": \"$CIRCLE_SHA1\", - \"remote_repo\": \"$LOCAL_REPO\" - } - }" - -workflows: - build_and_test: - jobs: - - check-codestyle-rust - - check-codestyle-python - - build-postgres: - name: build-postgres-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - - build-zenith: - name: build-zenith-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - requires: - - build-postgres-<< matrix.build_type >> - - run-pytest: - name: pg_regress-tests-<< matrix.build_type >> - context: PERF_TEST_RESULT_CONNSTR - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_pg_regress - needs_postgres_source: true - requires: - - build-zenith-<< matrix.build_type >> - - run-pytest: - 
name: other-tests-<< matrix.build_type >> - matrix: - parameters: - build_type: ["debug", "release"] - test_selection: batch_others - requires: - - build-zenith-<< matrix.build_type >> - - run-pytest: - name: benchmarks - context: PERF_TEST_RESULT_CONNSTR - build_type: release - test_selection: performance - run_in_parallel: false - save_perf_report: true - requires: - - build-zenith-release - - coverage-report: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - requires: - # TODO: consider adding more - - other-tests-debug - - docker-image: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - pg_regress-tests-release - - other-tests-release - - docker-image-compute: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - pg_regress-tests-release - - other-tests-release - - deploy-staging: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - deploy-staging-proxy: - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - remote-ci-trigger: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - remote_repo: "zenithdb/console" - requires: - # XXX: Successful build doesn't mean everything is OK, but - # the job to be triggered takes so much time to complete (~22 min) - # that it's better not to wait for the commented-out steps - - build-zenith-debug - # - pg_regress-tests-release - # - other-tests-release diff --git a/.circleci/storage-redeploy.playbook.yml b/.circleci/storage-redeploy.playbook.yml deleted file mode 100644 index 8173d81521..0000000000 --- a/.circleci/storage-redeploy.playbook.yml +++ /dev/null @@ -1,138 +0,0 @@ -- name: discover storage nodes - hosts: localhost - connection: local - gather_facts: False - - tasks: - - - name: discover safekeepers - no_log: true - ec2_instance_info: - filters: - "tag:zenith_env": "staging" - "tag:zenith_service": "safekeeper" - register: ec2_safekeepers - - - name: discover pageservers - no_log: true - ec2_instance_info: - filters: - "tag:zenith_env": "staging" - "tag:zenith_service": "pageserver" - register: ec2_pageservers - - - name: add safekeepers to host group - no_log: true - add_host: - name: safekeeper-{{ ansible_loop.index }} - ansible_host: "{{ item.public_ip_address }}" - groups: - - storage - - safekeepers - with_items: "{{ ec2_safekeepers.instances }}" - loop_control: - extended: yes - - - name: add pageservers to host group - no_log: true - add_host: - name: pageserver-{{ ansible_loop.index }} - ansible_host: "{{ item.public_ip_address }}" - groups: - - storage - - pageservers - with_items: "{{ ec2_pageservers.instances }}" - loop_control: - extended: yes - -- name: Retrive versions - hosts: storage - gather_facts: False - remote_user: admin - - tasks: - - - name: Get current version of binaries - set_fact: - current_version: "{{lookup('file', '../zenith_install/.zenith_current_version') }}" - - - name: Check that file with version exists on host - stat: - path: /usr/local/.zenith_current_version - register: version_file - - - name: Try to get current version from the host - when: version_file.stat.exists - ansible.builtin.fetch: - src: /usr/local/.zenith_current_version - dest: .remote_version.{{ inventory_hostname }} - 
fail_on_missing: no - flat: yes - - - name: Store remote version to variable - when: version_file.stat.exists - set_fact: - remote_version: "{{ lookup('file', '.remote_version.{{ inventory_hostname }}') }}" - - - name: Store default value of remote version to variable in case when remote version file not found - when: not version_file.stat.exists - set_fact: - remote_version: "000" - -- name: Extract Zenith binaries - hosts: storage - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Extract Zenith binaries to /usr/local - when: current_version > remote_version - ansible.builtin.unarchive: - src: ../zenith_install.tar.gz - dest: /usr/local - become: true - -- name: Restart safekeepers - hosts: safekeepers - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Restart systemd service - when: current_version > remote_version - ansible.builtin.systemd: - daemon_reload: yes - name: safekeeper - enabled: yes - state: restarted - become: true - -- name: Restart pageservers - hosts: pageservers - gather_facts: False - remote_user: admin - - tasks: - - - name: Inform about version conflict - when: current_version <= remote_version - debug: msg="Current version {{ current_version }} LE than remote {{ remote_version }}" - - - name: Restart systemd service - when: current_version > remote_version - ansible.builtin.systemd: - daemon_reload: yes - name: pageserver - enabled: yes - state: restarted - become: true diff --git a/.config/hakari.toml b/.config/hakari.toml new file mode 100644 index 0000000000..42d184b857 --- /dev/null +++ b/.config/hakari.toml @@ -0,0 +1,26 @@ +# This file contains settings for `cargo hakari`. +# See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options. + +hakari-package = "workspace_hack" + +# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above. +dep-format-version = "2" + +# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended. +# Hakari works much better with the new feature resolver. +# For more about the new feature resolver, see: +# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver +# Have to keep the resolver still here since hakari requires this field, +# despite it's now the default for 2021 edition & cargo. +resolver = "2" + +# Add triples corresponding to platforms commonly used by developers here. +# https://doc.rust-lang.org/rustc/platform-support.html +platforms = [ + # "x86_64-unknown-linux-gnu", + # "x86_64-apple-darwin", + # "x86_64-pc-windows-msvc", +] + +# Write out exact versions rather than a semver range. (Defaults to false.) 
+# exact-versions = true diff --git a/.dockerignore b/.dockerignore index 352336496f..92eb4f24de 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,18 +1,21 @@ -**/.git/ -**/__pycache__ -**/.pytest_cache +* -.git -target -tmp_check -tmp_install -tmp_check_cli -test_output -.vscode -.zenith -integration_tests/.zenith -.mypy_cache - -Dockerfile -.dockerignore +!rust-toolchain.toml +!Cargo.toml +!Cargo.lock +!Makefile +!.cargo/ +!.config/ +!control_plane/ +!compute_tools/ +!libs/ +!pageserver/ +!pgxn/ +!proxy/ +!safekeeper/ +!vendor/postgres-v14/ +!vendor/postgres-v15/ +!workspace_hack/ +!neon_local/ +!scripts/ninstall.sh diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..3afa4b683c --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +4c2bb43775947775401cbb9d774823c5723a91f8 diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md new file mode 100644 index 0000000000..d33eec3cde --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-template.md @@ -0,0 +1,23 @@ +--- +name: Bug Template +about: Used for describing bugs +title: '' +labels: t/bug +assignees: '' + +--- + +## Steps to reproduce + + +## Expected result + + +## Actual result + + +## Environment + + +## Logs, links +- diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md new file mode 100644 index 0000000000..7707e0aa67 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -0,0 +1,25 @@ +--- +name: Epic Template +about: A set of related tasks contributing towards specific outcome, comprising of + more than 1 week of work. +title: 'Epic: ' +labels: t/Epic +assignees: '' + +--- + +## Motivation + + +## DoD + + +## Implementation ideas + + +## Tasks +- [ ] + + +## Other related tasks and Epics +- diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md new file mode 100644 index 0000000000..8fcc3bd4af --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md @@ -0,0 +1,20 @@ +## Release 202Y-MM-DD + +**NB: this PR must be merged only by 'Create a merge commit'!** + +### Checklist when preparing for release +- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow) +- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers? +- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? 
+ + + +### Checklist after release +- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) +- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel +- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) +- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) +- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1) +- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time) + + diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml new file mode 100644 index 0000000000..dfb314571b --- /dev/null +++ b/.github/actions/allure-report/action.yml @@ -0,0 +1,221 @@ +name: 'Create Allure report' +description: 'Create and publish Allure report' + +inputs: + action: + desctiption: 'generate or store' + required: true + build_type: + description: '`build_type` from run-python-test-set action' + required: true + test_selection: + description: '`test_selector` from run-python-test-set action' + required: false +outputs: + report-url: + description: 'Allure report URL' + value: ${{ steps.generate-report.outputs.report-url }} + +runs: + using: "composite" + steps: + - name: Validate input parameters + shell: bash -euxo pipefail {0} + run: | + if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then + echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only" + exit 1 + fi + + if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then + echo 2>&1 "inputs.test_selection must be set for 'store' action" + exit 2 + fi + + - name: Calculate key + id: calculate-key + shell: bash -euxo pipefail {0} + run: | + # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key + + pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) + if [ "${pr_number}" != "null" ]; then + key=pr-${pr_number} + elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then + # Shortcut for a special branch + key=main + else + key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-") + fi + echo "KEY=${key}" >> $GITHUB_OUTPUT + + - uses: actions/setup-java@v3 + if: ${{ inputs.action == 'generate' }} + with: + distribution: 'temurin' + java-version: '17' + + - name: Install Allure + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + run: | + if ! 
which allure; then + ALLURE_ZIP=allure-${ALLURE_VERSION}.zip + wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP} + echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c + unzip -q ${ALLURE_ZIP} + echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH + rm -f ${ALLURE_ZIP} + fi + env: + ALLURE_VERSION: 2.19.0 + ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 + + - name: Upload Allure results + if: ${{ inputs.action == 'store' }} + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Add metadata + cat < $TEST_OUTPUT/allure/results/executor.json + { + "name": "GitHub Actions", + "type": "github", + "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html", + "buildOrder": ${GITHUB_RUN_ID}, + "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}", + "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}", + "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html", + "reportName": "Allure Report" + } + EOF + cat < $TEST_OUTPUT/allure/results/environment.properties + TEST_SELECTION=${{ inputs.test_selection }} + BUILD_TYPE=${{ inputs.build_type }} + EOF + + ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" + ZSTD_NBTHREADS=0 + + tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . + aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" + + # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this + - name: Acquire Allure lock + if: ${{ inputs.action == 'generate' }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + LOCK_TIMEOUT=300 # seconds + + for _ in $(seq 1 5); do + for i in $(seq 1 ${LOCK_TIMEOUT}); do + LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true) + # `date --date="..."` is supported only by gnu date (i.e. 
it doesn't work on BSD/macOS) + if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then + break + fi + sleep 1 + done + echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt + aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}" + + # A double-check that exactly WE have acquired the lock + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + break + fi + done + + - name: Generate and publish final Allure report + if: ${{ inputs.action == 'generate' }} + id: generate-report + env: + REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }} + TEST_OUTPUT: /tmp/test_output + BUCKET: neon-github-public-dev + shell: bash -euxo pipefail {0} + run: | + # Get previously uploaded data for this run + ZSTD_NBTHREADS=0 + + s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key') + if [ -z "$s3_filepaths" ]; then + # There's no previously uploaded data for this run + exit 0 + fi + for s3_filepath in ${s3_filepaths}; do + aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/" + + archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath) + mkdir -p ${archive%.tar.zst} + tar -xf ${archive} -C ${archive%.tar.zst} + rm -f ${archive} + done + + # Get history trend + aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true + + # Generate report + allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/* + + # Replace a logo link with a redirect to the latest version of the report + sed -i 's| ./index.html + + + + Redirecting to ${REPORT_URL} + + EOF + aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + + echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} + echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + + - name: Release Allure lock + if: ${{ inputs.action == 'generate' && always() }} + shell: bash -euxo pipefail {0} + env: + LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt + BUCKET: neon-github-public-dev + run: | + aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 + + if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then + aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" + fi + + - uses: actions/github-script@v6 + if: ${{ inputs.action == 'generate' && always() }} + env: + REPORT_URL: ${{ steps.generate-report.outputs.report-url }} + BUILD_TYPE: ${{ inputs.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + with: + script: | + const { REPORT_URL, BUILD_TYPE, SHA } = process.env + + await github.rest.repos.createCommitStatus({ + owner: context.repo.owner, + repo: context.repo.repo, + sha: `${SHA}`, + state: 'success', + target_url: `${REPORT_URL}`, + context: `Allure report / ${BUILD_TYPE}`, + }) diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml new file mode 100644 index 0000000000..eb34d4206a --- /dev/null +++ b/.github/actions/download/action.yml @@ -0,0 +1,59 @@ +name: "Download an artifact" +description: "Custom download action" +inputs: + 
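+  # Note: this download action and the matching upload action later in this diff both use the
+  # neon-github-public-dev S3 bucket; the default object prefix is artifacts/<run_id>/<run_attempt>,
+  # overridable through the `prefix` input below.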
name: + description: "Artifact name" + required: true + path: + description: "A directory to put artifact into" + default: "." + required: false + skip-if-does-not-exist: + description: "Allow to skip if file doesn't exist, fail otherwise" + default: false + required: false + prefix: + description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false + +runs: + using: "composite" + steps: + - name: Download artifact + id: download-artifact + shell: bash -euxo pipefail {0} + env: + TARGET: ${{ inputs.path }} + ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst + SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + run: | + BUCKET=neon-github-public-dev + FILENAME=$(basename $ARCHIVE) + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then + echo 'SKIPPED=true' >> $GITHUB_OUTPUT + exit 0 + else + echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist" + exit 1 + fi + fi + + echo 'SKIPPED=false' >> $GITHUB_OUTPUT + + mkdir -p $(dirname $ARCHIVE) + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE} + + - name: Extract artifact + if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }} + shell: bash -euxo pipefail {0} + env: + TARGET: ${{ inputs.path }} + ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst + run: | + mkdir -p ${TARGET} + time tar -xf ${ARCHIVE} -C ${TARGET} + rm -f ${ARCHIVE} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml new file mode 100644 index 0000000000..b4fd151582 --- /dev/null +++ b/.github/actions/neon-project-create/action.yml @@ -0,0 +1,82 @@ +name: 'Create Neon Project' +description: 'Create Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + region_id: + desctiption: 'Region ID, if not set the project will be created in the default region' + required: false +outputs: + dsn: + description: 'Created Project DSN (for main database)' + value: ${{ steps.create-neon-project.outputs.dsn }} + project_id: + description: 'Created Project ID' + value: ${{ steps.create-neon-project.outputs.project_id }} + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + REGION_ID=${REGION_ID:-eu-west-1} + ;; + staging) + API_HOST=console.stage.neon.tech + REGION_ID=${REGION_ID:-us-east-1} + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT + echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT + env: + ENVIRONMENT: ${{ inputs.environment }} + REGION_ID: ${{ inputs.region_id }} + + - name: Create Neon Project + id: create-neon-project + # A shell without `set -x` to not to expose password/dsn in logs + shell: bash -euo pipefail {0} + run: | + project=$(curl \ + "https://${API_HOST}/api/v1/projects" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" \ + --data "{ + \"project\": { + \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", + \"platform_id\": \"aws\", + \"region_id\": \"${REGION_ID}\", + \"settings\": { } + } + }") + + # Mask password + echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')" + + dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main + echo "::add-mask::${dsn}" + echo "dsn=${dsn}" >> $GITHUB_OUTPUT + + project_id=$(echo $project | jq --raw-output '.id') + echo "project_id=${project_id}" >> $GITHUB_OUTPUT + env: + API_KEY: ${{ inputs.api_key }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} + REGION_ID: ${{ steps.parse-input.outputs.region_id }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml new file mode 100644 index 0000000000..d417c489ef --- /dev/null +++ b/.github/actions/neon-project-delete/action.yml @@ -0,0 +1,54 @@ +name: 'Delete Neon Project' +description: 'Delete Neon Project using API' + +inputs: + api_key: + desctiption: 'Neon API key' + required: true + environment: + desctiption: 'dev (aka captest) or stage' + required: true + project_id: + desctiption: 'ID of the Project to delete' + required: true + +runs: + using: "composite" + steps: + - name: Parse Input + id: parse-input + shell: bash -euxo pipefail {0} + run: | + case "${ENVIRONMENT}" in + dev) + API_HOST=console.dev.neon.tech + ;; + staging) + API_HOST=console.stage.neon.tech + ;; + *) + echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" + exit 1 + ;; + esac + + echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT + env: + ENVIRONMENT: ${{ inputs.environment }} + + - name: Delete Neon Project + shell: bash -euxo pipefail {0} + run: | + # Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed + if [ -n "${PROJECT_ID}" ]; then + curl -X "POST" \ + "https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \ + --fail \ + --header "Accept: application/json" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer ${API_KEY}" + fi + env: + API_KEY: ${{ inputs.api_key }} + PROJECT_ID: ${{ inputs.project_id }} + API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml new file mode 100644 index 0000000000..990c7e25a9 --- /dev/null +++ b/.github/actions/run-python-test-set/action.yml @@ -0,0 +1,198 @@ +name: 'Run python test' +description: 'Runs a Neon python test set, performing all the required preparations before' + +inputs: + build_type: + description: 'Type of Rust (neon) and C (postgres) builds. 
Must be "release" or "debug", or "remote" for the remote cluster' + required: true + test_selection: + description: 'A python test suite to run' + required: true + extra_params: + description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr' + required: false + default: '' + needs_postgres_source: + description: 'Set to true if the test suite requires postgres source checked out' + required: false + default: 'false' + run_in_parallel: + description: 'Whether to run tests in parallel' + required: false + default: 'true' + save_perf_report: + description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set' + required: false + default: 'false' + run_with_real_s3: + description: 'Whether to pass real s3 credentials to the test suite' + required: false + default: 'false' + real_s3_bucket: + description: 'Bucket name for real s3 tests' + required: false + default: '' + real_s3_region: + description: 'Region name for real s3 tests' + required: false + default: '' + real_s3_access_key_id: + description: 'Access key id' + required: false + default: '' + real_s3_secret_access_key: + description: 'Secret access key' + required: false + default: '' + +runs: + using: "composite" + steps: + - name: Get Neon artifact + if: inputs.build_type != 'remote' + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + path: /tmp/neon + + - name: Download Neon binaries for the previous release + if: inputs.build_type != 'remote' + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + path: /tmp/neon-previous + prefix: latest + + - name: Download compatibility snapshot for Postgres 14 + if: inputs.build_type != 'remote' + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + + - name: Checkout + if: inputs.needs_postgres_source == 'true' + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -euxo pipefail {0} + run: ./scripts/pysync + + - name: Run pytest + env: + NEON_BIN: /tmp/neon/bin + COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin + COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: ${{ inputs.build_type }} + AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') + ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') + shell: bash -euxo pipefail {0} + run: | + # PLATFORM will be embedded in the perf test report + # and it is needed to distinguish different environments + export PLATFORM=${PLATFORM:-github-actions-selfhosted} + export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} + export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + + if [ "${BUILD_TYPE}" = "remote" ]; then + export REMOTE_ENV=1 + fi + + PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)" + 
rm -rf $PERF_REPORT_DIR + + TEST_SELECTION="test_runner/${{ inputs.test_selection }}" + EXTRA_PARAMS="${{ inputs.extra_params }}" + if [ -z "$TEST_SELECTION" ]; then + echo "test_selection must be set" + exit 1 + fi + if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then + # -n4 uses four processes to run tests via pytest-xdist + EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + + # --dist=loadgroup points tests marked with @pytest.mark.xdist_group + # to the same worker to make @pytest.mark.order work with xdist + EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS" + fi + + if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then + echo "REAL S3 ENABLED" + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }} + export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }} + fi + + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" + fi + + if [[ "${{ inputs.build_type }}" == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) + elif [[ "${{ inputs.build_type }}" == "release" ]]; then + cov_prefix=() + else + cov_prefix=() + fi + + # Wake up the cluster if we use remote neon instance + if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + fi + + # Run the tests. + # + # The junit.xml file allows CI tools to display more fine-grained test information + # in its "Tests" tab in the results page. + # --verbose prints name of each test (helpful when there are + # multiple tests in one file) + # -rA prints summary in the end + # -s is not used to prevent pytest from capturing output, because tests are running + # in parallel and logs are mixed between different tests + # + mkdir -p $TEST_OUTPUT/allure/results + "${cov_prefix[@]}" ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --alluredir=$TEST_OUTPUT/allure/results \ + --tb=short \ + --verbose \ + -rA $TEST_SELECTION $EXTRA_PARAMS + + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO="$PLATFORM" + scripts/generate_and_push_perf_report.sh + fi + + - name: Upload compatibility snapshot for Postgres 14 + if: github.ref_name == 'release' + uses: ./.github/actions/upload + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/ + prefix: latest + + - name: Create Allure report + if: success() || failure() + uses: ./.github/actions/allure-report + with: + action: store + build_type: ${{ inputs.build_type }} + test_selection: ${{ inputs.test_selection }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml new file mode 100644 index 0000000000..6fbe19a96e --- /dev/null +++ b/.github/actions/save-coverage-data/action.yml @@ -0,0 +1,22 @@ +name: 'Merge and upload coverage data' +description: 'Compresses and uploads the coverage data as an artifact' + +runs: + using: "composite" + steps: + - name: Merge coverage data + shell: bash -euxo pipefail {0} + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + + - name: Download 
previous coverage data into the same directory + uses: ./.github/actions/download + with: + name: coverage-data-artifact + path: /tmp/coverage + skip-if-does-not-exist: true # skip if there's no previous coverage to download + + - name: Upload coverage data + uses: ./.github/actions/upload + with: + name: coverage-data-artifact + path: /tmp/coverage diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml new file mode 100644 index 0000000000..291a2cf3b0 --- /dev/null +++ b/.github/actions/upload/action.yml @@ -0,0 +1,58 @@ +name: "Upload an artifact" +description: "Custom upload action" +inputs: + name: + description: "Artifact name" + required: true + path: + description: "A directory or file to upload" + required: true + prefix: + description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" + required: false + +runs: + using: "composite" + steps: + - name: Prepare artifact + shell: bash -euxo pipefail {0} + env: + SOURCE: ${{ inputs.path }} + ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + run: | + mkdir -p $(dirname $ARCHIVE) + + if [ -f ${ARCHIVE} ]; then + echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before" + exit 1 + fi + + ZSTD_NBTHREADS=0 + if [ -d ${SOURCE} ]; then + time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd . + elif [ -f ${SOURCE} ]; then + time tar -cf ${ARCHIVE} --zstd ${SOURCE} + elif ! ls ${SOURCE} > /dev/null 2>&1; then + echo 2>&1 "${SOURCE} does not exist" + exit 2 + else + echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it" + exit 3 + fi + + - name: Upload artifact + shell: bash -euxo pipefail {0} + env: + SOURCE: ${{ inputs.path }} + ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }} + run: | + BUCKET=neon-github-public-dev + FILENAME=$(basename $ARCHIVE) + + FILESIZE=$(du -sh ${ARCHIVE} | cut -f1) + + time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME} + + # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary + echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY} diff --git a/.github/ansible/.gitignore b/.github/ansible/.gitignore new file mode 100644 index 0000000000..e3454fd43c --- /dev/null +++ b/.github/ansible/.gitignore @@ -0,0 +1,7 @@ +zenith_install.tar.gz +.zenith_current_version +neon_install.tar.gz +.neon_current_version + +collections/* +!collections/.keep diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg new file mode 100644 index 0000000000..0497ee401d --- /dev/null +++ b/.github/ansible/ansible.cfg @@ -0,0 +1,13 @@ +[defaults] + +localhost_warning = False +host_key_checking = False +timeout = 30 +collections_paths = ./collections + +[ssh_connection] +ssh_args = -F ./ansible.ssh.cfg +# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127 +# and scp neither worked for me +transfer_method = piped +pipelining = True diff --git a/.github/ansible/ansible.ssh.cfg b/.github/ansible/ansible.ssh.cfg new file mode 100644 index 0000000000..cd058b5427 --- /dev/null +++ b/.github/ansible/ansible.ssh.cfg @@ -0,0 +1,15 @@ +# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed +# (use pre 8.5 option name to cope with old ssh in CI) +PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com + +Host tele.zenith.tech + User admin + Port 3023 + 
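+  # ansible.cfg points at this file via `ssh_args = -F ./ansible.ssh.cfg`; hosts other than
+  # this bastion are reached through it with ProxyJump (see the wildcard Host block below).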
StrictHostKeyChecking no + UserKnownHostsFile /dev/null + +Host * !tele.zenith.tech + User admin + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + ProxyJump tele.zenith.tech diff --git a/.github/ansible/collections/.keep b/.github/ansible/collections/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml new file mode 100644 index 0000000000..4adc685684 --- /dev/null +++ b/.github/ansible/deploy.yaml @@ -0,0 +1,191 @@ +- name: Upload Neon binaries + hosts: storage + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + + - name: get latest version of Neon binaries + register: current_version_file + set_fact: + current_version: "{{ lookup('file', '.neon_current_version') | trim }}" + tags: + - pageserver + - safekeeper + + - name: inform about versions + debug: + msg: "Version to deploy - {{ current_version }}" + tags: + - pageserver + - safekeeper + + - name: upload and extract Neon binaries to /usr/local + ansible.builtin.unarchive: + owner: root + group: root + src: neon_install.tar.gz + dest: /usr/local + become: true + tags: + - pageserver + - safekeeper + - binaries + - putbinaries + +- name: Deploy pageserver + hosts: pageservers + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + + - name: upload init script + when: console_mgmt_base_url is defined + ansible.builtin.template: + src: scripts/init_pageserver.sh + dest: /tmp/init_pageserver.sh + owner: root + group: root + mode: '0755' + become: true + tags: + - pageserver + + - name: init pageserver + shell: + cmd: /tmp/init_pageserver.sh + args: + creates: "/storage/pageserver/data/tenants" + environment: + NEON_REPO_DIR: "/storage/pageserver/data" + LD_LIBRARY_PATH: "/usr/local/v14/lib" + become: true + tags: + - pageserver + + - name: read the existing remote pageserver config + ansible.builtin.slurp: + src: /storage/pageserver/data/pageserver.toml + register: _remote_ps_config + tags: + - pageserver + + - name: parse the existing pageserver configuration + ansible.builtin.set_fact: + _existing_ps_config: "{{ _remote_ps_config['content'] | b64decode | sivel.toiletwater.from_toml }}" + tags: + - pageserver + + - name: construct the final pageserver configuration dict + ansible.builtin.set_fact: + pageserver_config: "{{ pageserver_config_stub | combine({'id': _existing_ps_config.id }) }}" + tags: + - pageserver + + - name: template the pageserver config + template: + src: templates/pageserver.toml.j2 + dest: /storage/pageserver/data/pageserver.toml + become: true + tags: + - pageserver + + - name: upload systemd service definition + ansible.builtin.template: + src: systemd/pageserver.service + dest: /etc/systemd/system/pageserver.service + owner: root + group: root + mode: '0644' + become: true + tags: + - pageserver + + - name: start systemd service + ansible.builtin.systemd: + daemon_reload: yes + name: pageserver + enabled: yes + state: restarted + become: true + tags: + - pageserver + + - name: post version to console + when: console_mgmt_base_url is defined + shell: + cmd: | + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + tags: + - pageserver + +- name: Deploy safekeeper + hosts: safekeepers + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + + - name: upload init script + when: console_mgmt_base_url is defined + ansible.builtin.template: + src: 
scripts/init_safekeeper.sh + dest: /tmp/init_safekeeper.sh + owner: root + group: root + mode: '0755' + become: true + tags: + - safekeeper + + - name: init safekeeper + shell: + cmd: /tmp/init_safekeeper.sh + args: + creates: "/storage/safekeeper/data/safekeeper.id" + environment: + NEON_REPO_DIR: "/storage/safekeeper/data" + LD_LIBRARY_PATH: "/usr/local/v14/lib" + become: true + tags: + - safekeeper + + # in the future safekeepers should discover pageservers by themselves, + # but for now we use the first pageserver that was discovered + - name: set first pageserver var for safekeepers + set_fact: + first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}" + tags: + - safekeeper + + - name: upload systemd service definition + ansible.builtin.template: + src: systemd/safekeeper.service + dest: /etc/systemd/system/safekeeper.service + owner: root + group: root + mode: '0644' + become: true + tags: + - safekeeper + + - name: start systemd service + ansible.builtin.systemd: + daemon_reload: yes + name: safekeeper + enabled: yes + state: restarted + become: true + tags: + - safekeeper + + - name: post version to console + when: console_mgmt_base_url is defined + shell: + cmd: | + INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + tags: + - safekeeper diff --git a/.github/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh new file mode 100755 index 0000000000..9d2d0926f5 --- /dev/null +++ b/.github/ansible/get_binaries.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -e + +if [ -n "${DOCKER_TAG}" ]; then + # Version is DOCKER_TAG without the prefix + VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g') +else + echo "Please set DOCKER_TAG environment variable" + exit 1 +fi + + +# do initial cleanup +rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version +mkdir neon_install + +# retrieve binaries from docker image +echo "getting binaries from docker image" +docker pull --quiet neondatabase/neon:${DOCKER_TAG} +ID=$(docker create neondatabase/neon:${DOCKER_TAG}) +docker cp ${ID}:/data/postgres_install.tar.gz . +tar -xzf postgres_install.tar.gz -C neon_install +mkdir neon_install/bin/ +docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/ +docker cp ${ID}:/usr/local/bin/pageserver_binutils neon_install/bin/ +docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/ +docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/ +docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/ +docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/ +docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/ +docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/ +docker rm -vf ${ID} + +# store version to file (for ansible playbooks) and create binaries tarball +echo ${VERSION} > neon_install/.neon_current_version +echo ${VERSION} > .neon_current_version +tar -czf neon_install.tar.gz -C neon_install .
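A possible usage sketch for get_binaries.sh; the tag value is hypothetical, the point being that the version is the docker tag with any `<prefix>-` part stripped:

```bash
# Pull binaries out of an already published image and pack them for the playbook.
DOCKER_TAG=release-1234 ./get_binaries.sh   # hypothetical tag
cat .neon_current_version                   # -> 1234
tar -tzf neon_install.tar.gz | head         # bin/, v14/, v15/ trees plus .neon_current_version
```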
+ +# do final cleaup +rm -rf neon_install postgres_install.tar.gz diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml new file mode 100644 index 0000000000..dd61ac5a5e --- /dev/null +++ b/.github/ansible/neon-stress.hosts.yaml @@ -0,0 +1,31 @@ +storage: + vars: + bucket_name: neon-storage-ireland + bucket_region: eu-west-1 + console_mgmt_base_url: http://neon-stress-console.local + etcd_endpoints: neon-stress-etcd.local:2379 + safekeeper_enable_s3_offload: 'false' + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "{{ inventory_hostname }}" + safekeeper_s3_prefix: neon-stress/wal + hostname_suffix: ".local" + remote_user: admin + children: + pageservers: + hosts: + neon-stress-ps-1: + console_region_id: aws-eu-west-1 + neon-stress-ps-2: + console_region_id: aws-eu-west-1 + safekeepers: + hosts: + neon-stress-sk-1: + console_region_id: aws-eu-west-1 + neon-stress-sk-2: + console_region_id: aws-eu-west-1 + neon-stress-sk-3: + console_region_id: aws-eu-west-1 diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml new file mode 100644 index 0000000000..bb4af91f71 --- /dev/null +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -0,0 +1,35 @@ +storage: + vars: + bucket_name: neon-prod-storage-ap-southeast-1 + bucket_region: ap-southeast-1 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: ap-southeast-1 + ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1 + console_region_id: aws-ap-southeast-1 + + children: + pageservers: + hosts: + pageserver-0.ap-southeast-1.aws.neon.tech: + ansible_host: i-064de8ea28bdb495b + pageserver-1.ap-southeast-1.aws.neon.tech: + ansible_host: i-0b180defcaeeb6b93 + + safekeepers: + hosts: + safekeeper-0.ap-southeast-1.aws.neon.tech: + ansible_host: i-0d6f1dc5161eef894 + safekeeper-1.ap-southeast-1.aws.neon.tech: + ansible_host: i-0e338adda8eb2d19f + safekeeper-2.ap-southeast-1.aws.neon.tech: + ansible_host: i-04fb63634e4679eb9 diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml new file mode 100644 index 0000000000..68b1579746 --- /dev/null +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -0,0 +1,35 @@ +storage: + vars: + bucket_name: neon-prod-storage-eu-central-1 + bucket_region: eu-central-1 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: eu-central-1 + ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1 + console_region_id: aws-eu-central-1 + + children: + pageservers: + hosts: + pageserver-0.eu-central-1.aws.neon.tech: + ansible_host: i-0cd8d316ecbb715be + pageserver-1.eu-central-1.aws.neon.tech: + ansible_host: i-090044ed3d383fef0 + + safekeepers: + hosts: + 
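These inventory files are consumed by deploy.yaml above. A hedged example of a manual, partial run against one of them from .github/ansible — the tag is hypothetical, the token is a placeholder (CI passes it from secrets), and limiting by `--tags` relies on the tags defined in the playbook:

```bash
cd .github/ansible
ansible-galaxy collection install sivel.toiletwater   # provides the from_toml filter used by deploy.yaml
DOCKER_TAG=release-1234 ./get_binaries.sh             # hypothetical tag, see get_binaries.sh above
ansible-playbook deploy.yaml \
  -i neon-stress.hosts.yaml \
  --tags pageserver \
  -e CONSOLE_API_TOKEN=REDACTED
```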
safekeeper-0.eu-central-1.aws.neon.tech: + ansible_host: i-0b238612d2318a050 + safekeeper-1.eu-central-1.aws.neon.tech: + ansible_host: i-07b9c45e5c2637cd4 + safekeeper-2.eu-central-1.aws.neon.tech: + ansible_host: i-020257302c3c93d88 diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml new file mode 100644 index 0000000000..1d54e2ef0a --- /dev/null +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -0,0 +1,36 @@ +storage: + vars: + bucket_name: neon-prod-storage-us-east-2 + bucket_region: us-east-2 + console_mgmt_base_url: http://console-release.local + etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: us-east-2 + ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2 + console_region_id: aws-us-east-2 + + children: + pageservers: + hosts: + pageserver-0.us-east-2.aws.neon.tech: + ansible_host: i-062227ba7f119eb8c + pageserver-1.us-east-2.aws.neon.tech: + ansible_host: i-0b3ec0afab5968938 + + safekeepers: + hosts: + safekeeper-0.us-east-2.aws.neon.tech: + ansible_host: i-0e94224750c57d346 + safekeeper-1.us-east-2.aws.neon.tech: + ansible_host: i-06d113fb73bfddeb0 + safekeeper-2.us-east-2.aws.neon.tech: + ansible_host: i-09f66c8e04afff2e8 + diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml new file mode 100644 index 0000000000..bca2614399 --- /dev/null +++ b/.github/ansible/production.hosts.yaml @@ -0,0 +1,33 @@ +--- +storage: + vars: + console_mgmt_base_url: http://console-release.local + bucket_name: zenith-storage-oregon + bucket_region: us-west-2 + etcd_endpoints: zenith-1-etcd.local:2379 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "{{ inventory_hostname }}" + safekeeper_s3_prefix: prod-1/wal + hostname_suffix: ".local" + remote_user: admin + + children: + pageservers: + hosts: + zenith-1-ps-2: + console_region_id: aws-us-west-2 + zenith-1-ps-3: + console_region_id: aws-us-west-2 + + safekeepers: + hosts: + zenith-1-sk-1: + console_region_id: aws-us-west-2 + zenith-1-sk-2: + console_region_id: aws-us-west-2 + zenith-1-sk-3: + console_region_id: aws-us-west-2 diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh new file mode 100644 index 0000000000..426925a837 --- /dev/null +++ b/.github/ansible/scripts/init_pageserver.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +# get instance id from meta-data service +INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) + +# store fqdn hostname in var +HOST=$(hostname -f) + + +cat <> $GITHUB_PATH + + - name: Create Neon Project + if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform) + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: ${{ github.event.inputs.environment || 'dev' }} + api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }} + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} + ;; + neon-captest-new | 
neon-captest-prefetch) + CONNSTR=${{ steps.create-neon-project.outputs.dsn }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + psql ${CONNSTR} -c "SELECT version();" + + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Benchmark init + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Benchmark simple-update + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Benchmark select-only + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + if: success() || failure() + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ env.BUILD_TYPE }} + + - name: Delete Neon Project + if: ${{ steps.create-neon-project.outputs.project_id && always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: dev + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_CAPTEST_API_KEY }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000000..a726cb01ff --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,964 @@ +name: Test and Deploy + +on: + push: + branches: + - main + - release + pull_request: + +concurrency: + # Allow only one workflow per any non-`main` branch. 
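To double-check that the "Set database options" step above actually took effect, the new per-database defaults can be read back over the same connection string; this assumes it points at the `main` database the ALTERs target:

```bash
psql "${BENCHMARK_CONNSTR}" -c \
  "SELECT name, setting FROM pg_settings
   WHERE name IN ('enable_seqscan_prefetch', 'seqscan_prefetch_buffers');"
```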
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + tag: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Get build tag + run: | + echo run:$GITHUB_RUN_ID + echo ref:$GITHUB_REF_NAME + echo rev:$(git rev-list --count HEAD) + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT + fi + shell: bash + id: build-tag + + build-neon: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + + env: + BUILD_TYPE: ${{ matrix.build_type }} + GIT_VERSION: ${{ github.sha }} + + steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} + + # Set some environment variables used by all the steps. + # + # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc. + # It also includes --features, if any + # + # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS, + # because "cargo metadata" doesn't accept --release or --debug options + # + # We run tests with addtional features, that are turned off by default (e.g. in release builds), see + # corresponding Cargo.toml files for their descriptions. + - name: Set env variables + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" + CARGO_FEATURES="--features testing" + CARGO_FLAGS="--locked --timings $CARGO_FEATURES" + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix="" + CARGO_FEATURES="--features testing,profiling" + CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" + fi + echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV + echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV + echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + shell: bash -euxo pipefail {0} + + # Don't include the ~/.cargo/registry/src directory. It contains just + # uncompressed versions of the crates in ~/.cargo/registry/cache + # directory, and it's faster to let 'cargo' to rebuild it from the + # compressed crates. 
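The build tag computed above is just the commit count on the branch, so it can be reproduced locally; the number shown is illustrative:

```bash
git rev-list --count HEAD                    # e.g. 2317 -> tag "2317" on main
echo "release-$(git rev-list --count HEAD)"  # -> "release-2317" on the release branch
# on any other ref the workflow falls back to the run id instead
```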
+ - name: Cache cargo deps + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + # Fall back to older versions of the key, if no cache for current Cargo.lock was found + key: | + v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + v10-${{ runner.os }}-${{ matrix.build_type }}-cargo- + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: mold -run make postgres-v14 -j$(nproc) + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: mold -run make postgres-v15 -j$(nproc) + shell: bash -euxo pipefail {0} + + - name: Build neon extensions + run: mold -run make neon-pg-ext -j$(nproc) + shell: bash -euxo pipefail {0} + + - name: Run cargo build + run: | + ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests + shell: bash -euxo pipefail {0} + + - name: Run cargo test + run: | + ${cov_prefix} cargo test $CARGO_FLAGS + shell: bash -euxo pipefail {0} + + - name: Install rust binaries + run: | + # Install target binaries + mkdir -p /tmp/neon/bin/ + binaries=$( + ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + mkdir -p /tmp/neon/test_bin/ + + test_exe_paths=$( + ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | + jq -r '.executable | select(. != null)' + ) + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. 
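As a side note on the "Install rust binaries" step above (which continues below), the two jq pipelines it relies on can be run locally, without the coverage wrapper and the CI-only $CARGO_FLAGS, to see which files end up in the artifact:

```bash
# Workspace binaries, copied to /tmp/neon/bin in CI:
cargo metadata --format-version=1 --no-deps \
  | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'

# Test executables, copied to /tmp/neon/test_bin for debug builds:
cargo test --message-format=json --no-run \
  | jq -r '.executable | select(. != null)'
```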
+ strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + fi + shell: bash -euxo pipefail {0} + + - name: Install postgres binaries + run: cp -a pg_install /tmp/neon/pg_install + shell: bash -euxo pipefail {0} + + - name: Upload Neon artifact + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: /tmp/neon + + - name: Prepare cargo build timing stats for storing + run: | + mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" + cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" + shell: bash -euxo pipefail {0} + - name: Upload cargo build stats + uses: ./.github/actions/upload + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats + path: /tmp/neon/cargo-timings/ + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + + regress-tests: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ build-neon ] + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest regression tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + test_selection: regress + needs_postgres_source: true + run_with_real_s3: true + real_s3_bucket: ci-tests-s3 + real_s3_region: us-west-2 + real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" + real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + + benchmarks: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ build-neon ] + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + strategy: + fail-fast: false + matrix: + build_type: [ release ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest benchmarks + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ github.ref == 'refs/heads/main' }} + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + # XXX: no coverage data handling here, since benchmarks are run on release builds, + # while coverage is currently collected for the debug ones + + merge-allure-report: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests, benchmarks ] + if: success() || failure() + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + + - name: Create Allure report + id: create-allure-report + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ matrix.build_type }} + + - name: Store Allure test stat in the DB + if: ${{ steps.create-allure-report.outputs.report-url }} 
+ env: + BUILD_TYPE: ${{ matrix.build_type }} + SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + shell: bash -euxo pipefail {0} + run: | + curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json + ./scripts/pysync + + DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json + + coverage-report: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ regress-tests ] + strategy: + fail-fast: false + matrix: + build_type: [ debug ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Restore cargo deps cache + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + + - name: Get Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: /tmp/neon + + - name: Get coverage artifact + uses: ./.github/actions/download + with: + name: coverage-data-artifact + path: /tmp/coverage + + - name: Merge coverage data + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + shell: bash -euxo pipefail {0} + + - name: Build and upload coverage report + run: | + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA + + scripts/coverage \ + --dir=/tmp/coverage report \ + --input-objects=/tmp/coverage/binaries.list \ + --commit-url=$COMMIT_URL \ + --format=github + + REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA + + scripts/git-upload \ + --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ + --message="Add code coverage for $COMMIT_URL" \ + copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + + # Add link to the coverage report to the commit + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"success\", + \"context\": \"neon-coverage\", + \"description\": \"Coverage report is ready\", + \"target_url\": \"$REPORT_URL\" + }" + shell: bash -euxo pipefail {0} + + trigger-e2e-tests: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + options: --init + needs: [ push-docker-hub, tag ] + steps: + - name: Set PR's status to pending and request a remote CI test + run: | + # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit + # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, + # to place a job run status update later. 
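Both the coverage job above and the e2e trigger below attach their results to the commit through the commit statuses API. A sketch for inspecting what has been posted for a given commit — the SHA and token are placeholders, and the repository path is assumed to be the one this diff belongs to:

```bash
COMMIT_SHA=0123abc                     # placeholder
curl -sf -H "Accept: application/vnd.github.v3+json" \
     -H "Authorization: token ${GITHUB_TOKEN}" \
     "https://api.github.com/repos/neondatabase/neon/commits/${COMMIT_SHA}/statuses" \
  | jq -r '.[] | [.context, .state, .target_url] | @tsv'
```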
+ COMMIT_SHA=${{ github.event.pull_request.head.sha }} + # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + + REMOTE_REPO="${{ github.repository_owner }}/cloud" + + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" + + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\", + \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", + \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\" + } + }" + + neon-image: + runs-on: dev + needs: [ tag ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build neon + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + compute-tools-image: + runs-on: dev + needs: [ tag ] + container: gcr.io/kaniko-project/executor:v1.9.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute tools + run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + compute-node-image-v14: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + needs: [ tag ] + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v14 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} + + compute-node-image-v15: + runs-on: dev + container: gcr.io/kaniko-project/executor:v1.9.0-debug + needs: [ tag ] + steps: + - name: Checkout + uses: actions/checkout@v1 # v3 won't work with kaniko + with: + submodules: true + fetch-depth: 0 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build compute node with extensions v15 + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} + + test-images: + needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + runs-on: dev + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. + # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. + # Regular pageserver version string looks like + # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] + # Bad versions might loop like: + # Neon page server git-env:local failpoints: true, features: ["testing"] + # Ensure that we don't have bad versions. + - name: Verify image versions + shell: bash # ensure no set -e for better error messages + run: | + pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + + echo "Pageserver version string: $pageserver_version" + + if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then + echo "Pageserver version should not be the default Dockerfile one" + exit 1 + fi + + if ! 
echo "$pageserver_version" | grep -qv '"testing"' ; then + echo "Pageserver version should have no testing feature enabled" + exit 1 + fi + + - name: Verify docker-compose example + run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + + - name: Print logs and clean up + if: always() + run: | + docker compose -f ./docker-compose/docker-compose.yml logs || 0 + docker compose -f ./docker-compose/docker-compose.yml down + + promote-images: + runs-on: dev + needs: [ tag, test-images ] + if: github.event_name != 'workflow_dispatch' + container: amazon/aws-cli + strategy: + fail-fast: false + matrix: + name: [ neon, compute-node-v14, compute-node-v15, compute-tools ] + + steps: + - name: Promote image to latest + run: | + export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) + aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + + push-docker-hub: + runs-on: dev + needs: [ promote-images, tag ] + container: golang:1.19-bullseye + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Pull neon image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} neon + + - name: Pull compute tools image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools + + - name: Pull compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 + + - name: Pull compute node v15 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 + + - name: Pull rust image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust + + - name: Push images to production ECR + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest + + - name: Configure Docker Hub login + run: | + # ECR Credential Helper & Docker Hub don't work together in config, hence reset + echo "" > /github/home/.docker/config.json + crane auth login -u ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io + + - name: Push neon image to Docker Hub + run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}} + + - name: Push compute tools image to Docker Hub + run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Push compute node v14 image to Docker Hub + run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + + - name: Push compute node v15 image to Docker Hub + run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} + + - name: Push rust image to Docker Hub + run: crane push rust neondatabase/rust:pinned + + - name: Add latest tag to images in Docker Hub + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + calculate-deploy-targets: + runs-on: [ self-hosted, Linux, k8s-runner ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + matrix-include: ${{ steps.set-matrix.outputs.include }} + steps: + - id: set-matrix + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' + NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}' + echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' + echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + deploy: + runs-on: [ self-hosted, Linux, k8s-runner ] + #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Setup ansible + run: | + export PATH="/root/.local/bin:$PATH" + pip install --progress-bar off --user ansible boto3 toml + + - name: Redeploy + run: | + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-new: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Redeploy + run: | + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + rm -f neon_install.tar.gz .neon_current_version + + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. + # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, eu-central-1, ap-southeast-1 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Redeploy + run: | + export DOCKER_TAG=${{needs.tag.outputs.build-tag}} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Add curl + run: apt update && apt install curl -y + + - name: Store kubeconfig file + run: | + echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Setup helm v3 + run: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.tag.outputs.build-tag}} + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + deploy-proxy-new: + runs-on: dev + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
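The helm invocations above can be previewed without touching the cluster by adding --dry-run; the release name, values file and tag below mirror the staging configuration and are illustrative:

```bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
helm upgrade neon-proxy neondatabase/neon-proxy \
  --namespace neon-proxy --install \
  -f .github/helm-values/staging.proxy.yaml \
  --set image.tag=release-1234 \
  --dry-run --debug
```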
+ needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.tag.outputs.build-tag}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.tag.outputs.build-tag}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + promote-compatibility-data: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ deploy, deploy-proxy ] + if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' + steps: + - name: Promote compatibility snapshot for the release + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/latest + run: | + # Update compatibility snapshot for the release + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done + + # Update Neon artifact for the release (reuse already uploaded artifact) + for build_type in debug release; do + OLD_PREFIX=artifacts/${GITHUB_RUN_ID} + FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + + S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) + if [ -z "${S3_KEY}" ]; then + echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from 
previous attempts exist" + exit 1 + fi + + time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} + done diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml new file mode 100644 index 0000000000..1e77963760 --- /dev/null +++ b/.github/workflows/codestyle.yml @@ -0,0 +1,166 @@ +name: Check code style and build + +on: + push: + branches: + - main + pull_request: + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + check-codestyle-rust: + strategy: + fail-fast: false + matrix: + # XXX: both OSes have rustup + # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools + # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools + # this is all we need to install our toolchain later via rust-toolchain.toml + # so don't install any toolchain explicitly. + os: [ubuntu-latest, macos-latest] + timeout-minutes: 90 + name: check codestyle rust and postgres + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Check formatting + run: cargo fmt --all -- --check + + - name: Install Ubuntu postgres dependencies + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev + + - name: Install macOS postgres dependencies + if: matrix.os == 'macos-latest' + run: brew install flex bison openssl + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 + shell: bash -euxo pipefail {0} + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 + shell: bash -euxo pipefail {0} + + - name: Build neon extensions + run: make neon-pg-ext + + - name: Cache cargo deps + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust + + - name: Run cargo clippy + run: ./run_clippy.sh + + - name: Ensure all project builds + run: cargo build --locked --all --all-targets + + 
check-rust-dependencies: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check every project module is covered by Hakari + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + shell: bash -euxo pipefail {0} + + check-codestyle-python: + runs-on: [ self-hosted, Linux, k8s-runner ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run isort to ensure code format + run: poetry run isort --diff --check . + + - name: Run black to ensure code format + run: poetry run black --diff --check . + + - name: Run flake8 to ensure code format + run: poetry run flake8 . + + - name: Run mypy to check types + run: poetry run mypy . diff --git a/.github/workflows/notifications.yml b/.github/workflows/notifications.yml deleted file mode 100644 index 55dc979896..0000000000 --- a/.github/workflows/notifications.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Send Notifications - -on: - push: - branches: [ main ] - -jobs: - send-notifications: - timeout-minutes: 30 - name: send commit notifications - runs-on: ubuntu-latest - - steps: - - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - fetch-depth: 2 - - - name: Form variables for notification message - id: git_info_grab - run: | - git_stat=$(git show --stat=50) - git_stat="${git_stat//'%'/'%25'}" - git_stat="${git_stat//$'\n'/'%0A'}" - git_stat="${git_stat//$'\r'/'%0D'}" - git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces - echo "::set-output name=git_stat::$git_stat" - echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})" - - - name: Send notification - uses: appleboy/telegram-action@master - with: - to: ${{ secrets.TELEGRAM_TO }} - token: ${{ secrets.TELEGRAM_TOKEN }} - format: markdown - args: | - *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }}) - - ``` - ${{ steps.git_info_grab.outputs.git_stat }} - ``` - diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml new file mode 100644 index 0000000000..0600f9234f --- /dev/null +++ b/.github/workflows/pg_clients.yml @@ -0,0 +1,99 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + + workflow_dispatch: + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +jobs: + test-postgres-client-libs: + # TODO: switch to gen2 runner, requires docker + runs-on: [ ubuntu-latest ] + + env: + TEST_OUTPUT: /tmp/test_output + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -euxo pipefail {0} + run: ./scripts/pysync + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + environment: staging + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run pytest + env: + REMOTE_ENV: 1 + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + shell: bash -euxo pipefail {0} + run: | + # Test framework expects we have psql binary; + # but since we don't really need it in this test, let's mock it + mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; + ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --tb=short \ + --verbose \ + -m "remote_cluster" \ + -rA "test_runner/pg_clients" + + - name: Delete Neon Project + if: ${{ always() }} + uses: ./.github/actions/neon-project-delete + with: + environment: staging + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. + # It will be fixed after switching to gen2 runner + - name: Upload python test logs + if: always() + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + name: python-test-pg_clients-${{ runner.os }}-stage-logs + path: ${{ env.TEST_OUTPUT }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml deleted file mode 100644 index 218783387b..0000000000 --- a/.github/workflows/testing.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Build and Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - regression-check: - strategy: - matrix: - # If we want to duplicate this job for different - # Rust toolchains (e.g. nightly or 1.37.0), add them here. 
- rust_toolchain: [stable] - os: [ubuntu-latest] - timeout-minutes: 30 - name: run regression test suite - runs-on: ${{ matrix.os }} - - steps: - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: true - fetch-depth: 2 - - - name: install rust toolchain ${{ matrix.rust_toolchain }} - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust_toolchain }} - override: true - - - name: Install postgres dependencies - run: | - sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev - - - name: Set pg revision for caching - id: pg_ver - run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) - - - name: Cache postgres build - id: cache_pg - uses: actions/cache@v2 - with: - path: | - tmp_install/ - key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} - - - name: Build postgres - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - make postgres - - - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v2 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} - - # Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0 - - name: Run cargo build - run: | - env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests - - - name: Run cargo test - run: | - env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1 diff --git a/.gitignore b/.gitignore index 2ecdaa2053..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,20 @@ +/pg_install /target /tmp_check -/tmp_install /tmp_check_cli __pycache__/ test_output/ .vscode -/.zenith -/integration_tests/.zenith +.idea +/.neon +/integration_tests/.neon # Coverage *.profraw *.profdata + +*.key +*.crt +*.o +*.so +*.Po diff --git a/.gitmodules b/.gitmodules index 8975c6e2fa..081a404135 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,8 @@ -[submodule "vendor/postgres"] - path = vendor/postgres - url = https://github.com/zenithdb/postgres - branch = main +[submodule "vendor/postgres-v14"] + path = vendor/postgres-v14 + url = https://github.com/neondatabase/postgres.git + branch = REL_14_STABLE_neon +[submodule "vendor/postgres-v15"] + path = vendor/postgres-v15 + url = https://github.com/neondatabase/postgres.git + branch = REL_15_STABLE_neon diff --git a/.yapfignore b/.yapfignore deleted file mode 100644 index 258f6c59cd..0000000000 --- a/.yapfignore +++ /dev/null @@ -1,10 +0,0 @@ -# This file is only read when `yapf` is run from this directory. -# Hence we only top-level directories here to avoid confusion. 
-# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43 -vendor/ -target/ -tmp_install/ -__pycache__/ -test_output/ -.zenith/ -.git/ diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..4c8c8924d6 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,10 @@ +/compute_tools/ @neondatabase/control-plane +/control_plane/ @neondatabase/compute @neondatabase/storage +/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage +/libs/postgres_ffi/ @neondatabase/compute +/libs/remote_storage/ @neondatabase/storage +/libs/safekeeper_api/ @neondatabase/safekeepers +/pageserver/ @neondatabase/compute @neondatabase/storage +/pgxn/ @neondatabase/compute +/proxy/ @neondatabase/control-plane +/safekeeper/ @neondatabase/safekeepers diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a03cfdda48..43ebefc477 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,17 +11,15 @@ than it was before. ## Submitting changes -1. Make a PR for every change. - - Even seemingly trivial patches can break things in surprising ways. -Use of common sense is OK. If you're only fixing a typo in a comment, -it's probably fine to just push it. But if in doubt, open a PR. - -2. Get at least one +1 on your PR before you push. +1. Get at least one +1 on your PR before you push. For simple patches, it will only take a minute for someone to review it. +2. Don't force push small changes after making the PR ready for review. +Doing so will force readers to re-read your entire PR, which will delay +the review process. + 3. Always keep the CI green. Do not push, if the CI failed on your PR. Even if you think it's not diff --git a/COPYRIGHT b/COPYRIGHT deleted file mode 100644 index 448363b12f..0000000000 --- a/COPYRIGHT +++ /dev/null @@ -1,20 +0,0 @@ -This software is licensed under the Apache 2.0 License: - ----------------------------------------------------------------------------- -Copyright 2021 Zenith Labs, Inc - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. ----------------------------------------------------------------------------- - -The PostgreSQL submodule in vendor/postgres is licensed under the -PostgreSQL license. See vendor/postgres/COPYRIGHT. 
diff --git a/Cargo.lock b/Cargo.lock index ba3c6729d6..c112c05188 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "739f4a8db6605981345c5654f3a85b056ce52f37a39d34da03f25bf2151ea16e" - [[package]] name = "ahash" version = "0.7.6" @@ -36,50 +30,81 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.18" +version = "0.7.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" dependencies = [ "memchr", ] [[package]] -name = "ansi_term" -version = "0.12.1" +name = "amplify_num" +version = "0.4.1" +source = "git+https://github.com/hlinnaka/rust-amplify.git?branch=unsigned-int-perf#bd49b737c2e6e623ab8e9ba5ceaed5712d3a3940" + +[[package]] +name = "android_system_properties" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" dependencies = [ - "winapi", + "libc", ] [[package]] -name = "anyhow" -version = "1.0.53" +name = "anes" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anyhow" +version = "1.0.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602" dependencies = [ "backtrace", ] [[package]] -name = "async-compression" -version = "0.3.12" +name = "arrayvec" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2bf394cfbbe876f0ac67b13b6ca819f9c9f2fb9ec67223cceb1555fbab1c31a" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + +[[package]] +name = "asn1-rs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf6690c370453db30743b373a60ba498fc0d6d83b11f4abfd87a84a075db5dd4" dependencies = [ - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "zstd", - "zstd-safe", + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom", + "num-traits", + "rusticata-macros", + "thiserror", + "time 0.3.15", ] [[package]] -name = "async-trait" -version = "0.1.52" +name = "asn1-rs-derive" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" +checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" dependencies = [ "proc-macro2", "quote", @@ -87,20 +112,44 @@ dependencies = [ ] [[package]] -name = "attohttpc" -version = "0.18.0" +name = "async-stream" +version = "0.3.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e69e13a99a7e6e070bb114f7ff381e58c7ccc188630121fc4c2fe4bcf24cd072" +checksum = "dad5c83079eae9969be7fadefe640a1c566901f05ff91ab221de4b6f68d9507e" dependencies = [ - "http", - "log", - "rustls 0.20.2", - "serde", - "serde_json", - "url", - "webpki 0.22.0", - "webpki-roots", - "wildmatch", + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f203db73a71dfa2fb6dd22763990fa26f3d2625a6da2da900d23b87d26be27" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic-polyfill" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +dependencies = [ + "critical-section", ] [[package]] @@ -121,59 +170,55 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] -name = "aversion" -version = "0.2.1" +name = "axum" +version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41992ab8cfcc3026ef9abceffe0c2b0479c043183fc23825e30d22baab6df334" +checksum = "c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043" dependencies = [ - "aversion-macros", - "byteorder", + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", "serde", - "serde_cbor", - "thiserror", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-layer", + "tower-service", ] [[package]] -name = "aversion-macros" -version = "0.2.1" +name = "axum-core" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ba5785f953985aa0caca927ba4005880f3b4f53de87f134e810ae3549f744d2" +checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "aws-creds" -version = "0.27.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460a75eac8f3cb7683e0a9a588a83c3ff039331ea7bfbfbfcecf1dacab276e11" -dependencies = [ - "anyhow", - "attohttpc", - "dirs", - "rust-ini", - "serde", - "serde-xml-rs", - "serde_derive", - "url", -] - -[[package]] -name = "aws-region" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e37c2dc2c9047311911ef175e0ffbb3853f17c32b72cf3d562f455e5ff77267" -dependencies = [ - "anyhow", + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "tower-layer", + "tower-service", ] [[package]] name = "backtrace" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e121dee8023ce33ab248d9ce1493df03c3b38a659b240096fcbd7048ff9c31f" +checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" dependencies = [ "addr2line", "cc", @@ -185,10 +230,19 @@ dependencies = [ ] [[package]] -name = "base64" -version = "0.12.3" +name = "bare-metal" +version = 
"0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" +checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" +dependencies = [ + "rustc_version 0.2.3", +] + +[[package]] +name = "bare-metal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" [[package]] name = "base64" @@ -207,15 +261,13 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "8a022e58a142a46fea340d68012b9201c094e93ec3d033a944a24f8fd4a4f09a" dependencies = [ "bitflags", "cexpr", "clang-sys", - "clap 2.34.0", - "env_logger", "lazy_static", "lazycell", "log", @@ -225,9 +277,22 @@ dependencies = [ "regex", "rustc-hash", "shlex", + "syn", "which", ] +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + [[package]] name = "bitflags" version = "1.3.2" @@ -244,27 +309,37 @@ dependencies = [ ] [[package]] -name = "bookfile" -version = "0.3.0" -source = "git+https://github.com/zenithdb/bookfile.git?branch=generic-readext#d51a99c7a0be48c3d9cc7cb85c9b7fb05ce1100c" +name = "block-buffer" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" dependencies = [ - "aversion", - "byteorder", - "serde", - "thiserror", + "generic-array", ] [[package]] -name = "boxfnonce" -version = "0.1.1" +name = "bstr" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" +checksum = "fca0852af221f458706eb0725c03e4ed6c46af9ac98e6a689d5e634215d594dd" +dependencies = [ + "memchr", + "once_cell", + "regex-automata", + "serde", +] [[package]] name = "bumpalo" -version = "3.9.1" +version = "3.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" + +[[package]] +name = "bytemuck" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" [[package]] name = "byteorder" @@ -274,21 +349,24 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.1.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" +checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" dependencies = [ "serde", ] [[package]] -name = "cc" -version = "1.0.72" +name = "cast" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" -dependencies = [ - 
"jobserver", -] +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" [[package]] name = "cexpr" @@ -307,22 +385,52 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.19" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" dependencies = [ - "libc", + "iana-time-zone", + "js-sys", "num-integer", "num-traits", - "time", + "serde", + "time 0.1.44", + "wasm-bindgen", "winapi", ] [[package]] -name = "clang-sys" -version = "1.3.1" +name = "ciborium" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" + +[[package]] +name = "ciborium-ll" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clang-sys" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa2e27ae6ab525c3d369ded447057bca5438d86dc3a68f6faafb8269ba82ebf3" dependencies = [ "glob", "libc", @@ -331,77 +439,135 @@ dependencies = [ [[package]] name = "clap" -version = "2.34.0" +version = "3.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim 0.8.0", - "textwrap 0.11.0", - "unicode-width", - "vec_map", + "clap_lex 0.2.4", + "indexmap", + "textwrap", ] [[package]] name = "clap" -version = "3.0.14" +version = "4.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63edc3f163b3c71ec8aa23f9bd6070f77edbf3d1d198b164afa90ff00e4ec62" +checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f" dependencies = [ "atty", "bitflags", - "indexmap", - "os_str_bytes", - "strsim 0.10.0", + "clap_lex 0.3.0", + "strsim", "termcolor", - "textwrap 0.14.2", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "clap_lex" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "close_fds" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" 
+dependencies = [ + "cfg-if", + "libc", +] + +[[package]] +name = "cmake" +version = "0.1.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" +dependencies = [ + "cc", +] + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", ] [[package]] name = "combine" -version = "4.6.3" +version = "4.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b727aacc797f9fc28e355d21f34709ac4fc9adecfe470ad07b8f4464f53062" +checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" dependencies = [ "bytes", "memchr", ] +[[package]] +name = "comfy-table" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85914173c2f558d61613bfbbf1911f14e630895087a7ed2fafc0f5319e1536e7" +dependencies = [ + "crossterm", + "strum", + "strum_macros", + "unicode-width", +] + [[package]] name = "compute_tools" version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.0.14", + "clap 4.0.15", "env_logger", + "futures", "hyper", - "libc", "log", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "notify", + "postgres", "regex", "serde", "serde_json", "tar", "tokio", + "tokio-postgres", + "url", + "workspace_hack", ] [[package]] name = "const_format" -version = "0.2.22" +version = "0.2.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22bc6cd49b0ec407b680c3e380182b6ac63b73991cb7602de350352fc309b614" +checksum = "7309d9b4d3d2c0641e018d449232f2e28f1b22933c137f157d3dbc14228b8c0e" dependencies = [ "const_format_proc_macros", ] [[package]] name = "const_format_proc_macros" -version = "0.2.22" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef196d5d972878a48da7decb7686eded338b4858fbabeed513d63a7c98b2b82d" +checksum = "d897f47bf7270cf70d370f8f98c1abb6d2d4cf60a6845d30e05bfb90c6568650" dependencies = [ "proc-macro2", "quote", @@ -413,57 +579,215 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "lazy_static", - "nix", - "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "clap 4.0.15", + "comfy-table", + "git-version", + "nix 0.25.0", + "once_cell", + "pageserver_api", + "postgres", "regex", "reqwest", + "safekeeper_api", "serde", + "serde_with", "tar", "thiserror", "toml", "url", + "utils", "workspace_hack", - "zenith_utils", +] + +[[package]] +name = "core-foundation" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" + +[[package]] +name = "cortex-m" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" +dependencies = [ + "bare-metal 0.2.5", + "bitfield", + "embedded-hal", + "volatile-register", +] + 
+[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if", ] [[package]] name = "cpufeatures" -version = "0.2.1" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" dependencies = [ "libc", ] [[package]] name = "crc32c" -version = "0.6.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6b9c9389584bcba988bd0836086789b7f87ad91892d6a83d5291dbb24524b5" +checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast", + "ciborium", + "clap 3.2.22", + "criterion-plot", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "critical-section" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" +dependencies = [ + "bare-metal 1.0.0", + "cfg-if", + "cortex-m", + "riscv", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset 0.6.5", + "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.7" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" +checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" dependencies = [ "cfg-if", - "lazy_static", + "once_cell", ] [[package]] -name = "crypto-mac" -version = "0.10.1" +name = "crossterm" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bff07008ec701e8028e2ceb8f83f0e4274ee62bd2dbdc4fefff2e9a91824081a" +checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "mio", + "parking_lot 0.12.1", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c" +dependencies = [ + "winapi", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", - "subtle", + "typenum", ] [[package]] @@ -477,13 +801,111 @@ dependencies = [ ] [[package]] -name = "daemonize" -version = "0.4.1" +name = "cxx" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815" +checksum = "3f83d0ebf42c6eafb8d7c52f7e5f2d3003b89c7aa4fd2b79229209459a849af8" dependencies = [ - "boxfnonce", - "libc", + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07d050484b55975889284352b0ffc2ecbda25c0c55978017c132b29ba0818a86" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d2199b00553eda8012dfec8d3b1c75fce747cf27c169a270b3b99e3448ab78" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb67a6de1f602736dd7eaead0080cf3435df806c61b24b13328db128c58868f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "darling" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "data-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" + +[[package]] +name = "debugid" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ee87af31d84ef885378aebca32be3d682b0e0dc119d5b4860a2c5bb5046730" +dependencies = [ + "uuid 0.8.2", +] + +[[package]] +name = "der-parser" +version = "8.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d4bc9b0db0a0df9ae64634ac5bdefb7afcb534e182275ca0beadbe486701c1" +dependencies = [ + "asn1-rs", + 
"displaydoc", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", ] [[package]] @@ -496,19 +918,31 @@ dependencies = [ ] [[package]] -name = "dirs" -version = "4.0.0" +name = "digest" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" dependencies = [ - "dirs-sys", + "block-buffer 0.10.3", + "crypto-common", + "subtle", ] [[package]] -name = "dirs-sys" -version = "0.3.6" +name = "dirs-next" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03d86534ed367a67548dc68113a0f5db55432fdfbb6e6f9d77704397d95d5780" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", "redox_users", @@ -516,34 +950,46 @@ dependencies = [ ] [[package]] -name = "dlv-list" +name = "displaydoc" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68df3f2b690c1b86e65ef7830956aededf3cb0a16f898f79b9a6f421a7b6211b" +checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" dependencies = [ - "rand", + "proc-macro2", + "quote", + "syn", ] [[package]] name = "either" -version = "1.6.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + +[[package]] +name = "embedded-hal" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" +dependencies = [ + "nb 0.1.3", + "void", +] [[package]] name = "encoding_rs" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ "cfg-if", ] [[package]] name = "env_logger" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b2cf0344971ee6c64c31be0d530793fba457d322dfec2810c453d0ef228f9c3" +checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" dependencies = [ "atty", "humantime", @@ -553,13 +999,46 @@ dependencies = [ ] [[package]] -name = "fail" -version = "0.5.0" +name = "etcd-client" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf" +dependencies = [ + "http", + "prost", + "tokio", + "tokio-stream", + "tonic", + "tonic-build", + "tower", + "tower-service", +] + +[[package]] +name = "etcd_broker" +version = "0.1.0" +dependencies = [ + "etcd-client", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "fail" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" dependencies = [ - "lazy_static", "log", + "once_cell", "rand", ] @@ -571,25 +1050,43 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" dependencies = [ "instant", ] [[package]] name = "filetime" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975ccf83d8d9d0d84682850a38c8169027be83368805971cc4f238c2b245bc98" +checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" dependencies = [ "cfg-if", "libc", "redox_syscall", + "windows-sys", +] + +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", "winapi", ] +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "fnv" version = "1.0.7" @@ -597,12 +1094,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] -name = "form_urlencoded" -version = "1.0.1" +name = "foreign-types" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" dependencies = [ - "matches", "percent-encoding", ] @@ -617,10 +1128,19 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.21" +name = "fsevent-sys" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73fe65f54d1e12b726f517d3e2135ca3125a437b6d998caf1962961f7172d9e" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + +[[package]] +name = "futures" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" dependencies = [ "futures-channel", "futures-core", @@ -633,9 +1153,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3083ce4b914124575708913bca19bfe887522d6e2e6d0952943f5eac4a74010" +checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" dependencies = [ "futures-core", "futures-sink", @@ -643,15 +1163,15 @@ dependencies = [ [[package]] name = "futures-core" -version = 
"0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c09fd04b7e4073ac7156a9539b57a484a8ea920f79c7c675d05d289ab6110d3" +checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" [[package]] name = "futures-executor" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9420b90cfa29e327d0429f19be13e7ddb68fa1cccb09d65e5706b8c7a749b8a6" +checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" dependencies = [ "futures-core", "futures-task", @@ -660,15 +1180,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4045962a5a5e935ee2fdedaa4e08284547402885ab326734432bed5d12966b" +checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" [[package]] name = "futures-macro" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c1e13800337f4d4d7a316bf45a567dbcb6ffe087f16424852d97e97a91f512" +checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" dependencies = [ "proc-macro2", "quote", @@ -677,21 +1197,27 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21163e139fa306126e6eedaf49ecdb4588f939600f0b1e770f4205ee4b7fa868" +checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" [[package]] name = "futures-task" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c66a976bf5909d801bbef33416c41372779507e7a6b3a5e25e4749c58f776a" +checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" + +[[package]] +name = "futures-timer" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.21" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8b7abd5d659d9b90c8cba917f6ec750a74e2dc23902ef9cd4cc8c8b22e6036a" +checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" dependencies = [ "futures-channel", "futures-core", @@ -707,9 +1233,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.5" +version = "0.14.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803" +checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" dependencies = [ "typenum", "version_check", @@ -717,20 +1243,20 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.4" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] name = "gimli" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4" +checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d" 
[[package]] name = "git-version" @@ -762,9 +1288,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.11" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" +checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" dependencies = [ "bytes", "fnv", @@ -786,23 +1312,42 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] -name = "hashbrown" -version = "0.9.1" +name = "hash32" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" dependencies = [ - "ahash 0.4.7", + "byteorder", ] [[package]] name = "hashbrown" -version = "0.11.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash 0.7.6", + "ahash", ] +[[package]] +name = "heapless" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version 0.4.0", + "spin 0.9.4", + "stable_deref_trait", +] + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -827,42 +1372,41 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0" -[[package]] -name = "hmac" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" -dependencies = [ - "crypto-mac 0.10.1", - "digest", -] - [[package]] name = "hmac" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2a2320eb7ec0ebe8da8f744d7812d9fc4cb4d09344ac01898dbcb6a20ae69b" dependencies = [ - "crypto-mac 0.11.1", - "digest", + "crypto-mac", + "digest 0.9.0", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest 0.10.5", ] [[package]] name = "http" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.1", + "itoa", ] [[package]] name = "http-body" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", @@ -870,10 +1414,16 @@ dependencies = [ ] 
[[package]] -name = "httparse" -version = "1.6.0" +name = "http-range-header" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9100414882e15fb7feccb4897e5f0ff0ff1ca7d1a86a23208ada4d7a18e6c6c4" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" @@ -888,10 +1438,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] -name = "hyper" -version = "0.14.16" +name = "humantime-serde" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7ec3e62bdc98a2f0393a5048e4c30ef659440ea6e0e572965103e72bd836f55" +checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c" +dependencies = [ + "humantime", + "serde", +] + +[[package]] +name = "hyper" +version = "0.14.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" dependencies = [ "bytes", "futures-channel", @@ -902,7 +1462,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 0.4.8", + "itoa", "pin-project-lite", "socket2", "tokio", @@ -919,30 +1479,123 @@ checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" dependencies = [ "http", "hyper", - "rustls 0.20.2", + "rustls", "tokio", - "tokio-rustls 0.23.2", + "tokio-rustls", ] [[package]] -name = "idna" -version = "0.2.3" +name = "hyper-timeout" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" dependencies = [ - "matches", "unicode-bidi", "unicode-normalization", ] [[package]] name = "indexmap" -version = "1.8.0" +version = "1.9.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown 0.11.2", + "hashbrown", + "serde", +] + +[[package]] +name = "inferno" +version = "0.10.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3886428c6400486522cf44b8626e7b94ad794c14390290f2a274dcf728a58f" +dependencies = [ + "ahash", + "atty", + "indexmap", + "itoa", + "lazy_static", + "log", + "num-format", + "quick-xml", + "rgb", + "str_stack", +] + +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", ] [[package]] @@ -956,57 +1609,42 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.3.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" +checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "itertools" -version = "0.10.3" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itoa" -version = "0.4.8" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "itoa" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" - -[[package]] -name = "jobserver" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" -dependencies = [ - "libc", -] +checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" [[package]] name = "js-sys" -version = "0.3.56" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" dependencies = [ "wasm-bindgen", ] [[package]] name = "jsonwebtoken" -version = "7.2.0" +version = "8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afabcc15e437a6484fc4f12d0fd63068fe457bf93f1c148d3d9649c60b103f32" +checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" dependencies = [ - "base64 0.12.3", - "pem 0.8.3", + "base64", + "pem", "ring", "serde", "serde_json", @@ -1014,12 +1652,23 @@ dependencies = [ ] [[package]] -name = "kstring" -version = "1.0.6" +name = "kqueue" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b310ccceade8121d7d77fee406160e457c2f4e7c7982d589da3499bc7ea4526" +checksum = 
"2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98" dependencies = [ - "serde", + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587" +dependencies = [ + "bitflags", + "libc", ] [[package]] @@ -1036,9 +1685,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.117" +version = "0.2.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" +checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" [[package]] name = "libloading" @@ -1051,19 +1700,35 @@ dependencies = [ ] [[package]] -name = "lock_api" -version = "0.4.6" +name = "libm" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" +checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" + +[[package]] +name = "link-cplusplus" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369" dependencies = [ + "cc", +] + +[[package]] +name = "lock_api" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", "serde", @@ -1079,21 +1744,10 @@ dependencies = [ ] [[package]] -name = "matches" -version = "0.1.9" +name = "matchit" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" - -[[package]] -name = "maybe-async" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6007f9dad048e0a224f27ca599d669fca8cfa0dac804725aab542b2eb032bce6" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" [[package]] name = "md-5" @@ -1101,11 +1755,20 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" dependencies = [ - "block-buffer", - "digest", + "block-buffer 0.9.0", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "md-5" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +dependencies = [ + "digest 0.10.5", +] + [[package]] name = "md5" version = "0.7.0" @@ -1114,9 +1777,18 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + 
+[[package]] +name = "memmap2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95af15f345b17af2efc8ead6080fb8bc376f8cec1b35277b935637595fe77498" +dependencies = [ + "libc", +] [[package]] name = "memoffset" @@ -1127,6 +1799,25 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + +[[package]] +name = "metrics" +version = "0.1.0" +dependencies = [ + "libc", + "once_cell", + "prometheus", + "workspace_hack", +] + [[package]] name = "mime" version = "0.3.16" @@ -1141,36 +1832,64 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.4.4" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" dependencies = [ "adler", - "autocfg", ] [[package]] name = "mio" -version = "0.7.14" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8067b404fe97c70829f082dec8bcf4f71225d7eaea1d8645349cb76fa06205cc" +checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" dependencies = [ "libc", "log", - "miow", - "ntapi", - "winapi", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys", ] [[package]] -name = "miow" -version = "0.3.7" +name = "multimap" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "native-tls" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd7e2f3618557f980e0b17e8856252eee3c97fa12c54dff0ca290fb6266ca4a9" dependencies = [ - "winapi", + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", ] +[[package]] +name = "nb" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +dependencies = [ + "nb 1.0.0", +] + +[[package]] +name = "nb" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" + [[package]] name = "nix" version = "0.23.1" @@ -1181,34 +1900,66 @@ dependencies = [ "cc", "cfg-if", "libc", - "memoffset", + "memoffset 0.6.5", +] + +[[package]] +name = "nix" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +dependencies = [ + "autocfg", + "bitflags", + "cfg-if", + "libc", + "memoffset 0.6.5", + "pin-utils", ] [[package]] name = "nom" -version = "7.1.0" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" dependencies = [ "memchr", "minimal-lexical", - "version_check", ] [[package]] -name = "ntapi" -version = "0.3.6" +name = 
"notify" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a" dependencies = [ + "bitflags", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "mio", + "walkdir", + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", "winapi", ] [[package]] name = "num-bigint" -version = "0.2.6" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" dependencies = [ "autocfg", "num-integer", @@ -1216,10 +1967,20 @@ dependencies = [ ] [[package]] -name = "num-integer" -version = "0.1.44" +name = "num-format" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "54b862ff8df690cf089058c98b183676a7ed0f974cc08b426800093227cbff3b" +dependencies = [ + "arrayvec", + "itoa", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -1227,11 +1988,12 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1245,19 +2007,43 @@ dependencies = [ ] [[package]] -name = "object" -version = "0.27.1" +name = "num_threads" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + +[[package]] +name = "object" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" dependencies = [ "memchr", ] [[package]] -name = "once_cell" -version = "1.9.0" +name = "oid-registry" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "7d4bda43fd1b844cbc6e6e54b5444e2b1bc7838bce59ad205902cccbb26d6761" +dependencies = [ + "asn1-rs", +] + +[[package]] +name = "once_cell" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "opaque-debug" @@ -1266,74 +2052,138 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] -name = "ordered-multimap" -version = "0.3.1" +name = "openssl" +version = "0.10.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c672c7ad9ec066e428c00eb917124a06f08db19e2584de982cc34b1f4c12485" +checksum = "12fc0523e3bd51a692c8850d075d74dc062ccf251c0110668cbd921917118a13" dependencies = [ - "dlv-list", - "hashbrown 0.9.1", + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5230151e44c0f05157effb743e8d517472843121cf9243e8b81393edb5acd9ce" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", ] [[package]] name = "os_str_bytes" -version = "6.0.0" +version = "6.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] +checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "pageserver" version = "0.1.0" dependencies = [ + "amplify_num", "anyhow", - "async-compression", + "async-stream", "async-trait", - "bookfile", "byteorder", "bytes", "chrono", - "clap 3.0.14", + "clap 4.0.15", + "close_fds", "const_format", "crc32c", + "criterion", "crossbeam-utils", - "daemonize", + "etcd_broker", "fail", "futures", + "git-version", "hex", "hex-literal", "humantime", + "humantime-serde", "hyper", - "lazy_static", - "log", - "nix", + "itertools", + "metrics", + "nix 0.25.0", + "num-traits", "once_cell", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "pageserver_api", + "postgres", + "postgres-protocol", + "postgres-types", "postgres_ffi", + "pprof", + "pq_proto", "rand", "regex", - "rust-s3", + "remote_storage", + "rstar", "scopeguard", "serde", "serde_json", + "serde_with", "signal-hook", + "svg_fmt", "tar", "tempfile", + "tenant_size_model", "thiserror", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-stream", + "tokio-postgres", + "tokio-util", "toml_edit", "tracing", - "tracing-futures", "url", + "utils", + "walkdir", + "workspace_hack", +] + +[[package]] +name = "pageserver_api" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "bytes", + "const_format", + "postgres_ffi", + "serde", + "serde_with", + "utils", "workspace_hack", - 
"zenith_metrics", - "zenith_utils", ] [[package]] @@ -1344,7 +2194,17 @@ checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" dependencies = [ "instant", "lock_api", - "parking_lot_core", + "parking_lot_core 0.8.5", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.3", ] [[package]] @@ -1361,6 +2221,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "parking_lot_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1369,62 +2242,61 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "0.8.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd56cbd21fea48d0c440b41cd69c589faacade08c992d9a54e471b79d0fd13eb" +checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ - "base64 0.13.0", - "once_cell", - "regex", -] - -[[package]] -name = "pem" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" -dependencies = [ - "base64 0.13.0", + "base64", ] [[package]] name = "percent-encoding" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" + +[[package]] +name = "petgraph" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" +dependencies = [ + "fixedbitset", + "indexmap", +] [[package]] name = "phf" -version = "0.8.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" dependencies = [ "phf_shared", ] [[package]] name = "phf_shared" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" dependencies = [ "siphasher", ] [[package]] name = "pin-project" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.10" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" dependencies = [ "proc-macro2", "quote", @@ -1433,9 +2305,9 @@ dependencies = [ 
[[package]] name = "pin-project-lite" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -1443,88 +2315,79 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" + +[[package]] +name = "plotters" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" + +[[package]] +name = "plotters-svg" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" +dependencies = [ + "plotters-backend", +] + [[package]] name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.19.2" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", "futures", "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres" -version = "0.19.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "futures", - "log", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "tokio-postgres", ] [[package]] name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.6.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ - "base64 0.13.0", + "base64", "byteorder", "bytes", "fallible-iterator", - "hmac 0.10.1", + "hmac 0.12.1", "lazy_static", - "md-5", + "md-5 0.10.5", "memchr", "rand", - "sha2", - "stringprep", -] - -[[package]] -name = "postgres-protocol" -version = "0.6.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - 
"base64 0.13.0", - "byteorder", - "bytes", - "fallible-iterator", - "hmac 0.10.1", - "lazy_static", - "md-5", - "memchr", - "rand", - "sha2", + "sha2 0.10.6", "stringprep", ] [[package]] name = "postgres-types" -version = "0.2.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +version = "0.2.3" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "bytes", "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", -] - -[[package]] -name = "postgres-types" -version = "0.2.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" -dependencies = [ - "bytes", - "fallible-iterator", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres-protocol", ] [[package]] @@ -1535,18 +2398,39 @@ dependencies = [ "bindgen", "byteorder", "bytes", - "chrono", "crc32c", + "env_logger", "hex", - "lazy_static", "log", - "memoffset", + "memoffset 0.7.1", + "once_cell", + "postgres", "rand", "regex", "serde", "thiserror", + "utils", + "wal_craft", "workspace_hack", - "zenith_utils", +] + +[[package]] +name = "pprof" +version = "0.6.1" +source = "git+https://github.com/neondatabase/pprof-rs.git?branch=wallclock-profiling#4e011a87d22fb4d21d15cc38bce81ff1c75e4bc9" +dependencies = [ + "backtrace", + "cfg-if", + "findshlibs", + "inferno", + "lazy_static", + "libc", + "log", + "nix 0.23.1", + "parking_lot 0.11.2", + "symbolic-demangle", + "tempfile", + "thiserror", ] [[package]] @@ -1555,6 +2439,31 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +[[package]] +name = "pq_proto" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "pin-project-lite", + "postgres-protocol", + "rand", + "serde", + "tokio", + "tracing", + "workspace_hack", +] + +[[package]] +name = "prettyplease" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -1563,62 +2472,161 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ - "unicode-xid", + "unicode-ident", +] + +[[package]] +name = "procfs" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" +dependencies = [ + "bitflags", + "byteorder", + "hex", + "lazy_static", + "libc", ] [[package]] name = "prometheus" -version = "0.13.0" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +checksum = 
"45c8babc29389186697fe5a2a4859d697825496b83db5d0b65271cdc0488e88c" dependencies = [ "cfg-if", "fnv", "lazy_static", + "libc", "memchr", - "parking_lot", + "parking_lot 0.12.1", + "procfs", "thiserror", ] +[[package]] +name = "prost" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae5a4388762d5815a9fc0dea33c56b021cdc8dde0c55e0c9ca57197254b0cab" +dependencies = [ + "bytes", + "cfg-if", + "cmake", + "heck", + "itertools", + "lazy_static", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "regex", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" +dependencies = [ + "bytes", + "prost", +] + [[package]] name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", + "atty", + "base64", + "bstr", "bytes", - "clap 3.0.14", + "clap 4.0.15", "futures", - "hashbrown 0.11.2", + "git-version", + "hashbrown", "hex", + "hmac 0.12.1", "hyper", - "lazy_static", + "itertools", "md5", - "parking_lot", + "metrics", + "once_cell", + "parking_lot 0.12.1", "pin-project-lite", + "pq_proto", "rand", "rcgen", "reqwest", - "rustls 0.19.1", + "routerify", + "rstest", + "rustls", + "rustls-pemfile", "scopeguard", "serde", "serde_json", + "sha2 0.10.6", + "socket2", + "thiserror", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", + "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.22.0", - "zenith_metrics", - "zenith_utils", + "tokio-rustls", + "tracing", + "tracing-subscriber", + "url", + "utils", + "uuid 1.2.1", + "workspace_hack", + "x509-parser", +] + +[[package]] +name = "quick-xml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" +dependencies = [ + "memchr", ] [[package]] name = "quote" -version = "1.0.15" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" dependencies = [ "proc-macro2", ] @@ -1647,9 +2655,9 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] @@ -1664,41 +2672,66 @@ dependencies = [ ] [[package]] -name = "rcgen" -version = "0.8.14" +name = "rayon" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5911d1403f4143c9d56a702069d593e8d0f3fab880a85e103604d0893ea31ba7" +checksum = 
"bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ - "chrono", - "pem 1.0.2", + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "rcgen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b" +dependencies = [ + "pem", "ring", + "time 0.3.15", "yasna", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "redox_users" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ "getrandom", "redox_syscall", + "thiserror", ] [[package]] name = "regex" -version = "1.5.4" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", @@ -1716,9 +2749,30 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" + +[[package]] +name = "remote_storage" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "metrics", + "once_cell", + "rusoto_core", + "rusoto_s3", + "serde", + "serde_json", + "tempfile", + "tokio", + "tokio-util", + "toml_edit", + "tracing", + "utils", + "workspace_hack", +] [[package]] name = "remove_dir_all" @@ -1731,11 +2785,11 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.9" +version = "0.11.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" +checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" dependencies = [ - "base64 0.13.0", + "base64", "bytes", "encoding_rs", "futures-core", @@ -1747,19 +2801,19 @@ dependencies = [ "hyper-rustls", "ipnet", "js-sys", - "lazy_static", "log", "mime", + "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.20.2", + "rustls", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.2", - "tokio-util", + "tokio-rustls", + "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -1768,6 +2822,15 @@ dependencies = [ "winreg", ] +[[package]] +name = "rgb" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3603b7d71ca82644f79b5a06d1220e9a58ede60bd32255f698cb1af8838b8db3" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = 
"0.16.20" @@ -1777,12 +2840,33 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", ] +[[package]] +name = "riscv" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" +dependencies = [ + "bare-metal 1.0.0", + "bit_field", + "riscv-target", +] + +[[package]] +name = "riscv-target" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "routerify" version = "3.0.0" @@ -1797,43 +2881,121 @@ dependencies = [ ] [[package]] -name = "rust-ini" -version = "0.17.0" +name = "rstar" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63471c4aa97a1cf8332a5f97709a79a4234698de6a1f5087faf66f2dae810e22" +checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" dependencies = [ - "cfg-if", - "ordered-multimap", + "heapless", + "num-traits", + "smallvec", ] [[package]] -name = "rust-s3" -version = "0.28.1" +name = "rstest" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dc0e521d1084d6950e050d4e2595f0fbdaa2b96bb795bab3d90a282288c5e49" +checksum = "e9c9dc66cc29792b663ffb5269be669f1613664e69ad56441fdb895c2347b930" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version 0.4.0", +] + +[[package]] +name = "rstest_macros" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5015e68a0685a95ade3eee617ff7101ab6a3fc689203101ca16ebc16f2b89c66" dependencies = [ - "anyhow", - "async-trait", - "aws-creds", - "aws-region", - "base64 0.13.0", "cfg-if", + "proc-macro2", + "quote", + "rustc_version 0.4.0", + "syn", +] + +[[package]] +name = "rusoto_core" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2" +dependencies = [ + "async-trait", + "base64", + "bytes", + "crc32fast", + "futures", + "http", + "hyper", + "hyper-tls", + "lazy_static", + "log", + "rusoto_credential", + "rusoto_signature", + "rustc_version 0.4.0", + "serde", + "serde_json", + "tokio", + "xml-rs", +] + +[[package]] +name = "rusoto_credential" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05" +dependencies = [ + "async-trait", "chrono", + "dirs-next", + "futures", + "hyper", + "serde", + "serde_json", + "shlex", + "tokio", + "zeroize", +] + +[[package]] +name = "rusoto_s3" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d" +dependencies = [ + "async-trait", + "bytes", + "futures", + "rusoto_core", + "xml-rs", +] + +[[package]] +name = "rusoto_signature" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272" +dependencies = [ + "base64", + "bytes", + "chrono", + "digest 0.9.0", + "futures", "hex", "hmac 0.11.0", "http", + "hyper", "log", - "maybe-async", - "md5", + "md-5 0.9.1", "percent-encoding", - "reqwest", + "pin-project-lite", + "rusoto_credential", + "rustc_version 0.4.0", 
"serde", - "serde-xml-rs", - "serde_derive", - "sha2", + "sha2 0.9.9", "tokio", - "tokio-stream", - "url", ] [[package]] @@ -1848,63 +3010,128 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.14", +] + +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", ] [[package]] name = "rustls" -version = "0.19.1" +version = "0.20.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64 0.13.0", - "log", - "ring", - "sct 0.6.1", - "webpki 0.21.4", -] - -[[package]] -name = "rustls" -version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d37e5e2290f3e040b594b1a9e04377c2c671f1a1cfd9bfdef82106ac1c113f84" +checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" dependencies = [ "log", "ring", - "sct 0.7.0", - "webpki 0.22.0", + "sct", + "webpki", ] [[package]] name = "rustls-pemfile" -version = "0.2.1" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9" +checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ - "base64 0.13.0", + "base64", ] [[package]] name = "rustls-split" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb079b52cfdb005752b7c3c646048e702003576a8321058e4c8b38227c11aa6" +checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3" dependencies = [ - "rustls 0.19.1", + "rustls", ] [[package]] -name = "ryu" +name = "rustversion" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" +checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + +[[package]] +name = "safekeeper" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "byteorder", + "bytes", + "clap 4.0.15", + "const_format", + "crc32c", + "etcd_broker", + "fs2", + "git-version", + "hex", + "humantime", + "hyper", + "metrics", + "nix 0.25.0", + "once_cell", + "parking_lot 0.12.1", + "postgres", + "postgres-protocol", + "postgres_ffi", + "pq_proto", + "regex", + "remote_storage", + "safekeeper_api", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "tempfile", + "thiserror", + "tokio", + "tokio-postgres", + "toml_edit", + "tracing", + "url", + "utils", + "workspace_hack", +] + +[[package]] +name = "safekeeper_api" 
+version = "0.1.0" +dependencies = [ + "const_format", + "serde", + "serde_with", + "utils", + "workspace_hack", +] [[package]] name = "same-file" @@ -1915,6 +3142,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" +dependencies = [ + "lazy_static", + "windows-sys", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -1922,14 +3159,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] -name = "sct" -version = "0.6.1" +name = "scratch" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" -dependencies = [ - "ring", - "untrusted", -] +checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" [[package]] name = "sct" @@ -1942,47 +3175,63 @@ dependencies = [ ] [[package]] -name = "semver" -version = "1.0.5" +name = "security-framework" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7" +checksum = "2bc1bb97804af6631813c55739f771071e0f2ed33ee20b68c86ec505d906356c" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0160a13a177a45bfb43ce71c01580998474f556ad854dcbca936dd2841a5c556" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.136" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" dependencies = [ "serde_derive", ] -[[package]] -name = "serde-xml-rs" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65162e9059be2f6a3421ebbb4fef3e74b7d9e7c60c50a0e292c6239f19f1edfa" -dependencies = [ - "log", - "serde", - "thiserror", - "xml-rs", -] - -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - [[package]] name = "serde_derive" -version = "1.0.136" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +checksum = 
"81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" dependencies = [ "proc-macro2", "quote", @@ -1991,11 +3240,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" dependencies = [ - "itoa 1.0.1", + "itoa", "ryu", "serde", ] @@ -2007,24 +3256,63 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.1", + "itoa", "ryu", "serde", ] +[[package]] +name = "serde_with" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f2d60d049ea019a84dcd6687b0d1e0030fe663ae105039bdf967ed5e6a9a7" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap", + "serde", + "serde_json", + "serde_with_macros", + "time 0.3.15", +] + +[[package]] +name = "serde_with_macros" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ccadfacf6cf10faad22bbadf55986bdd0856edfb5d9210aa1dcf1f516e84e93" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "sha2" version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if", "cpufeatures", - "digest", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "sha2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest 0.10.5", +] + [[package]] name = "sharded-slab" version = "0.1.4" @@ -2042,14 +3330,25 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "647c97df271007dcea485bb74ffdb57f2e683f1306c854f468a0c244badabf2d" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ "libc", "signal-hook-registry", ] +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -2061,38 +3360,42 @@ dependencies = [ [[package]] name = "simple_asn1" -version = "0.4.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692ca13de57ce0613a363c8c2f1de925adebc81b04c923ac60c5488bb44abe4b" +checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ - "chrono", "num-bigint", "num-traits", + "thiserror", + "time 0.3.15", ] [[package]] name = "siphasher" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.5" +version = "0.4.7" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" +checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef" +dependencies = [ + "autocfg", +] [[package]] name = "smallvec" -version = "1.8.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" [[package]] name = "socket2" -version = "0.4.4" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc", "winapi", @@ -2104,6 +3407,27 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.2" @@ -2114,18 +3438,31 @@ dependencies = [ "unicode-normalization", ] -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum_macros" +version = "0.24.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.4.1" @@ -2133,13 +3470,60 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] -name = "syn" -version = "1.0.86" +name = "svg_fmt" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" + +[[package]] +name = "symbolic-common" +version = "8.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid 0.8.2", +] + +[[package]] +name = "symbolic-demangle" +version = "8.8.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" dependencies = [ "proc-macro2", "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" + +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn", "unicode-xid", ] @@ -2168,44 +3552,42 @@ dependencies = [ "winapi", ] +[[package]] +name = "tenant_size_model" +version = "0.1.0" +dependencies = [ + "workspace_hack", +] + [[package]] name = "termcolor" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] [[package]] name = "textwrap" -version = "0.11.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - -[[package]] -name = "textwrap" -version = "0.14.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" +checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" [[package]] name = "thiserror" -version = "1.0.30" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.30" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" dependencies = [ "proc-macro2", "quote", @@ -2228,15 +3610,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" dependencies = [ "libc", - "wasi", + "wasi 0.10.0+wasi-snapshot-preview1", "winapi", ] [[package]] -name = "tinyvec" -version = "1.5.1" +name = "time" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c" +dependencies = [ + "itoa", + "libc", + "num_threads", + "serde", + "time-macros", +] + +[[package]] +name = "time-macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" + +[[package]] +name = "tinytemplate" +version = "1.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] @@ -2249,10 +3660,11 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.16.1" +version = "1.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c27a64b625de6d309e8c57716ba93021dccf1b3b5c97edd6d3dd2d2135afc0a" +checksum = "0020c875007ad96677dcc890298f4b942882c5d4eb7cc8f439fc3bf813dc9c95" dependencies = [ + "autocfg", "bytes", "libc", "memchr", @@ -2261,15 +3673,26 @@ dependencies = [ "once_cell", "pin-project-lite", "signal-hook-registry", + "socket2", "tokio-macros", "winapi", ] [[package]] -name = "tokio-macros" -version = "1.7.0" +name = "tokio-io-timeout" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" dependencies = [ "proc-macro2", "quote", @@ -2277,31 +3700,19 @@ dependencies = [ ] [[package]] -name = "tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7#2949d98df52587d562986aad155dd4e889e408b7" +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" dependencies = [ - "async-trait", - "byteorder", - "bytes", - "fallible-iterator", - "futures", - "log", - "parking_lot", - "percent-encoding", - "phf", - "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "socket2", + "native-tls", "tokio", - "tokio-util", ] [[package]] name = "tokio-postgres" -version = "0.7.1" -source = "git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858#9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" +version = "0.7.6" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" dependencies = [ "async-trait", "byteorder", @@ -2309,12 +3720,12 @@ dependencies = [ "fallible-iterator", "futures", "log", - "parking_lot", + "parking_lot 0.12.1", "percent-encoding", "phf", "pin-project-lite", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", - "postgres-types 0.2.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)", + "postgres-protocol", + "postgres-types", "socket2", "tokio", "tokio-util", @@ -2322,46 +3733,34 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" 
-version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd8c37d8c23cb6ecdc32fc171bade4e9c7f1be65f693a17afbaad02091a0a19" +checksum = "606f2b73660439474394432239c82249c0d45eb5f23d91f401be1e33590444a7" dependencies = [ "futures", "ring", - "rustls 0.19.1", + "rustls", "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tokio-rustls 0.22.0", - "webpki 0.21.4", + "tokio-postgres", + "tokio-rustls", ] [[package]] name = "tokio-rustls" -version = "0.22.0" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ - "rustls 0.19.1", + "rustls", "tokio", - "webpki 0.21.4", -] - -[[package]] -name = "tokio-rustls" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27d5f2b839802bd8267fa19b0530f5a08b9c08cd417976be2a65d130fe1c11b" -dependencies = [ - "rustls 0.20.2", - "tokio", - "webpki 0.22.0", + "webpki", ] [[package]] name = "tokio-stream" -version = "0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -2370,53 +3769,143 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.9" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" +checksum = "0bb2e075f03b3d66d8d8785356224ba688d2906a371015e225beeb65ca92c740" dependencies = [ "bytes", "futures-core", "futures-sink", - "log", "pin-project-lite", "tokio", + "tracing", ] [[package]] name = "toml" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.13.4" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744e9ed5b352340aa47ce033716991b5589e23781acb97cad37d4ea70560f55b" +checksum = "5376256e44f2443f8896ac012507c19a012df0fe8758b55246ae51a2279db51f" dependencies = [ "combine", "indexmap", "itertools", - "kstring", "serde", ] [[package]] -name = "tower-service" -version = "0.3.1" +name = "tonic" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "prost-derive", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "tower-layer", + "tower-service", + "tracing", + "tracing-futures", +] + +[[package]] +name = "tonic-build" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.30" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" +checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" dependencies = [ "cfg-if", + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -2424,9 +3913,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.19" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8276d9a4a3a558d7b7ad5303ad50b53d58264641b82914b7ada36bd762e7a716" +checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", @@ -2435,11 +3924,11 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.22" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" +checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ - "lazy_static", + "once_cell", "valuable", ] @@ -2455,9 +3944,9 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" dependencies = [ "lazy_static", "log", @@ -2465,21 +3954,34 @@ dependencies = [ ] [[package]] -name = "tracing-subscriber" -version = "0.3.8" +name = "tracing-serde" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74786ce43333fcf51efe947aed9718fbe46d5c7328ec3f1029e818083966d9aa" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" dependencies = [ - 
"ansi_term", - "lazy_static", "matchers", + "nu-ansi-term", + "once_cell", "regex", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -2496,30 +3998,36 @@ checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" [[package]] name = "unicode-bidi" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.19" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" dependencies = [ "tinyvec", ] [[package]] name = "unicode-width" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" [[package]] name = "unicode-xid" -version = "0.2.2" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" [[package]] name = "untrusted" @@ -2529,16 +4037,70 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.2" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" dependencies = [ "form_urlencoded", "idna", - "matches", "percent-encoding", ] +[[package]] +name = "utils" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "byteorder", + "bytes", + "criterion", + "git-version", + "hex", + "hex-literal", + "hyper", + "jsonwebtoken", + "metrics", + "nix 0.25.0", + "once_cell", + "pq_proto", + "rand", + "routerify", + "rustls", + "rustls-pemfile", + "rustls-split", + "serde", + "serde_json", + "serde_with", + "signal-hook", + "strum", + "strum_macros", + "tempfile", + "thiserror", + "tokio", + "tokio-rustls", + "tracing", + "tracing-subscriber", + "workspace_hack", +] + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + +[[package]] +name = "uuid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +dependencies = [ + "getrandom", + "serde", +] + [[package]] name = "valuable" version = "0.1.0" @@ -2546,10 +4108,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] -name = "vec_map" -version = "0.8.2" +name = "vcell" +version = "0.1.3" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "version_check" @@ -2557,6 +4125,36 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + +[[package]] +name = "volatile-register" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" +dependencies = [ + "vcell", +] + +[[package]] +name = "wal_craft" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap 4.0.15", + "env_logger", + "log", + "once_cell", + "postgres", + "postgres_ffi", + "tempfile", + "workspace_hack", +] + [[package]] name = "walkdir" version = "2.3.2" @@ -2568,40 +4166,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "walkeeper" -version = "0.1.0" -dependencies = [ - "anyhow", - "byteorder", - "bytes", - "clap 3.0.14", - "const_format", - "crc32c", - "daemonize", - "fs2", - "hex", - "humantime", - "hyper", - "lazy_static", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres_ffi", - "regex", - "rust-s3", - "serde", - "serde_json", - "signal-hook", - "tempfile", - "tokio", - "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "tracing", - "walkdir", - "workspace_hack", - "zenith_metrics", - "zenith_utils", -] - [[package]] name = "want" version = "0.3.0" @@ -2619,10 +4183,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" [[package]] -name = "wasm-bindgen" -version = "0.2.79" +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2630,13 +4200,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.79" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" dependencies = [ "bumpalo", - "lazy_static", "log", + "once_cell", "proc-macro2", "quote", "syn", @@ -2645,9 +4215,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.29" +version = "0.4.33" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +checksum = "23639446165ca5a5de86ae1d8896b737ae80319560fbaa4c2887b7da6e7ebd7d" dependencies = [ "cfg-if", "js-sys", @@ -2657,9 +4227,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.79" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2667,9 +4237,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.79" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" dependencies = [ "proc-macro2", "quote", @@ -2680,30 +4250,20 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.79" +version = "0.2.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" [[package]] name = "web-sys" -version = "0.3.56" +version = "0.3.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" dependencies = [ "js-sys", "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "webpki" version = "0.22.0" @@ -2716,30 +4276,24 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" +checksum = "368bfe657969fb01238bb756d351dcade285e0f6fcbd36dcb23359a5169975be" dependencies = [ - "webpki 0.22.0", + "webpki", ] [[package]] name = "which" -version = "4.2.4" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" +checksum = "1c831fbbee9e129a8cf93e7747a82da9d95ba8e16621cae60ec2cdc849bacb7b" dependencies = [ "either", - "lazy_static", "libc", + "once_cell", ] -[[package]] -name = "wildmatch" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c48bd20df7e4ced539c12f570f937c6b4884928a87fee70a479d72f031d4e0" - [[package]] name = "winapi" version = "0.3.9" @@ -2772,10 +4326,53 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "winreg" -version = "0.7.0" +name = "windows-sys" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + 
"windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + +[[package]] +name = "winreg" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ "winapi", ] @@ -2784,23 +4381,65 @@ dependencies = [ name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", + "anyhow", + "bytes", + "chrono", + "clap 4.0.15", + "crossbeam-utils", + "either", + "fail", + "futures-channel", + "futures-task", + "futures-util", + "hashbrown", + "indexmap", "libc", + "log", "memchr", + "nom", + "num-bigint", "num-integer", "num-traits", - "proc-macro2", - "quote", + "prost", + "rand", "regex", "regex-syntax", + "reqwest", + "scopeguard", "serde", + "stable_deref_trait", "syn", + "time 0.3.15", + "tokio", + "tokio-util", + "tracing", + "tracing-core", +] + +[[package]] +name = "x509-parser" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" +dependencies = [ + "asn1-rs", + "base64", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror", + "time 0.3.15", ] [[package]] name = "xattr" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" dependencies = [ "libc", ] @@ -2813,99 +4452,15 @@ checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" [[package]] name = "yasna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e262a29d0e61ccf2b6190d7050d4b237535fc76ce4c1210d9caa316f71dffa75" +checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c" dependencies = [ - "chrono", + "time 0.3.15", ] [[package]] -name = "zenith" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap 3.0.14", - "control_plane", - "pageserver", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres_ffi", - "serde_json", - "walkeeper", - "workspace_hack", - "zenith_utils", -] - -[[package]] -name = "zenith_metrics" -version = "0.1.0" -dependencies = [ - "lazy_static", - "libc", - "once_cell", - "prometheus", -] - 
-[[package]] -name = "zenith_utils" -version = "0.1.0" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "bytes", - "git-version", - "hex", - "hex-literal", - "hyper", - "jsonwebtoken", - "lazy_static", - "nix", - "pin-project-lite", - "postgres 0.19.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", - "rand", - "routerify", - "rustls 0.19.1", - "rustls-split", - "serde", - "serde_json", - "signal-hook", - "tempfile", - "thiserror", - "tokio", - "tracing", - "tracing-subscriber", - "webpki 0.21.4", - "workspace_hack", - "zenith_metrics", -] - -[[package]] -name = "zstd" -version = "0.10.0+zstd.1.5.2" +name = "zeroize" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b1365becbe415f3f0fcd024e2f7b45bacfb5bdd055f0dc113571394114e7bdd" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "4.1.4+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f7cd17c9af1a4d6c24beb1cc54b17e2ef7b593dc92f19e9d9acad8b182bbaee" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.6.3+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc49afa5c8d634e75761feda8c592051e7eeb4683ba827211eb0d731d3402ea8" -dependencies = [ - "cc", - "libc", -] +checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f" diff --git a/Cargo.toml b/Cargo.toml index b20e64a06f..0d73710bbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,23 @@ +# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the +# build work with older cargo versions. +# +# We have this because as of this writing, the latest cargo Debian package +# that's available is 1.56. (Confusingly, the Debian package version number +# is 0.57, whereas 'cargo --version' says 1.56.) +# +# See https://tracker.debian.org/pkg/cargo for the current status of the +# package. When that gets updated, we can remove this. +cargo-features = ["named-profiles"] + [workspace] members = [ "compute_tools", "control_plane", "pageserver", - "postgres_ffi", "proxy", - "walkeeper", + "safekeeper", "workspace_hack", - "zenith", - "zenith_metrics", - "zenith_utils", + "libs/*", ] [profile.release] @@ -17,7 +25,64 @@ members = [ # Besides, debug info should not affect the performance. 
debug = true -# This is only needed for proxy's tests -# TODO: we should probably fork tokio-postgres-rustls instead +# disable debug symbols for all packages except this one to decrease binaries size +[profile.release.package."*"] +debug = false + +[profile.release-line-debug] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +[profile.release-line-debug-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +lto = true + +[profile.release-line-debug-size] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-line-debug-zize] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +[profile.release-line-debug-size-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true +[profile.release-line-debug-zize-lto] +inherits = "release" +debug = 1 # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + +[profile.release-no-debug] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only + +[profile.release-no-debug-size] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +[profile.release-no-debug-zize] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" + +[profile.release-no-debug-size-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "s" +lto = true + +[profile.release-no-debug-zize-lto] +inherits = "release" +debug = false # true = 2 = all symbols, 1 = line only +opt-level = "z" +lto = true + + +# This is only needed for proxy's tests. +# TODO: we should probably fork `tokio-postgres-rustls` instead. [patch.crates-io] -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } diff --git a/Dockerfile b/Dockerfile index dd0dba60ca..b0d934d480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,64 +1,89 @@ -# -# Docker image for console integration testing. -# +### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries. +### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. +### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used +### inside this image in the real deployments. +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned + +# Build Postgres +FROM $REPOSITORY/$IMAGE:$TAG AS pg-build +WORKDIR /home/nonroot + +COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14 +COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15 +COPY --chown=nonroot pgxn pgxn +COPY --chown=nonroot Makefile Makefile +COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh -# -# Build Postgres separately --- this layer will be rebuilt only if one of -# mentioned paths will get any changes. 
-# -FROM zenithdb/build:buster AS pg-build -WORKDIR /zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile ENV BUILD_TYPE release -RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres -RUN rm -rf postgres_install/build +RUN set -e \ + && mold -run make -j $(nproc) -s neon-pg-ext \ + && rm -rf pg_install/build \ + && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz . -# -# Build zenith binaries -# -# TODO: build cargo deps as separate layer. We used cargo-chef before but that was -# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. -# -FROM zenithdb/build:buster AS build +# Build neon binaries +FROM $REPOSITORY/$IMAGE:$TAG AS build +WORKDIR /home/nonroot +ARG GIT_VERSION=local -ARG GIT_VERSION -RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi - -WORKDIR /zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build +ARG RUSTC_WRAPPER=cachepot +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY +COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server COPY . . -RUN GIT_VERSION=$GIT_VERSION cargo build --release +# Show build caching stats to check if it was used in the end. +# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. +RUN set -e \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \ + && cachepot -s + +# Build final image # -# Copy binaries to resulting image. 
-# -FROM debian:buster-slim +FROM debian:bullseye-slim WORKDIR /data -RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ - mkdir zenith_install +RUN set -e \ + && apt update \ + && apt install -y \ + libreadline-dev \ + libseccomp-dev \ + openssl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ + && useradd -d /data neon \ + && chown -R neon:neon /data -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install postgres_install -COPY docker-entrypoint.sh /docker-entrypoint.sh +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin -# Remove build artifacts (~ 500 MB) -RUN rm -rf postgres_install/build && \ - # 'Install' Postgres binaries locally - cp -r postgres_install/* /usr/local/ && \ - # Prepare an archive of Postgres binaries (should be around 11 MB) - # and keep it inside container for an ease of deploy pipeline. - cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \ - rm -rf postgres_install +COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ +COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ +COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ -RUN useradd -d /data zenith && chown -R zenith:zenith /data +# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config. +# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values. +RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ + && /usr/local/bin/pageserver -D /data/.neon/ --init \ + -c "id=1234" \ + -c "broker_endpoints=['http://etcd:2379']" \ + -c "pg_distrib_dir='/usr/local/'" \ + -c "listen_pg_addr='0.0.0.0:6400'" \ + -c "listen_http_addr='0.0.0.0:9898'" VOLUME ["/data"] -USER zenith +USER neon EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] -CMD ["pageserver"] +EXPOSE 9898 diff --git a/Dockerfile.alpine b/Dockerfile.alpine deleted file mode 100644 index dafb7eaf6b..0000000000 --- a/Dockerfile.alpine +++ /dev/null @@ -1,95 +0,0 @@ -# -# Docker image for console integration testing. -# -# We may also reuse it in CI to unify installation process and as a general binaries building -# tool for production servers. -# -# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls -# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust -# images which are statically linked and have guards against any dlopen. I would rather -# prefer all static binaries so we may change the way librocksdb-sys builds or wait until -# we will have our own storage and drop rockdb dependency. -# -# Cargo-chef is used to separate dependencies building from main binaries building. This -# way `docker build` will download and install dependencies only of there are changes to -# out Cargo.toml files. 
-# - - -# -# build postgres separately -- this layer will be rebuilt only if one of -# mentioned paths will get any changes -# -FROM alpine:3.13 as pg-build -RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \ - make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev -WORKDIR zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile -# Build using clang and lld -RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4 - -# -# Calculate cargo dependencies. -# This will always run, but only generate recipe.json with list of dependencies without -# installing them. -# -FROM alpine:20210212 as cargo-deps-inspect -RUN apk add --update rust cargo -RUN cargo install cargo-chef -WORKDIR zenith -COPY . . -RUN cargo chef prepare --recipe-path recipe.json - -# -# Build cargo dependencies. -# This temp cantainner would be build only if recipe.json was changed. -# -FROM alpine:20210212 as deps-build -RUN apk add --update rust cargo openssl-dev clang build-base -# rust-rocksdb can be built against system-wide rocksdb -- that saves about -# 10 minutes during build. Rocksdb apk package is in testing now, but use it -# anyway. In case of any troubles we can download and build rocksdb here manually -# (to cache it as a docker layer). -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/ -COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json -RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json - -# -# Build zenith binaries -# -FROM alpine:20210212 as build -RUN apk add --update rust cargo openssl-dev clang build-base -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY . . -# Copy cached dependencies -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=deps-build /zenith/target target -COPY --from=deps-build /root/.cargo /root/.cargo -RUN cargo build --release - -# -# Copy binaries to resulting image. -# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure -# out how to statically link rocksdb or avoid it at all). -# -FROM alpine:3.13 -RUN apk add --update openssl build-base libseccomp-dev -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install /usr/local -COPY docker-entrypoint.sh /docker-entrypoint.sh - -RUN addgroup zenith && adduser -h /data -D -G zenith zenith -VOLUME ["/data"] -WORKDIR /data -USER zenith -EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] -CMD ["pageserver"] diff --git a/Dockerfile.build b/Dockerfile.build deleted file mode 100644 index a9fd2cb0af..0000000000 --- a/Dockerfile.build +++ /dev/null @@ -1,16 +0,0 @@ -# -# Image with all the required dependencies to build https://github.com/zenithdb/zenith -# and Postgres from https://github.com/zenithdb/postgres -# Also includes some rust development and build tools. 
-# NB: keep in sync with rust image version in .circle/config.yml -# -FROM rust:1.56.1-slim-buster -WORKDIR /zenith - -# Install postgres and zenith build dependencies -# clang is for rocksdb -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libseccomp-dev pkg-config libssl-dev clang - -# Install rust tools -RUN rustup component add clippy && cargo install cargo-audit diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 new file mode 100644 index 0000000000..27e15593ad --- /dev/null +++ b/Dockerfile.compute-node-v14 @@ -0,0 +1,218 @@ +# +# This file is identical to the Dockerfile.compute-node-v15 file +# except for the version of Postgres that is built. +# + +ARG TAG=pinned + +######################################################################################### +# +# Layer "build-deps" +# +######################################################################################### +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev + +######################################################################################### +# +# Layer "pg-build" +# Build Postgres from the neon postgres repository. +# +######################################################################################### +FROM build-deps AS pg-build +COPY vendor/postgres-v14 postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install + +######################################################################################### +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. 
+# +######################################################################################### +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control + +######################################################################################### +# +# Layer "plv8-build" +# Build plv8 +# +######################################################################################### +FROM build-deps AS plv8-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. +RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold + +# Sed is used to patch for https://github.com/plv8/plv8/issues/503 +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + +######################################################################################### +# +# Layer "h3-pg-build" +# Build h3_pg +# +######################################################################################### +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control + +######################################################################################### +# +# Layer "neon-pg-ext-build" +# compile neon extensions +# +######################################################################################### +FROM build-deps AS neon-pg-ext-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + +######################################################################################### +# +# Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . +RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Clean up postgres folder before inclusion +# +######################################################################################### +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. 
+RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Final layer +# Put it all together into the final image +# +######################################################################################### +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl + +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. +RUN apt update && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 new file mode 100644 index 0000000000..567848ffd7 --- /dev/null +++ b/Dockerfile.compute-node-v15 @@ -0,0 +1,218 @@ +# +# This file is identical to the Dockerfile.compute-node-v14 file +# except for the version of Postgres that is built. +# + +ARG TAG=pinned + +######################################################################################### +# +# Layer "build-deps" +# +######################################################################################### +FROM debian:bullseye-slim AS build-deps +RUN apt update && \ + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev + +######################################################################################### +# +# Layer "pg-build" +# Build Postgres from the neon postgres repository. +# +######################################################################################### +FROM build-deps AS pg-build +COPY vendor/postgres-v15 postgres +RUN cd postgres && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install + +######################################################################################### +# +# Layer "postgis-build" +# Build PostGIS from the upstream PostGIS mirror. 
+# +######################################################################################### +FROM build-deps AS postgis-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc + +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ + ./autogen.sh && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + ./configure && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + cd extensions/postgis && \ + make clean && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control + +######################################################################################### +# +# Layer "plv8-build" +# Build plv8 +# +######################################################################################### +FROM build-deps AS plv8-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +RUN apt update && \ + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. +RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold + +# Sed is used to patch for https://github.com/plv8/plv8/issues/503 +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ + tar xvzf v3.1.4.tar.gz && \ + cd plv8-3.1.4 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ + rm -rf /plv8-* && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control + +######################################################################################### +# +# Layer "h3-pg-build" +# Build h3_pg +# +######################################################################################### +FROM build-deps AS h3-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# packaged cmake is too old +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh + +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ + tar xvzf h3.tgz && \ + cd h3-4.0.1 && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCMAKE_BUILD_TYPE=Release && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 make install && \ + cp -R /h3/usr / && \ + rm -rf build + +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ + tar xvzf h3-pg.tgz && \ + cd h3-pg-4.0.1 && \ + export PATH="/usr/local/pgsql/bin:$PATH" && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control + +######################################################################################### +# +# Layer "neon-pg-ext-build" +# compile neon extensions +# +######################################################################################### +FROM build-deps AS neon-pg-ext-build +COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=h3-pg-build /h3/usr / +COPY pgxn/ pgxn/ + +RUN make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon \ + -s install + +######################################################################################### +# +# Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### +FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . +RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Clean up postgres folder before inclusion +# +######################################################################################### +FROM neon-pg-ext-build AS postgres-cleanup-layer +COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql + +# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) +RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp + +# Remove headers that we won't need anymore - we've completed installation of all extensions +RUN rm -r /usr/local/pgsql/include + +# Remove now-useless PGXS src infrastructure +RUN rm -r /usr/local/pgsql/lib/pgxs/src + +# Remove static postgresql libraries - all compilation is finished, so we +# can now remove these files - they must be included in other binaries by now +# if they were to be used by other libraries. 
+RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Final layer +# Put it all together into the final image +# +######################################################################################### +FROM debian:bullseye-slim +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute && \ + echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local +COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl + +# Install: +# libreadline8 for psql +# libossp-uuid16 for extension ossp-uuid +# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS +# +# Lastly, link compute_ctl into zenith_ctl while we're at it, +# so that we don't need to put this in another layer. +RUN apt update && \ + apt install --no-install-recommends -y \ + libreadline8 \ + libossp-uuid16 \ + libgeos-c1v5 \ + libgdal28 \ + libproj19 \ + libprotobuf-c1 && \ + rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +USER postgres +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index a1f7582ee4..8231cd0ebb 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,14 +1,29 @@ # First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .circle/config.yml -FROM rust:1.56.1-slim-buster AS rust-build +# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust +ARG TAG=pinned -WORKDIR /zenith +FROM $REPOSITORY/$IMAGE:$TAG AS rust-build +WORKDIR /home/nonroot + +# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. +# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. +# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. +ARG RUSTC_WRAPPER=cachepot +ENV AWS_REGION=eu-central-1 +ENV CACHEPOT_S3_KEY_PREFIX=cachepot +ARG CACHEPOT_BUCKET=neon-github-dev +#ARG AWS_ACCESS_KEY_ID +#ARG AWS_SECRET_ACCESS_KEY COPY . . 
-RUN cargo build -p compute_tools --release +RUN set -e \ + && mold -run cargo build -p compute_tools --locked --release \ + && cachepot -s # Final image that only has one binary -FROM debian:buster-slim +FROM debian:bullseye-slim -COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index ef26ceee2d..6e8b659171 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,7 @@ -# Seccomp BPF is only available for Linux -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - SECCOMP = --with-libseccomp -else - SECCOMP = -endif +ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Where to install Postgres, default is ./pg_install, maybe useful for package managers +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ # # We differentiate between release / debug build types using the BUILD_TYPE @@ -12,17 +9,37 @@ endif # BUILD_TYPE ?= debug ifeq ($(BUILD_TYPE),release) - PG_CONFIGURE_OPTS = --enable-debug + PG_CONFIGURE_OPTS = --enable-debug --with-openssl PG_CFLAGS = -O2 -g3 $(CFLAGS) # Unfortunately, `--profile=...` is a nightly feature CARGO_BUILD_FLAGS += --release else ifeq ($(BUILD_TYPE),debug) - PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend + PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend PG_CFLAGS = -O0 -g3 $(CFLAGS) else -$(error Bad build type `$(BUILD_TYPE)', see Makefile for options) + $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +# Seccomp BPF is only available for Linux +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + PG_CONFIGURE_OPTS += --with-libseccomp +endif + +# macOS with brew-installed openssl requires explicit paths +# It can be configured with OPENSSL_PREFIX variable +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib +endif + +# Use -C option so that when PostgreSQL "make install" installs the +# headers, the mtime of the headers are not changed when there have +# been no changes to the files. Changing the mtime triggers an +# unnecessary rebuild of 'postgres_ffi'. +PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C' + # Choose whether we should be silent or verbose CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose) # Fix for a corner case when make doesn't pass a jobserver @@ -35,64 +52,154 @@ CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 # -# Top level Makefile to build Zenith and PostgreSQL +# Top level Makefile to build Neon and PostgreSQL # .PHONY: all -all: zenith postgres +all: neon postgres neon-pg-ext -### Zenith Rust bits +### Neon Rust bits # # The 'postgres_ffi' depends on the Postgres headers. -.PHONY: zenith -zenith: postgres-headers - +@echo "Compiling Zenith" +.PHONY: neon +neon: postgres-v14-headers postgres-v15-headers + +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -tmp_install/build/config.status: - +@echo "Configuring postgres build" - mkdir -p tmp_install/build - (cd tmp_install/build && \ - ../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ +# The rules are duplicated for Postgres v14 and 15. 
We may want to refactor +# to avoid the duplication in the future, but it's tolerable for now. +# +$(POSTGRES_INSTALL_DIR)/build/v14/config.status: + +@echo "Configuring Postgres v14 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 + (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - $(SECCOMP) \ - --prefix=$(abspath tmp_install) > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) -# nicer alias for running 'configure' -.PHONY: postgres-configure -postgres-configure: tmp_install/build/config.status +$(POSTGRES_INSTALL_DIR)/build/v15/config.status: + +@echo "Configuring Postgres v15 build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 + (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ + $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \ + $(PG_CONFIGURE_OPTS) \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) -# Install the PostgreSQL header files into tmp_install/include -.PHONY: postgres-headers -postgres-headers: postgres-configure - +@echo "Installing PostgreSQL headers" - $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install +# nicer alias to run 'configure' +.PHONY: postgres-v14-configure +postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status -# Compile and install PostgreSQL and contrib/zenith -.PHONY: postgres -postgres: postgres-configure \ - postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` - +@echo "Compiling PostgreSQL" - $(MAKE) -C tmp_install/build MAKELEVEL=0 install - +@echo "Compiling contrib/zenith" - $(MAKE) -C tmp_install/build/contrib/zenith install - +@echo "Compiling contrib/zenith_test_utils" - $(MAKE) -C tmp_install/build/contrib/zenith_test_utils install +.PHONY: postgres-v15-configure +postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status -.PHONY: postgres-clean -postgres-clean: - $(MAKE) -C tmp_install/build MAKELEVEL=0 clean +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +.PHONY: postgres-v14-headers +postgres-v14-headers: postgres-v14-configure + +@echo "Installing PostgreSQL v14 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install + +.PHONY: postgres-v15-headers +postgres-v15-headers: postgres-v15-configure + +@echo "Installing PostgreSQL v15 headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install + +# Compile and install PostgreSQL +.PHONY: postgres-v14 +postgres-v14: postgres-v14-configure \ + postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install + +@echo "Compiling libpq v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install + +@echo "Compiling pageinspect v14" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install + +.PHONY: postgres-v15 +postgres-v15: postgres-v15-configure \ + postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install + +@echo "Compiling libpq v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install + +@echo "Compiling pg_buffercache v15" + $(MAKE) -C 
$(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install + +@echo "Compiling pageinspect v15" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install + +# shorthand to build all Postgres versions +postgres: postgres-v14 postgres-v15 + +.PHONY: postgres-v14-clean +postgres-v14-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean + +.PHONY: postgres-v15-clean +postgres-v15-clean: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean + +neon-pg-ext-v14: postgres-v14 + +@echo "Compiling neon v14" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_walredo v14" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) + +@echo "Compiling neon_test_utils" v14 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) + +neon-pg-ext-v15: postgres-v15 + +@echo "Compiling neon v15" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) + +@echo "Compiling neon_walredo v15" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) + +@echo "Compiling neon_test_utils" v15 + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 + (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) + +.PHONY: neon-pg-ext-clean + $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean + $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean + +neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 +postgres-headers: postgres-v14-headers postgres-v15-headers +postgres-clean: postgres-v14-clean postgres-v15-clean # This doesn't remove the effects of 'configure'. 
.PHONY: clean clean: - cd tmp_install/build && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean + cd pgxn/neon && $(MAKE) clean + cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean distclean: - rm -rf tmp_install + rm -rf $(POSTGRES_INSTALL_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -101,4 +208,4 @@ fmt: .PHONY: setup-pre-commit-hook setup-pre-commit-hook: - ln -s -f ../../pre-commit.py .git/hooks/pre-commit + ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000000..4fbec9763b --- /dev/null +++ b/NOTICE @@ -0,0 +1,5 @@ +Neon +Copyright 2022 Neon Inc. + +The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the +PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT. diff --git a/README.md b/README.md index 8dd407f41a..e9c30668e0 100644 --- a/README.md +++ b/README.md @@ -1,82 +1,160 @@ -# Zenith +# Neon -Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes. +Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. + +The project used to be called "Zenith". Many of the commands and code comments +still refer to "zenith", but we are in the process of renaming things. + +## Quick start +[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. + +Alternatively, compile and run the project [locally](#running-local-installation). ## Architecture overview -A Zenith installation consists of compute nodes and Zenith storage engine. +A Neon installation consists of compute nodes and a Neon storage engine. -Compute nodes are stateless PostgreSQL nodes, backed by Zenith storage engine. +Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. -Zenith storage engine consists of two major components: -- Pageserver. Scalable storage backend for compute nodes. -- WAL service. The service that receives WAL from compute node and ensures that it is stored durably. +The Neon storage engine consists of two major components: +- Pageserver. Scalable storage backend for the compute nodes. +- WAL service. The service receives WAL from the compute node and ensures that it is stored durably. Pageserver consists of: -- Repository - Zenith storage implementation. +- Repository - Neon storage implementation. - WAL receiver - service that receives WAL from WAL service and stores it in the repository. - Page service - service that communicates with compute nodes and responds with pages from the repository. -- WAL redo - service that builds pages from base images and WAL records on Page service request. +- WAL redo - service that builds pages from base images and WAL records on Page service request ## Running local installation -1. Install build dependencies and other useful packages -On Ubuntu or Debian this set of packages should be sufficient to build the code: -```text +#### Installing dependencies on Linux +1. 
Install build dependencies and other applicable packages + +* On Ubuntu or Debian, this set of packages should be sufficient to build the code: +```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev +libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client +``` +* On Fedora, these packages are needed: +```bash +dnf install flex bison readline-devel zlib-devel openssl-devel \ + libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib ``` -[Rust] 1.56.1 or later is also required. +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` -To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively. +#### Installing dependencies on OSX (12.3.1) +1. Install Xcode and dependencies +``` +xcode-select --install +brew install protobuf etcd openssl +``` + +2. [Install Rust](https://www.rust-lang.org/tools/install) +``` +# recommended approach from https://www.rust-lang.org/tools/install +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +3. Install PostgreSQL Client +``` +# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos +brew install libpq +brew link --force libpq +``` + +#### Rustc version + +The project uses a [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds. + +This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file), which installs (if absent) and uses the toolchain version pinned in the file. + +rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. + +Non-rustup users will most probably not get the same toolchain automatically from the file, so they are responsible for manually verifying that their toolchain matches the version in the file. +Newer rustc versions will most probably work fine, but older ones might not be supported due to new features used by the project or its crates. + +#### Building on Linux + +1. Build neon and patched postgres +``` +# Note: The path to the neon sources cannot contain a space. + +git clone --recursive https://github.com/neondatabase/neon.git +cd neon + +# The preferred and default option is to make a debug build. This will create a +# noticeably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`nproc`" + +make -j`nproc` +``` + +#### Building on OSX + +1. Build neon and patched postgres +``` +# Note: The path to the neon sources cannot contain a space. + +git clone --recursive https://github.com/neondatabase/neon.git +cd neon + +# The preferred and default option is to make a debug build. This will create a +# noticeably slower build than a release build. For a release build, +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" + +make -j`sysctl -n hw.logicalcpu` +``` + +#### Dependency installation notes +To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
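For example, a minimal sketch of the `PATH`/`LD_LIBRARY_PATH` approach (this assumes the default `pg_install` directory under the repository root; adjust the paths if your layout differs):

```sh
# Hypothetical environment setup for using the locally built client binaries.
export PATH="$PWD/pg_install/bin:$PATH"
export LD_LIBRARY_PATH="$PWD/pg_install/lib:$LD_LIBRARY_PATH"
psql --version  # should now resolve to the freshly built client
```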
To run the integration tests or Python scripts (not required to use the code), install -Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory. +Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory. -2. Build zenith and patched postgres -```sh -git clone --recursive https://github.com/zenithdb/zenith.git -cd zenith -make -j5 -``` -3. Start pageserver and postgres on top of it (should be called from repo root): +#### Running neon database +1. Start pageserver and postgres on top of it (should be called from repo root): ```sh -# Create repository in .zenith with proper paths to binaries and data +# Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script -> ./target/debug/zenith init -initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229 -created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8 -created main branch -pageserver init succeeded +> ./target/debug/neon_local init +Starting pageserver at '127.0.0.1:64000' in '.neon' + +Pageserver started +Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7 +Stopping pageserver gracefully...done! # start pageserver and safekeeper -> ./target/debug/zenith start -Starting pageserver at 'localhost:64000' in '.zenith' +> ./target/debug/neon_local start +Starting etcd broker using /usr/bin/etcd +Starting pageserver at '127.0.0.1:64000' in '.neon' + Pageserver started -initializing for single for 7676 -Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single' +Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started # start postgres compute node -> ./target/debug/zenith pg start main -Starting new postgres main on main... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' -waiting for server to start.... done -server started +> ./target/debug/neon_local pg start main +Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 +Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances -> ./target/debug/zenith pg list -BRANCH ADDRESS LSN STATUS -main 127.0.0.1:55432 0/1609610 running +> ./target/debug/neon_local pg list + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running ``` -4. Now it is possible to connect to postgres and run some queries: +2. Now, it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -88,25 +166,32 @@ postgres=# select * from t; (1 row) ``` -5. And create branches and run postgres on them: +3. 
And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith branch migration_check main -Created branch 'migration_check' at 0/1609610 +> ./target/debug/neon_local timeline branch --branch-name migration_check +Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith branch - main - ┗━ @0/1609610: migration_check +> ./target/debug/neon_local timeline list +(L) main [de200bd42b49cc1814412c7e592dd6e9] +(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601] # start postgres on that branch -> ./target/debug/zenith pg start migration_check -Starting postgres node at 'host=127.0.0.1 port=55433 user=stas' -waiting for server to start.... done +> ./target/debug/neon_local pg start migration_check --branch-name migration_check +Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 +Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' + +# check the new list of running postgres instances +> ./target/debug/neon_local pg list + NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS + main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running + migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -115,19 +200,31 @@ postgres=# select * from t; postgres=# insert into t values(2,2); INSERT 0 1 + +# check that the new change doesn't affect the 'main' postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) ``` -6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances - you have just started. You can stop them all with one command: +4. If you want to run tests afterward (see below), you must stop all the pageserver, safekeeper, and postgres instances + you have just started. You can stop them all with one command: ```sh -> ./target/debug/zenith stop +> ./target/debug/neon_local stop ``` ## Running tests +Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). + ```sh -git clone --recursive https://github.com/zenithdb/zenith.git -make # builds also postgres and installs it to ./tmp_install +git clone --recursive https://github.com/neondatabase/neon.git + +CARGO_BUILD_FLAGS="--features=testing" make + ./scripts/pytest ``` @@ -141,14 +238,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d ### Postgres-specific terms -Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used. -Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
+Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used. +The same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use. To get more familiar with this aspect, refer to: -- [Zenith glossary](/docs/glossary.md) -- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html) -- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres)) +- [Neon glossary](/docs/glossary.md) +- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html) +- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres)) ## Join the development diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 3adf762dcb..d6f8fae34c 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -4,16 +4,20 @@ version = "0.1.0" edition = "2021" [dependencies] -libc = "0.2" anyhow = "1.0" chrono = "0.4" -clap = "3.0" +clap = "4.0" env_logger = "0.9" +futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" } +notify = "5.0.0" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" -tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] } +tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +url = "2.2.2" +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/README.md b/compute_tools/README.md index ccae3d2842..97a7513344 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -1,9 +1,9 @@ # Compute node tools -Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -`ExecStart` option. It will handle all the `zenith` specifics during compute node +Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +`ExecStart` option. It will handle all the `Neon` specifics during compute node initialization: -- `zenith_ctl` accepts cluster (compute node) specification as a JSON file. +- `compute_ctl` accepts cluster (compute node) specification as a JSON file. - Every start is a fresh start, so the data directory is removed and initialized again on each run. - Next it will put configuration files into the `PGDATA` directory. @@ -13,18 +13,18 @@ initialization: - Check and alter/drop/create roles and databases. - Hang waiting on the `postmaster` process to exit. -Also `zenith_ctl` spawns two separate service threads: +Also `compute_ctl` spawns two separate service threads: - `compute-monitor` checks the last Postgres activity timestamp and saves it - into the shared `ComputeState`; + into the shared `ComputeNode`; - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the last activity requests. 
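As a rough illustration (not part of this change), the `http-endpoint` thread listens on port 3080 in the server code added below, so its routes can be probed with plain `curl`:

```sh
# Hypothetical local probes of the compute_ctl HTTP API; adjust host/port as needed.
curl -s http://localhost:3080/status        # serialized compute state as JSON
curl -s http://localhost:3080/metrics.json  # startup metrics in JSON format
```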
Usage example: ```sh -zenith_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ - -S /var/db/postgres/specs/current.json \ - -b /usr/local/bin/postgres +compute_ctl -D /var/db/postgres/compute \ + -C 'postgresql://cloud_admin@localhost/postgres' \ + -S /var/db/postgres/specs/current.json \ + -b /usr/local/bin/postgres ``` ## Tests diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs new file mode 100644 index 0000000000..7786d7af9c --- /dev/null +++ b/compute_tools/src/bin/compute_ctl.rs @@ -0,0 +1,186 @@ +//! +//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` +//! `ExecStart` option. It will handle all the `Neon` specifics during compute node +//! initialization: +//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file. +//! - Every start is a fresh start, so the data directory is removed and +//! initialized again on each run. +//! - Next it will put configuration files into the `PGDATA` directory. +//! - Sync safekeepers and get commit LSN. +//! - Get `basebackup` from pageserver using the returned on the previous step LSN. +//! - Try to start `postgres` and wait until it is ready to accept connections. +//! - Check and alter/drop/create roles and databases. +//! - Hang waiting on the `postmaster` process to exit. +//! +//! Also `compute_ctl` spawns two separate service threads: +//! - `compute-monitor` checks the last Postgres activity timestamp and saves it +//! into the shared `ComputeNode`; +//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the +//! last activity requests. +//! +//! Usage example: +//! ```sh +//! compute_ctl -D /var/db/postgres/compute \ +//! -C 'postgresql://cloud_admin@localhost/postgres' \ +//! -S /var/db/postgres/specs/current.json \ +//! -b /usr/local/bin/postgres +//! ``` +//! +use std::fs::File; +use std::panic; +use std::path::Path; +use std::process::exit; +use std::sync::{Arc, RwLock}; +use std::{thread, time::Duration}; + +use anyhow::{Context, Result}; +use chrono::Utc; +use clap::Arg; +use log::{error, info}; + +use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_tools::http::api::launch_http_server; +use compute_tools::logger::*; +use compute_tools::monitor::launch_monitor; +use compute_tools::params::*; +use compute_tools::pg_helpers::*; +use compute_tools::spec::*; +use url::Url; + +fn main() -> Result<()> { + // TODO: re-use `utils::logging` later + init_logger(DEFAULT_LOG_LEVEL)?; + + let matches = cli().get_matches(); + + let pgdata = matches + .get_one::("pgdata") + .expect("PGDATA path is required"); + let connstr = matches + .get_one::("connstr") + .expect("Postgres connection string is required"); + let spec = matches.get_one::("spec"); + let spec_path = matches.get_one::("spec-path"); + + // Try to use just 'postgres' if no path is provided + let pgbin = matches.get_one::("pgbin").unwrap(); + + let spec: ComputeSpec = match spec { + // First, try to get cluster spec from the cli argument + Some(json) => serde_json::from_str(json)?, + None => { + // Second, try to read it from the file if path is provided + if let Some(sp) = spec_path { + let path = Path::new(sp); + let file = File::open(path)?; + serde_json::from_reader(file)? 
+ } else { + panic!("cluster spec should be provided via --spec or --spec-path argument"); + } + } + }; + + let pageserver_connstr = spec + .cluster + .settings + .find("neon.pageserver_connstring") + .expect("pageserver connstr should be provided"); + let tenant = spec + .cluster + .settings + .find("neon.tenant_id") + .expect("tenant id should be provided"); + let timeline = spec + .cluster + .settings + .find("neon.timeline_id") + .expect("tenant id should be provided"); + + let compute_state = ComputeNode { + start_time: Utc::now(), + connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + pgdata: pgdata.to_string(), + pgbin: pgbin.to_string(), + spec, + tenant, + timeline, + pageserver_connstr, + metrics: ComputeMetrics::new(), + state: RwLock::new(ComputeState::new()), + }; + let compute = Arc::new(compute_state); + + // Launch service threads first, so we were able to serve availability + // requests, while configuration is still in progress. + let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); + + // Run compute (Postgres) and hang waiting on it. + match compute.prepare_and_run() { + Ok(ec) => { + let code = ec.code().unwrap_or(1); + info!("Postgres exited with code {}, shutting down", code); + exit(code) + } + Err(error) => { + error!("could not start the compute node: {:?}", error); + + let mut state = compute.state.write().unwrap(); + state.error = Some(format!("{:?}", error)); + state.status = ComputeStatus::Failed; + drop(state); + + // Keep serving HTTP requests, so the cloud control plane was able to + // get the actual error. + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + Err(error) + } + } +} + +fn cli() -> clap::Command { + // Env variable is set by `cargo` + let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); + clap::Command::new("compute_ctl") + .version(version) + .arg( + Arg::new("connstr") + .short('C') + .long("connstr") + .value_name("DATABASE_URL") + .required(true), + ) + .arg( + Arg::new("pgdata") + .short('D') + .long("pgdata") + .value_name("DATADIR") + .required(true), + ) + .arg( + Arg::new("pgbin") + .short('b') + .long("pgbin") + .default_value("postgres") + .value_name("POSTGRES_PATH"), + ) + .arg( + Arg::new("spec") + .short('s') + .long("spec") + .value_name("SPEC_JSON"), + ) + .arg( + Arg::new("spec-path") + .short('S') + .long("spec-path") + .value_name("SPEC_PATH"), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert() +} diff --git a/compute_tools/src/bin/zenith_ctl.rs b/compute_tools/src/bin/zenith_ctl.rs deleted file mode 100644 index 49ba653fa1..0000000000 --- a/compute_tools/src/bin/zenith_ctl.rs +++ /dev/null @@ -1,249 +0,0 @@ -//! -//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd` -//! `ExecStart` option. It will handle all the `zenith` specifics during compute node -//! initialization: -//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file. -//! - Every start is a fresh start, so the data directory is removed and -//! initialized again on each run. -//! - Next it will put configuration files into the `PGDATA` directory. -//! - Sync safekeepers and get commit LSN. -//! - Get `basebackup` from pageserver using the returned on the previous step LSN. -//! 
- Try to start `postgres` and wait until it is ready to accept connections. -//! - Check and alter/drop/create roles and databases. -//! - Hang waiting on the `postmaster` process to exit. -//! -//! Also `zenith_ctl` spawns two separate service threads: -//! - `compute-monitor` checks the last Postgres activity timestamp and saves it -//! into the shared `ComputeState`; -//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the -//! last activity requests. -//! -//! Usage example: -//! ```sh -//! zenith_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ -//! -b /usr/local/bin/postgres -//! ``` -//! -use std::fs::File; -use std::panic; -use std::path::Path; -use std::process::{exit, Command, ExitStatus}; -use std::sync::{Arc, RwLock}; - -use anyhow::{Context, Result}; -use chrono::Utc; -use clap::Arg; -use log::info; -use postgres::{Client, NoTls}; - -use compute_tools::config; -use compute_tools::http_api::launch_http_server; -use compute_tools::logger::*; -use compute_tools::monitor::launch_monitor; -use compute_tools::params::*; -use compute_tools::pg_helpers::*; -use compute_tools::spec::*; -use compute_tools::zenith::*; - -/// Do all the preparations like PGDATA directory creation, configuration, -/// safekeepers sync, basebackup, etc. -fn prepare_pgdata(state: &Arc>) -> Result<()> { - let state = state.read().unwrap(); - let spec = &state.spec; - let pgdata_path = Path::new(&state.pgdata); - let pageserver_connstr = spec - .cluster - .settings - .find("zenith.page_server_connstring") - .expect("pageserver connstr should be provided"); - let tenant = spec - .cluster - .settings - .find("zenith.zenith_tenant") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("zenith.zenith_timeline") - .expect("tenant id should be provided"); - - info!( - "starting cluster #{}, operation #{}", - spec.cluster.cluster_id, - spec.operation_uuid.as_ref().unwrap() - ); - - // Remove/create an empty pgdata directory and put configuration there. - create_pgdata(&state.pgdata)?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; - - info!("starting safekeepers syncing"); - let lsn = sync_safekeepers(&state.pgdata, &state.pgbin) - .with_context(|| "failed to sync safekeepers")?; - info!("safekeepers synced at LSN {}", lsn); - - info!( - "getting basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ); - get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context( - || { - format!( - "failed to get basebackup@{} from pageserver {}", - lsn, pageserver_connstr - ) - }, - )?; - - // Update pg_hba.conf received with basebackup. - update_pg_hba(pgdata_path)?; - - Ok(()) -} - -/// Start Postgres as a child process and manage DBs/roles. -/// After that this will hang waiting on the postmaster process to exit. -fn run_compute(state: &Arc>) -> Result { - let read_state = state.read().unwrap(); - let pgdata_path = Path::new(&read_state.pgdata); - - // Run postgres as a child process. 
- let mut pg = Command::new(&read_state.pgbin) - .args(&["-D", &read_state.pgdata]) - .spawn() - .expect("cannot start postgres process"); - - // Try default Postgres port if it is not provided - let port = read_state - .spec - .cluster - .settings - .find("port") - .unwrap_or_else(|| "5432".to_string()); - wait_for_postgres(&port, pgdata_path)?; - - let mut client = Client::connect(&read_state.connstr, NoTls)?; - - handle_roles(&read_state.spec, &mut client)?; - handle_databases(&read_state.spec, &mut client)?; - - // 'Close' connection - drop(client); - - info!( - "finished configuration of cluster #{}", - read_state.spec.cluster.cluster_id - ); - - // Release the read lock. - drop(read_state); - - // Get the write lock, update state and release the lock, so HTTP API - // was able to serve requests, while we are blocked waiting on - // Postgres. - let mut state = state.write().unwrap(); - state.ready = true; - drop(state); - - // Wait for child postgres process basically forever. In this state Ctrl+C - // will be propagated to postgres and it will be shut down as well. - let ecode = pg.wait().expect("failed to wait on postgres"); - - Ok(ecode) -} - -fn main() -> Result<()> { - // TODO: re-use `zenith_utils::logging` later - init_logger(DEFAULT_LOG_LEVEL)?; - - // Env variable is set by `cargo` - let version: Option<&str> = option_env!("CARGO_PKG_VERSION"); - let matches = clap::App::new("zenith_ctl") - .version(version.unwrap_or("unknown")) - .arg( - Arg::new("connstr") - .short('C') - .long("connstr") - .value_name("DATABASE_URL") - .required(true), - ) - .arg( - Arg::new("pgdata") - .short('D') - .long("pgdata") - .value_name("DATADIR") - .required(true), - ) - .arg( - Arg::new("pgbin") - .short('b') - .long("pgbin") - .value_name("POSTGRES_PATH"), - ) - .arg( - Arg::new("spec") - .short('s') - .long("spec") - .value_name("SPEC_JSON"), - ) - .arg( - Arg::new("spec-path") - .short('S') - .long("spec-path") - .value_name("SPEC_PATH"), - ) - .get_matches(); - - let pgdata = matches.value_of("pgdata").expect("PGDATA path is required"); - let connstr = matches - .value_of("connstr") - .expect("Postgres connection string is required"); - let spec = matches.value_of("spec"); - let spec_path = matches.value_of("spec-path"); - - // Try to use just 'postgres' if no path is provided - let pgbin = matches.value_of("pgbin").unwrap_or("postgres"); - - let spec: ClusterSpec = match spec { - // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, - None => { - // Second, try to read it from the file if path is provided - if let Some(sp) = spec_path { - let path = Path::new(sp); - let file = File::open(path)?; - serde_json::from_reader(file)? - } else { - panic!("cluster spec should be provided via --spec or --spec-path argument"); - } - } - }; - - let compute_state = ComputeState { - connstr: connstr.to_string(), - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - spec, - ready: false, - last_active: Utc::now(), - }; - let compute_state = Arc::new(RwLock::new(compute_state)); - - // Launch service threads first, so we were able to serve availability - // requests, while configuration is still in progress. - let mut _threads = vec![ - launch_http_server(&compute_state).expect("cannot launch compute monitor thread"), - launch_monitor(&compute_state).expect("cannot launch http endpoint thread"), - ]; - - prepare_pgdata(&compute_state)?; - - // Run compute (Postgres) and hang waiting on it. 
Panic if any error happens, - // it will help us to trigger unwind and kill postmaster as well. - match run_compute(&compute_state) { - Ok(ec) => exit(ec.success() as i32), - Err(error) => panic!("cannot start compute node, error: {}", error), - } -} diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs new file mode 100644 index 0000000000..b6ba1692f9 --- /dev/null +++ b/compute_tools/src/checker.rs @@ -0,0 +1,43 @@ +use anyhow::{anyhow, Result}; +use log::error; +use postgres::Client; +use tokio_postgres::NoTls; + +use crate::compute::ComputeNode; + +pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { + let query = " + CREATE TABLE IF NOT EXISTS health_check ( + id serial primary key, + updated_at timestamptz default now() + ); + INSERT INTO health_check VALUES (1, now()) + ON CONFLICT (id) DO UPDATE + SET updated_at = now();"; + let result = client.simple_query(query)?; + if result.len() < 2 { + return Err(anyhow::format_err!("executed {} queries", result.len())); + } + Ok(()) +} + +pub async fn check_writability(compute: &ComputeNode) -> Result<()> { + let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + if client.is_closed() { + return Err(anyhow!("connection to postgres closed")); + } + tokio::spawn(async move { + if let Err(e) = connection.await { + error!("connection error: {}", e); + } + }); + + let result = client + .simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;") + .await?; + + if result.len() != 1 { + return Err(anyhow!("statement can't be executed")); + } + Ok(()) +} diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs new file mode 100644 index 0000000000..bfdd2340ec --- /dev/null +++ b/compute_tools/src/compute.rs @@ -0,0 +1,346 @@ +// +// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, +// but there are several things that makes `PostgresNode` usage inconvenient in the +// cloud: +// - it inherits from `LocalEnv`, which contains **all-all** the information about +// a complete service running +// - it uses `PageServerNode` with information about http endpoint, which we do not +// need in the cloud again +// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud +// +// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required +// attributes (not required for the cloud). Yet, it is still tempting to unify these +// `PostgresNode` and `ComputeNode` and use one in both places. +// +// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. +// +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::Path; +use std::process::{Command, ExitStatus, Stdio}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::RwLock; + +use anyhow::{Context, Result}; +use chrono::{DateTime, Utc}; +use log::info; +use postgres::{Client, NoTls}; +use serde::{Serialize, Serializer}; + +use crate::checker::create_writablity_check_data; +use crate::config; +use crate::pg_helpers::*; +use crate::spec::*; + +/// Compute node info shared across several `compute_ctl` threads. 
+pub struct ComputeNode { + pub start_time: DateTime, + // Url type maintains proper escaping + pub connstr: url::Url, + pub pgdata: String, + pub pgbin: String, + pub spec: ComputeSpec, + pub tenant: String, + pub timeline: String, + pub pageserver_connstr: String, + pub metrics: ComputeMetrics, + /// Volatile part of the `ComputeNode` so should be used under `RwLock` + /// to allow HTTP API server to serve status requests, while configuration + /// is in progress. + pub state: RwLock, +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +impl ComputeState { + pub fn new() -> Self { + Self { + status: ComputeStatus::Init, + last_active: Utc::now(), + error: None, + } + } +} + +impl Default for ComputeState { + fn default() -> Self { + Self::new() + } +} + +#[derive(Serialize, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + Init, + Running, + Failed, +} + +#[derive(Serialize)] +pub struct ComputeMetrics { + pub sync_safekeepers_ms: AtomicU64, + pub basebackup_ms: AtomicU64, + pub config_ms: AtomicU64, + pub total_startup_ms: AtomicU64, +} + +impl ComputeMetrics { + pub fn new() -> Self { + Self { + sync_safekeepers_ms: AtomicU64::new(0), + basebackup_ms: AtomicU64::new(0), + config_ms: AtomicU64::new(0), + total_startup_ms: AtomicU64::new(0), + } + } +} + +impl Default for ComputeMetrics { + fn default() -> Self { + Self::new() + } +} + +impl ComputeNode { + pub fn set_status(&self, status: ComputeStatus) { + self.state.write().unwrap().status = status; + } + + pub fn get_status(&self) -> ComputeStatus { + self.state.read().unwrap().status + } + + // Remove `pgdata` directory and create it again with right permissions. + fn create_pgdata(&self) -> Result<()> { + // Ignore removal error, likely it is a 'No such file or directory (os error 2)'. + // If it is something different then create_dir() will error out anyway. + let _ok = fs::remove_dir_all(&self.pgdata); + fs::create_dir(&self.pgdata)?; + fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?; + + Ok(()) + } + + // Get basebackup from the libpq connection to pageserver using `connstr` and + // unarchive it to `pgdata` directory overriding all its previous content. + fn get_basebackup(&self, lsn: &str) -> Result<()> { + let start_time = Utc::now(); + + let mut client = Client::connect(&self.pageserver_connstr, NoTls)?; + let basebackup_cmd = match lsn { + "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute + _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + }; + let copyreader = client.copy_out(basebackup_cmd.as_str())?; + + // Read the archive directly from the `CopyOutReader` + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. 
+ let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; + + self.metrics.basebackup_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + Ok(()) + } + + // Run `postgres` in a special mode with `--sync-safekeepers` argument + // and return the reported LSN back to the caller. + fn sync_safekeepers(&self) -> Result { + let start_time = Utc::now(); + + let sync_handle = Command::new(&self.pgbin) + .args(&["--sync-safekeepers"]) + .env("PGDATA", &self.pgdata) // we cannot use -D in this mode + .stdout(Stdio::piped()) + .spawn() + .expect("postgres --sync-safekeepers failed to start"); + + // `postgres --sync-safekeepers` will print all log output to stderr and + // final LSN to stdout. So we pipe only stdout, while stderr will be automatically + // redirected to the caller output. + let sync_output = sync_handle + .wait_with_output() + .expect("postgres --sync-safekeepers failed"); + + if !sync_output.status.success() { + anyhow::bail!( + "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}", + sync_output.status, + String::from_utf8(sync_output.stdout) + .expect("postgres --sync-safekeepers exited, and stdout is not utf-8"), + ); + } + + self.metrics.sync_safekeepers_ms.store( + Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); + + Ok(lsn) + } + + /// Do all the preparations like PGDATA directory creation, configuration, + /// safekeepers sync, basebackup, etc. + pub fn prepare_pgdata(&self) -> Result<()> { + let spec = &self.spec; + let pgdata_path = Path::new(&self.pgdata); + + // Remove/create an empty pgdata directory and put configuration there. + self.create_pgdata()?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + + info!("starting safekeepers syncing"); + let lsn = self + .sync_safekeepers() + .with_context(|| "failed to sync safekeepers")?; + info!("safekeepers synced at LSN {}", lsn); + + info!( + "getting basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ); + self.get_basebackup(&lsn).with_context(|| { + format!( + "failed to get basebackup@{} from pageserver {}", + lsn, &self.pageserver_connstr + ) + })?; + + // Update pg_hba.conf received with basebackup. + update_pg_hba(pgdata_path)?; + + Ok(()) + } + + /// Start Postgres as a child process and manage DBs/roles. + /// After that this will hang waiting on the postmaster process to exit. + pub fn run(&self) -> Result { + let start_time = Utc::now(); + + let pgdata_path = Path::new(&self.pgdata); + + // Run postgres as a child process. + let mut pg = Command::new(&self.pgbin) + .args(&["-D", &self.pgdata]) + .spawn() + .expect("cannot start postgres process"); + + wait_for_postgres(&mut pg, pgdata_path)?; + + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin`name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
+ let mut client = match Client::connect(self.connstr.as_str(), NoTls) { + Err(e) => { + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = self.connstr.clone(); + + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + + let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); + + // reconnect with connsting with expected name + Client::connect(self.connstr.as_str(), NoTls)? + } + Ok(client) => client, + }; + + handle_roles(&self.spec, &mut client)?; + handle_databases(&self.spec, &mut client)?; + handle_role_deletions(self, &mut client)?; + handle_grants(self, &mut client)?; + create_writablity_check_data(&mut client)?; + + // 'Close' connection + drop(client); + let startup_end_time = Utc::now(); + + self.metrics.config_ms.store( + startup_end_time + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + self.metrics.total_startup_ms.store( + startup_end_time + .signed_duration_since(self.start_time) + .to_std() + .unwrap() + .as_millis() as u64, + Ordering::Relaxed, + ); + + self.set_status(ComputeStatus::Running); + + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + // Wait for child Postgres process basically forever. In this state Ctrl+C + // will propagate to Postgres and it will be shut down as well. + let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + + Ok(ecode) + } + + pub fn prepare_and_run(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + self.run() + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 22134db0f8..6cbd0e3d4c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::zenith::ClusterSpec; +use crate::spec::ComputeSpec; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { } /// Create or completely rewrite configuration file specified by `path` -pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> { +pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. 
let mut postgres_conf = File::create(path)?; - write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; + write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?; Ok(()) } // Write Postgres config block wrapped with generated comment section -fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> { - writeln!(file, "# Managed by Zenith: begin")?; +fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> { + writeln!(file, "# Managed by compute_ctl: begin")?; writeln!(file, "{}", buf)?; - writeln!(file, "# Managed by Zenith: end")?; + writeln!(file, "# Managed by compute_ctl: end")?; Ok(()) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs new file mode 100644 index 0000000000..4c8bbc608b --- /dev/null +++ b/compute_tools/src/http/api.rs @@ -0,0 +1,109 @@ +use std::convert::Infallible; +use std::net::SocketAddr; +use std::sync::Arc; +use std::thread; + +use anyhow::Result; +use hyper::service::{make_service_fn, service_fn}; +use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use log::{error, info}; +use serde_json; + +use crate::compute::{ComputeNode, ComputeStatus}; + +// Service function to handle all available routes. +async fn routes(req: Request, compute: Arc) -> Response { + match (req.method(), req.uri().path()) { + // Timestamp of the last Postgres activity in the plain text. + // DEPRECATED in favour of /status + (&Method::GET, "/last_activity") => { + info!("serving /last_active GET request"); + let state = compute.state.read().unwrap(); + + // Use RFC3339 format for consistency. + Response::new(Body::from(state.last_active.to_rfc3339())) + } + + // Has compute setup process finished? -> true/false. + // DEPRECATED in favour of /status + (&Method::GET, "/ready") => { + info!("serving /ready GET request"); + let status = compute.get_status(); + Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) + } + + // Serialized compute state. + (&Method::GET, "/status") => { + info!("serving /status GET request"); + let state = compute.state.read().unwrap(); + Response::new(Body::from(serde_json::to_string(&*state).unwrap())) + } + + // Startup metrics in JSON format. Keep /metrics reserved for a possible + // future use for Prometheus metrics format. + (&Method::GET, "/metrics.json") => { + info!("serving /metrics.json GET request"); + Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + } + + // DEPRECATED, use POST instead + (&Method::GET, "/check_writability") => { + info!("serving /check_writability GET request"); + let res = crate::checker::check_writability(&compute).await; + match res { + Ok(_) => Response::new(Body::from("true")), + Err(e) => Response::new(Body::from(e.to_string())), + } + } + + (&Method::POST, "/check_writability") => { + info!("serving /check_writability POST request"); + let res = crate::checker::check_writability(&compute).await; + match res { + Ok(_) => Response::new(Body::from("true")), + Err(e) => Response::new(Body::from(e.to_string())), + } + } + + // Return the `404 Not Found` for any other routes. + _ => { + let mut not_found = Response::new(Body::from("404 Not Found")); + *not_found.status_mut() = StatusCode::NOT_FOUND; + not_found + } + } +} + +// Main Hyper HTTP server function that runs it and blocks waiting on it forever. 
+#[tokio::main] +async fn serve(state: Arc) { + let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); + + let make_service = make_service_fn(move |_conn| { + let state = state.clone(); + async move { + Ok::<_, Infallible>(service_fn(move |req: Request| { + let state = state.clone(); + async move { Ok::<_, Infallible>(routes(req, state).await) } + })) + } + }); + + info!("starting HTTP server on {}", addr); + + let server = Server::bind(&addr).serve(make_service); + + // Run this server forever + if let Err(e) = server.await { + error!("server error: {}", e); + } +} + +/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. +pub fn launch_http_server(state: &Arc) -> Result> { + let state = Arc::clone(state); + + Ok(thread::Builder::new() + .name("http-endpoint".into()) + .spawn(move || serve(state))?) +} diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs new file mode 100644 index 0000000000..e5fdf85eed --- /dev/null +++ b/compute_tools/src/http/mod.rs @@ -0,0 +1 @@ +pub mod api; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..9c0f8e3ccd --- /dev/null +++ b/compute_tools/src/http/openapi_spec.yaml @@ -0,0 +1,158 @@ +openapi: "3.0.2" +info: + title: Compute node control API + version: "1.0" + +servers: + - url: "http://localhost:3080" + +paths: + /status: + get: + tags: + - "info" + summary: Get compute node internal status + description: "" + operationId: getComputeStatus + responses: + "200": + description: ComputeState + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + + /metrics.json: + get: + tags: + - "info" + summary: Get compute node startup metrics in JSON format + description: "" + operationId: getComputeMetricsJSON + responses: + "200": + description: ComputeMetrics + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeMetrics" + + /ready: + get: + deprecated: true + tags: + - "info" + summary: Check whether compute startup process finished successfully + description: "" + operationId: computeIsReady + responses: + "200": + description: Compute is ready ('true') or not ('false') + content: + text/plain: + schema: + type: string + example: "true" + + /last_activity: + get: + deprecated: true + tags: + - "info" + summary: Get timestamp of the last compute activity + description: "" + operationId: getLastComputeActivityTS + responses: + "200": + description: Timestamp of the last compute activity + content: + text/plain: + schema: + type: string + example: "2022-10-12T07:20:50.52Z" + + /check_writability: + get: + deprecated: true + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritabilityDeprecated + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + + post: + tags: + - "check" + summary: Check that we can write new data on this compute + description: "" + operationId: checkComputeWritability + responses: + "200": + description: Check result + content: + text/plain: + schema: + type: string + description: Error text or 'true' if check passed + example: "true" + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + schemas: + ComputeMetrics: + type: object + description: Compute startup metrics + required: + - sync_safekeepers_ms + - basebackup_ms + - config_ms + 
- total_startup_ms + properties: + sync_safekeepers_ms: + type: integer + basebackup_ms: + type: integer + config_ms: + type: integer + total_startup_ms: + type: integer + + ComputeState: + type: object + required: + - status + - last_active + properties: + status: + $ref: '#/components/schemas/ComputeStatus' + last_active: + type: string + description: The last detected compute activity timestamp in UTC and RFC3339 format + example: "2022-10-12T07:20:50.52Z" + error: + type: string + description: Text of the error during compute startup, if any + + ComputeStatus: + type: string + enum: + - init + - failed + - running + +security: + - JWT: [] diff --git a/compute_tools/src/http_api.rs b/compute_tools/src/http_api.rs deleted file mode 100644 index 02fab08a6e..0000000000 --- a/compute_tools/src/http_api.rs +++ /dev/null @@ -1,73 +0,0 @@ -use std::convert::Infallible; -use std::net::SocketAddr; -use std::sync::{Arc, RwLock}; -use std::thread; - -use anyhow::Result; -use hyper::service::{make_service_fn, service_fn}; -use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use log::{error, info}; - -use crate::zenith::*; - -// Service function to handle all available routes. -fn routes(req: Request, state: Arc>) -> Response { - match (req.method(), req.uri().path()) { - // Timestamp of the last Postgres activity in the plain text. - (&Method::GET, "/last_activity") => { - info!("serving /last_active GET request"); - let state = state.read().unwrap(); - - // Use RFC3339 format for consistency. - Response::new(Body::from(state.last_active.to_rfc3339())) - } - - // Has compute setup process finished? -> true/false - (&Method::GET, "/ready") => { - info!("serving /ready GET request"); - let state = state.read().unwrap(); - Response::new(Body::from(format!("{}", state.ready))) - } - - // Return the `404 Not Found` for any other routes. - _ => { - let mut not_found = Response::new(Body::from("404 Not Found")); - *not_found.status_mut() = StatusCode::NOT_FOUND; - not_found - } - } -} - -// Main Hyper HTTP server function that runs it and blocks waiting on it forever. -#[tokio::main] -async fn serve(state: Arc>) { - let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); - - let make_service = make_service_fn(move |_conn| { - let state = state.clone(); - async move { - Ok::<_, Infallible>(service_fn(move |req: Request| { - let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state)) } - })) - } - }); - - info!("starting HTTP server on {}", addr); - - let server = Server::bind(&addr).serve(make_service); - - // Run this server forever - if let Err(e) = server.await { - error!("server error: {}", e); - } -} - -/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. -pub fn launch_http_server(state: &Arc>) -> Result> { - let state = Arc::clone(state); - - Ok(thread::Builder::new() - .name("http-endpoint".into()) - .spawn(move || serve(state))?) -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 592011d95e..aee6b53e6a 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -2,12 +2,13 @@ //! Various tools and helpers to handle cluster / compute node (Postgres) //! configuration. //! 
+pub mod checker; pub mod config; -pub mod http_api; +pub mod http; #[macro_use] pub mod logger; +pub mod compute; pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; -pub mod zenith; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 596981b2d2..58cdf796bc 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -1,4 +1,4 @@ -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::{thread, time}; use anyhow::Result; @@ -6,18 +6,18 @@ use chrono::{DateTime, Utc}; use log::{debug, info}; use postgres::{Client, NoTls}; -use crate::zenith::ComputeState; +use crate::compute::ComputeNode; const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. // XXX: the only expected panic is at `RwLock` unwrap(). -fn watch_compute_activity(state: &Arc>) { +fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = state.read().unwrap().connstr.clone(); + let connstr = compute.connstr.as_str(); // Define `client` outside of the loop to reuse existing connection if it's active. - let mut client = Client::connect(&connstr, NoTls); + let mut client = Client::connect(connstr, NoTls); let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL); info!("watching Postgres activity at {}", connstr); @@ -32,7 +32,7 @@ fn watch_compute_activity(state: &Arc>) { info!("connection to postgres closed, trying to reconnect"); // Connection is closed, reconnect and try again. - client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); continue; } @@ -43,10 +43,10 @@ fn watch_compute_activity(state: &Arc>) { FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid != pg_backend_pid() - AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? + AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = state.read().unwrap().last_active; + let mut last_active = compute.state.read().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -83,24 +83,24 @@ fn watch_compute_activity(state: &Arc>) { } // Update the last activity in the shared state if we got a more recent one. - let mut state = state.write().unwrap(); + let mut state = compute.state.write().unwrap(); if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {}", last_active); } } Err(e) => { - info!("cannot connect to postgres: {}, retrying", e); + debug!("cannot connect to postgres: {}, retrying", e); // Establish a new connection and try again. - client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); } } } } /// Launch a separate compute monitor thread and return its `JoinHandle`. 
-pub fn launch_monitor(state: &Arc>) -> Result> { +pub fn launch_monitor(state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8b6dc04069..289f223bda 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,15 +1,18 @@ -use std::net::{SocketAddr, TcpStream}; +use std::fmt::Write; +use std::fs; +use std::fs::File; +use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::Command; -use std::str::FromStr; -use std::{fs, thread, time}; +use std::process::Child; +use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; use serde::Deserialize; -const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds +const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds /// Rust representation of Postgres role info with only those fields /// that matter for us. @@ -59,9 +62,16 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { + let name = match self.name.as_str() { + "safekeepers" => "neon.safekeepers", + "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", + "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", + it => it, + }; + match self.vartype.as_ref() { - "string" => format!("{} = '{}'", self.name, val), - _ => format!("{} = {}", self.name, val), + "string" => format!("{} = '{}'", name, val), + _ => format!("{} = {}", name, val), } } else { self.name.to_owned() @@ -132,7 +142,16 @@ impl Role { let mut params: String = "LOGIN".to_string(); if let Some(pass) = &self.encrypted_password { - params.push_str(&format!(" PASSWORD 'md5{}'", pass)); + // Some time ago we supported only md5 and treated all encrypted_password as md5. + // Now we also support SCRAM-SHA-256 and to preserve compatibility + // we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256. + if pass.starts_with("SCRAM-SHA-256") { + write!(params, " PASSWORD '{pass}'") + .expect("String is documented to not to error during write operations"); + } else { + write!(params, " PASSWORD 'md5{pass}'") + .expect("String is documented to not to error during write operations"); + } } else { params.push_str(" PASSWORD NULL"); } @@ -149,7 +168,8 @@ impl Database { /// it may require a proper quoting too. pub fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); - params.push_str(&format!(" OWNER {}", &self.owner.quote())); + write!(params, " OWNER {}", &self.owner.pg_quote()) + .expect("String is documented to not to error during write operations"); params } @@ -159,19 +179,18 @@ impl Database { /// intended to be used for DB / role names. pub type PgIdent = String; -/// Generic trait used to provide quoting for strings used in the -/// Postgres SQL queries. Currently used only to implement quoting -/// of identifiers, but could be used for literals in the future. -pub trait PgQuote { - fn quote(&self) -> String; +/// Generic trait used to provide quoting / encoding for strings used in the +/// Postgres SQL queries and DATABASE_URL. 
+pub trait Escaping { + fn pg_quote(&self) -> String; } -impl PgQuote for PgIdent { +impl Escaping for PgIdent { /// This is intended to mimic Postgres quote_ident(), but for simplicity it - /// always quotes provided string with `""` and escapes every `"`. Not idempotent, - /// i.e. if string is already escaped it will be escaped again. - fn quote(&self) -> String { - let result = format!("\"{}\"", self.replace("\"", "\"\"")); + /// always quotes provided string with `""` and escapes every `"`. + /// **Not idempotent**, i.e. if string is already escaped it will be escaped again. + fn pg_quote(&self) -> String { + let result = format!("\"{}\"", self.replace('"', "\"\"")); result } } @@ -210,45 +229,112 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { Ok(postgres_dbs) } -/// Wait for Postgres to become ready to accept connections: -/// - state should be `ready` in the `pgdata/postmaster.pid` -/// - and we should be able to connect to 127.0.0.1:5432 -pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> { +/// Wait for Postgres to become ready to accept connections. It's ready to +/// accept connections when the state-field in `pgdata/postmaster.pid` says +/// 'ready'. +pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); - let mut slept: u64 = 0; // ms - let pause = time::Duration::from_millis(100); - let timeout = time::Duration::from_millis(200); - let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap(); + // PostgreSQL writes line "ready" to the postmaster.pid file, when it has + // completed initialization and is ready to accept connections. We want to + // react quickly and perform the rest of our initialization as soon as + // PostgreSQL starts accepting connections. Use 'notify' to be notified + // whenever the PID file is changed, and whenever it changes, read it to + // check if it's now "ready". + // + // You cannot actually watch a file before it exists, so we first watch the + // data directory, and once the postmaster.pid file appears, we switch to + // watch the file instead. We also wake up every 100 ms to poll, just in + // case we miss some events for some reason. Not strictly necessary, but + // better safe than sorry. + let (tx, rx) = std::sync::mpsc::channel(); + let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { + let _ = tx.send(res); + }) { + Ok(watcher) => (Box::new(watcher), rx), + Err(e) => { + match e.kind { + notify::ErrorKind::Io(os) if os.raw_os_error() == Some(38) => { + // docker on m1 macs does not support recommended_watcher + // but return "Function not implemented (os error 38)" + // see https://github.com/notify-rs/notify/issues/423 + let (tx, rx) = std::sync::mpsc::channel(); + // let's poll it faster than what we check the results for (100ms) + let config = + notify::Config::default().with_poll_interval(Duration::from_millis(50)); + + let watcher = notify::PollWatcher::new( + move |res| { + let _ = tx.send(res); + }, + config, + )?; + + (Box::new(watcher), rx) + } + _ => return Err(e.into()), + } + } + }; + + watcher.watch(pgdata, RecursiveMode::NonRecursive)?; + + let started_at = Instant::now(); + let mut postmaster_pid_seen = false; loop { - // Sleep POSTGRES_WAIT_TIMEOUT at max (a bit longer actually if consider a TCP timeout, - // but postgres starts listening almost immediately, even if it is not really - // ready to accept connections). 
- if slept >= POSTGRES_WAIT_TIMEOUT { - bail!("timed out while waiting for Postgres to start"); + if let Ok(Some(status)) = pg.try_wait() { + // Postgres exited, that is not what we expected, bail out earlier. + let code = status.code().unwrap_or(-1); + bail!("Postgres exited unexpectedly with code {}", code); } - if pid_path.exists() { - // XXX: dumb and the simplest way to get the last line in a text file - // TODO: better use `.lines().last()` later - let stdout = Command::new("tail") - .args(&["-n1", pid_path.to_str().unwrap()]) - .output()? - .stdout; - let status = String::from_utf8(stdout)?; - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + let res = rx.recv_timeout(Duration::from_millis(100)); + log::debug!("woken up by notify: {res:?}"); + // If there are multiple events in the channel already, we only need to be + // check once. Swallow the extra events before we go ahead to check the + // pid file. + while let Ok(res) = rx.try_recv() { + log::debug!("swallowing extra event: {res:?}"); + } - // Now Postgres is ready to accept connections - if status.trim() == "ready" && can_connect { - break; + // Check that we can open pid file first. + if let Ok(file) = File::open(&pid_path) { + if !postmaster_pid_seen { + log::debug!("postmaster.pid appeared"); + watcher + .unwatch(pgdata) + .expect("Failed to remove pgdata dir watch"); + watcher + .watch(&pid_path, RecursiveMode::NonRecursive) + .expect("Failed to add postmaster.pid file watch"); + postmaster_pid_seen = true; + } + + let file = BufReader::new(file); + let last_line = file.lines().last(); + + // Pid file could be there and we could read it, but it could be empty, for example. + if let Some(Ok(line)) = last_line { + let status = line.trim(); + log::debug!("last line of postmaster.pid: {status:?}"); + + // Now Postgres is ready to accept connections + if status == "ready" { + break; + } } } - thread::sleep(pause); - slept += 100; + // Give up after POSTGRES_WAIT_TIMEOUT. + let duration = started_at.elapsed(); + if duration >= POSTGRES_WAIT_TIMEOUT { + bail!("timed out while waiting for Postgres to start"); + } } + log::info!("PostgreSQL is now running, continuing to configure it"); + Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 41e4174bf0..58c94d74ae 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,18 +1,58 @@ use std::path::Path; +use std::str::FromStr; use anyhow::Result; use log::{info, log_enabled, warn, Level}; -use postgres::Client; +use postgres::config::Config; +use postgres::{Client, NoTls}; +use serde::Deserialize; +use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -use crate::zenith::ClusterSpec; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. +#[derive(Clone, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + pub timestamp: String, + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, +} + +/// Cluster state seen from the perspective of the external tools +/// like Rails web console. 
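+///
+/// See `compute_tools/tests/cluster_spec.json` for an example of the JSON that
+/// this struct (together with `ComputeSpec` above) is deserialized from.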
+#[derive(Clone, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} /// It takes cluster specification and does the following: /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. /// - Update `pg_hba.conf` to allow external connections. -pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> { +pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { // File `postgresql.conf` is no longer included into `basebackup`, so just // always write all config into it creating new file. config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; @@ -39,7 +79,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. -pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; @@ -60,18 +100,13 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { // Process delta operations first if let Some(ops) = &spec.delta_operations { - info!("processing delta operations on roles"); + info!("processing role renames"); for op in ops { match op.action.as_ref() { - // We do not check either role exists or not, - // Postgres will take care of it for us "delete_role" => { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); - - warn!("deleting role '{}'", &op.name); - xact.execute(query.as_str(), &[])?; + // no-op now, roles will be deleted at the end of configuration } - // Renaming role drops its password, since tole name is + // Renaming role drops its password, since role name is // used as a salt there. It is important that this role // is recorded with a new `name` in the `roles` list. // Follow up roles update will set the new password. 
@@ -82,8 +117,8 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if existing_roles.iter().any(|r| r.name == op.name) { let query: String = format!( "ALTER ROLE {} RENAME TO {}", - op.name.quote(), - new_name.quote() + op.name.pg_quote(), + new_name.pg_quote() ); warn!("renaming role '{}' to '{}'", op.name, new_name); @@ -129,20 +164,27 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { } if update_role { - let mut query: String = format!("ALTER ROLE {} ", name.quote()); + let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); info_print!(" -> update"); query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } } else { - info!("role name {}", &name); - let mut query: String = format!("CREATE ROLE {} ", name.quote()); - info!("role create query {}", &query); + info!("role name: '{}'", &name); + let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); + info!("role create query: '{}'", &query); info_print!(" -> create"); query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; + + let grant_query = format!( + "GRANT pg_read_all_data, pg_write_all_data TO {}", + name.pg_quote() + ); + xact.execute(grant_query.as_str(), &[])?; + info!("role grant query: '{}'", &grant_query); } info_print!("\n"); @@ -153,12 +195,75 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> { Ok(()) } +/// Reassign all dependent objects and delete requested roles. +pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + + // First, reassign all dependent objects to db owners. + if let Some(ops) = &spec.delta_operations { + info!("reassigning dependent objects of to-be-deleted roles"); + for op in ops { + if op.action == "delete_role" { + reassign_owned_objects(node, &op.name)?; + } + } + } + + // Second, proceed with role deletions. + let mut xact = client.transaction()?; + if let Some(ops) = &spec.delta_operations { + info!("processing role deletions"); + for op in ops { + // We do not check either role exists or not, + // Postgres will take care of it for us + if op.action == "delete_role" { + let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote()); + + warn!("deleting role '{}'", &op.name); + xact.execute(query.as_str(), &[])?; + } + } + } + + Ok(()) +} + +// Reassign all owned objects in all databases to the owner of the database. +fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { + for db in &node.spec.cluster.databases { + if db.owner != *role_name { + let mut conf = Config::from_str(node.connstr.as_str())?; + conf.dbname(&db.name); + + let mut client = conf.connect(NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.pg_quote(), + db.owner.pg_quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, &db.name, &db.owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); + client.simple_query(&drop_query)?; + } + } + + Ok(()) +} + /// It follows mostly the same logic as `handle_roles()` excepting that we /// does not use an explicit transactions block, since major database operations /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. 
Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. -pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { +pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) @@ -175,7 +280,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { // We do not check either DB exists or not, // Postgres will take care of it for us "delete_db" => { - let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.quote()); + let query: String = format!("DROP DATABASE IF EXISTS {}", &op.name.pg_quote()); warn!("deleting database '{}'", &op.name); client.execute(query.as_str(), &[])?; @@ -187,8 +292,8 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if existing_dbs.iter().any(|r| r.name == op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", - op.name.quote(), - new_name.quote() + op.name.pg_quote(), + new_name.pg_quote() ); warn!("renaming database '{}' to '{}'", op.name, new_name); @@ -215,8 +320,8 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. - let new_owner = if r.owner.starts_with('\"') { - db.owner.quote() + let new_owner = if r.owner.starts_with('"') { + db.owner.pg_quote() } else { db.owner.clone() }; @@ -224,15 +329,15 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if new_owner != r.owner { let query: String = format!( "ALTER DATABASE {} OWNER TO {}", - name.quote(), - db.owner.quote() + name.pg_quote(), + db.owner.pg_quote() ); info_print!(" -> update"); client.execute(query.as_str(), &[])?; } } else { - let mut query: String = format!("CREATE DATABASE {} ", name.quote()); + let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); info_print!(" -> create"); query.push_str(&db.to_pg_options()); @@ -244,3 +349,107 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { Ok(()) } + +/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants +/// to allow users creating trusted extensions and re-creating `public` schema, for example. +pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + + info!("cluster spec grants:"); + + // We now have a separate `web_access` role to connect to the database + // via the web interface and proxy link auth. And also we grant a + // read / write all data privilege to every role. So also grant + // create to everyone. + // XXX: later we should stop messing with Postgres ACL in such horrible + // ways. + let roles = spec + .cluster + .roles + .iter() + .map(|r| r.name.pg_quote()) + .collect::>(); + + for db in &spec.cluster.databases { + let dbname = &db.name; + + let query: String = format!( + "GRANT CREATE ON DATABASE {} TO {}", + dbname.pg_quote(), + roles.join(", ") + ); + info!("grant query {}", &query); + + client.execute(query.as_str(), &[])?; + } + + // Do some per-database access adjustments. We'd better do this at db creation time, + // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants + // atomically. 
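+ // Concretely (see below): hand the `public` schema over to the database owner
+ // when it is still owned by cloud_admin / zenith_admin, and on Postgres 15+
+ // re-grant CREATE ON SCHEMA public to the web_access role, since that default
+ // grant was removed in Postgres 15.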
+ for db in &node.spec.cluster.databases { + let mut conf = Config::from_str(node.connstr.as_str())?; + conf.dbname(&db.name); + + let mut db_client = conf.connect(NoTls)?; + + // This will only change ownership on the schema itself, not the objects + // inside it. Without it owner of the `public` schema will be `cloud_admin` + // and database owner cannot do anything with it. SQL procedure ensures + // that it won't error out if schema `public` doesn't exist. + let alter_query = format!( + "DO $$\n\ + DECLARE\n\ + schema_owner TEXT;\n\ + BEGIN\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + SELECT nspowner::regrole::text\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + INTO schema_owner;\n\ + \n\ + IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\ + THEN\n\ + ALTER SCHEMA public OWNER TO {};\n\ + END IF;\n\ + END IF;\n\ + END\n\ + $$;", + db.owner.pg_quote() + ); + db_client.simple_query(&alter_query)?; + + // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // This is needed because since postgres 15 this privilege is removed by default. + let grant_query = "DO $$\n\ + BEGIN\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + ) AND\n\ + current_setting('server_version_num')::int/10000 >= 15\n\ + THEN\n\ + IF EXISTS(\n\ + SELECT rolname\n\ + FROM pg_catalog.pg_roles\n\ + WHERE rolname = 'web_access'\n\ + )\n\ + THEN\n\ + GRANT CREATE ON SCHEMA public TO web_access;\n\ + END IF;\n\ + END IF;\n\ + END\n\ + $$;" + .to_string(); + + info!("grant query for db {} : {}", &db.name, &grant_query); + db_client.simple_query(&grant_query)?; + } + + Ok(()) +} diff --git a/compute_tools/src/zenith.rs b/compute_tools/src/zenith.rs deleted file mode 100644 index ba7dc20787..0000000000 --- a/compute_tools/src/zenith.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::process::{Command, Stdio}; - -use anyhow::Result; -use chrono::{DateTime, Utc}; -use postgres::{Client, NoTls}; -use serde::Deserialize; - -use crate::pg_helpers::*; - -/// Compute node state shared across several `zenith_ctl` threads. -/// Should be used under `RwLock` to allow HTTP API server to serve -/// status requests, while configuration is in progress. -pub struct ComputeState { - pub connstr: String, - pub pgdata: String, - pub pgbin: String, - pub spec: ClusterSpec, - /// Compute setup process has finished - pub ready: bool, - /// Timestamp of the last Postgres activity - pub last_active: DateTime, -} - -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] -pub struct ClusterSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. - pub cluster: Cluster, - pub delta_operations: Option>, -} - -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} - -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. 
For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, -} - -/// Get basebackup from the libpq connection to pageserver using `connstr` and -/// unarchive it to `pgdata` directory overriding all its previous content. -pub fn get_basebackup( - pgdata: &str, - connstr: &str, - tenant: &str, - timeline: &str, - lsn: &str, -) -> Result<()> { - let mut client = Client::connect(connstr, NoTls)?; - let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute - _ => format!("basebackup {} {} {}", tenant, timeline, lsn), - }; - let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); - - ar.unpack(&pgdata)?; - - Ok(()) -} - -/// Run `postgres` in a special mode with `--sync-safekeepers` argument -/// and return the reported LSN back to the caller. -pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result { - let sync_handle = Command::new(&pgbin) - .args(&["--sync-safekeepers"]) - .env("PGDATA", &pgdata) // we cannot use -D in this mode - .stdout(Stdio::piped()) - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - // `postgres --sync-safekeepers` will print all log output to stderr and - // final LSN to stdout. So we pipe only stdout, while stderr will be automatically - // redirected to the caller output. - let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "postgres --sync-safekeepers exited with non-zero status: {}", - sync_output.status, - ); - } - - let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); - - Ok(lsn) -} diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 4a1672919c..c29416d9c4 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "wal_acceptors", + "name": "neon.safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "zenith.zenith_tenant", + "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,13 +160,13 @@ "vartype": "integer" }, { - "name": "zenith.zenith_timeline", + "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", - "value": "zenith", + "value": "neon", "vartype": "string" }, { @@ -175,13 +175,12 @@ "vartype": "string" }, { - "name": "zenith.page_server_connstring", + "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" } ] }, - "delta_operations": [ { "action": "delete_db", diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 472a49af4b..24cad4663a 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -4,12 +4,12 @@ mod pg_helpers_tests { use std::fs::File; use compute_tools::pg_helpers::*; - use compute_tools::zenith::ClusterSpec; + use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + 
let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.databases.first().unwrap().to_pg_options(), @@ -24,18 +24,18 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { let file = File::open("tests/cluster_spec.json").unwrap(); - let spec: ClusterSpec = serde_json::from_reader(file).unwrap(); + let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } #[test] - fn quote_ident() { + fn ident_pg_quote() { let ident: PgIdent = PgIdent::from("\"name\";\\n select 1;"); - assert_eq!(ident.quote(), "\"\"\"name\"\";\\n select 1;\""); + assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 5e972200c2..a9d30b4a86 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,18 +4,25 @@ version = "0.1.0" edition = "2021" [dependencies] -tar = "0.4.33" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -serde = { version = "1.0", features = ["derive"] } -toml = "0.5" -lazy_static = "1.4" -regex = "1" anyhow = "1.0" -thiserror = "1" -nix = "0.23" -url = "2.2.2" +clap = "4.0" +comfy-table = "6.1" +git-version = "0.3.5" +nix = "0.25" +once_cell = "1.13.0" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +regex = "1" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +serde = { version = "1.0", features = ["derive"] } +serde_with = "2.0" +tar = "0.4.38" +thiserror = "1" +toml = "0.5" +url = "2.2.2" -pageserver = { path = "../pageserver" } -zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } +# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api +# instead, so that recompile times are better. 
+pageserver_api = { path = "../libs/pageserver_api" } +safekeeper_api = { path = "../libs/safekeeper_api" } +utils = { path = "../libs/utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index 828d5a5a1e..df7dd2adca 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -5,16 +5,16 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'sk1' +id = 1 pg_port = 5454 http_port = 7676 [[safekeepers]] -name = 'sk2' +id = 2 pg_port = 5455 http_port = 7677 [[safekeepers]] -name = 'sk3' +id = 3 pg_port = 5456 http_port = 7678 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 796c6adbd9..ae60657400 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -1,4 +1,4 @@ -# Minimal zenith environment with one safekeeper. This is equivalent to the built-in +# Minimal neon environment with one safekeeper. This is equivalent to the built-in # defaults that you get with no --config [pageserver] listen_pg_addr = '127.0.0.1:64000' @@ -6,6 +6,9 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'single' +id = 1 pg_port = 5454 http_port = 7676 + +[etcd_broker] +broker_endpoints = ['http://127.0.0.1:2379'] diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs new file mode 100644 index 0000000000..2f8098b7c9 --- /dev/null +++ b/control_plane/src/background_process.rs @@ -0,0 +1,264 @@ +//! Spawns and kills background processes that are needed by Neon CLI. +//! Applies common set-up such as log and pid files (if needed) to every process. +//! +//! Neon CLI does not run in background, so it needs to store the information about +//! spawned processes, which it does in this module. +//! We do that by storing the pid of the process in the "${process_name}.pid" file. +//! The pid file can be created by the process itself +//! (Neon storage binaries do that and also ensure that a lock is taken onto that file) +//! or we create such file after starting the process +//! (non-Neon binaries don't necessarily follow our pidfile conventions). +//! The pid stored in the file is later used to stop the service. +//! +//! See [`lock_file`] module for more info. + +use std::ffi::OsStr; +use std::io::Write; +use std::path::Path; +use std::process::{Child, Command}; +use std::time::Duration; +use std::{fs, io, thread}; + +use anyhow::{anyhow, bail, Context, Result}; +use nix::errno::Errno; +use nix::sys::signal::{kill, Signal}; +use nix::unistd::Pid; + +use utils::lock_file; + +const RETRIES: u32 = 15; +const RETRY_TIMEOUT_MILLIS: u64 = 500; + +/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates +/// it itself. +pub enum InitialPidFile<'t> { + /// Create a pidfile, to allow future CLI invocations to manipulate the process. + Create(&'t Path), + /// The process will create the pidfile itself, need to wait for that event. + Expect(&'t Path), +} + +/// Start a background child process using the parameters given. 
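+///
+/// Stdout and stderr of the child are appended to `{datadir}/{process_name}.log`.
+/// Depending on `initial_pid_file`, the pid file is either written here or expected
+/// to appear, created by the process itself. Startup is then confirmed by polling
+/// `process_status_check` (and the pid file) up to `RETRIES` times, sleeping
+/// `RETRY_TIMEOUT_MILLIS` between attempts.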
+pub fn start_process>( + process_name: &str, + datadir: &Path, + command: &Path, + args: &[S], + initial_pid_file: InitialPidFile, + process_status_check: F, +) -> anyhow::Result +where + F: Fn() -> anyhow::Result, +{ + let log_path = datadir.join(format!("{process_name}.log")); + let process_log_file = fs::OpenOptions::new() + .create(true) + .write(true) + .append(true) + .open(&log_path) + .with_context(|| { + format!("Could not open {process_name} log file {log_path:?} for writing") + })?; + let same_file_for_stderr = process_log_file.try_clone().with_context(|| { + format!("Could not reuse {process_name} log file {log_path:?} for writing stderr") + })?; + + let mut command = Command::new(command); + let background_command = command + .stdout(process_log_file) + .stderr(same_file_for_stderr) + .args(args); + let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command)); + + let mut spawned_process = filled_cmd.spawn().with_context(|| { + format!("Could not spawn {process_name}, see console output and log files for details.") + })?; + let pid = spawned_process.id(); + let pid = Pid::from_raw( + i32::try_from(pid) + .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, + ); + + let pid_file_to_check = match initial_pid_file { + InitialPidFile::Create(target_pid_file_path) => { + match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) { + lock_file::LockCreationResult::Created { .. } => { + // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon + // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either. + } + lock_file::LockCreationResult::AlreadyLocked { .. } => { + anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process") + } + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!( + "Failed to create pid file for {process_name} at path {target_pid_file_path:?}" + ))) + } + } + None + } + InitialPidFile::Expect(pid_file_path) => Some(pid_file_path), + }; + + for retries in 0..RETRIES { + match process_started(pid, pid_file_to_check, &process_status_check) { + Ok(true) => { + println!("\n{process_name} started, pid: {pid}"); + return Ok(spawned_process); + } + Ok(false) => { + if retries < 5 { + print!("."); + io::stdout().flush().unwrap(); + } else { + if retries == 5 { + println!() // put a line break after dots for second message + } + println!("{process_name} has not started yet, retrying ({retries})..."); + } + thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS)); + } + Err(e) => { + println!("{process_name} failed to start: {e:#}"); + if let Err(e) = spawned_process.kill() { + println!("Could not stop {process_name} subprocess: {e:#}") + }; + return Err(e); + } + } + } + anyhow::bail!("{process_name} could not start in {RETRIES} attempts"); +} + +/// Stops the process, using the pid file given. Returns Ok also if the process is already not running. 
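+///
+/// `immediate` selects the signal: SIGQUIT for an immediate stop, SIGTERM for a
+/// graceful one. After signalling, we poll `kill(pid, None)` for up to `RETRIES`
+/// seconds until the process is gone, then remove the pid file.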
+pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { + if !pid_file.exists() { + println!("{process_name} is already stopped: no pid file {pid_file:?} is present"); + return Ok(()); + } + let pid = read_pidfile(pid_file)?; + + let sig = if immediate { + print!("Stopping {process_name} with pid {pid} immediately.."); + Signal::SIGQUIT + } else { + print!("Stopping {process_name} with pid {pid} gracefully.."); + Signal::SIGTERM + }; + io::stdout().flush().unwrap(); + match kill(pid, sig) { + Ok(()) => (), + Err(Errno::ESRCH) => { + println!( + "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found" + ); + return Ok(()); + } + Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"), + } + + // Wait until process is gone + for _ in 0..RETRIES { + match process_has_stopped(pid) { + Ok(true) => { + println!("\n{process_name} stopped"); + if let Err(e) = fs::remove_file(pid_file) { + if e.kind() != io::ErrorKind::NotFound { + eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}"); + } + } + return Ok(()); + } + Ok(false) => { + print!("."); + io::stdout().flush().unwrap(); + thread::sleep(Duration::from_secs(1)) + } + Err(e) => { + println!("{process_name} with pid {pid} failed to stop: {e:#}"); + return Err(e); + } + } + } + + anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts"); +} + +fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { + let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1"); + + let var = "LLVM_PROFILE_FILE"; + if let Some(val) = std::env::var_os(var) { + filled_cmd = filled_cmd.env(var, val); + } + + const RUST_LOG_KEY: &str = "RUST_LOG"; + if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) { + filled_cmd.env(RUST_LOG_KEY, rust_log_value) + } else { + filled_cmd + } +} + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in [ + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_SESSION_TOKEN", + ] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} + +fn process_started( + pid: Pid, + pid_file_to_check: Option<&Path>, + status_check: &F, +) -> anyhow::Result +where + F: Fn() -> anyhow::Result, +{ + match status_check() { + Ok(true) => match pid_file_to_check { + Some(pid_file_path) => { + if pid_file_path.exists() { + let pid_in_file = read_pidfile(pid_file_path)?; + Ok(pid_in_file == pid) + } else { + Ok(false) + } + } + None => Ok(true), + }, + Ok(false) => Ok(false), + Err(e) => anyhow::bail!("process failed to start: {e}"), + } +} + +/// Read a PID file +/// +/// We expect a file that contains a single integer. 
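+/// Anything that cannot be read, cannot be parsed as an integer, or is less
+/// than 1 is reported as an error.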
+fn read_pidfile(pidfile: &Path) -> Result { + let pid_str = fs::read_to_string(pidfile) + .with_context(|| format!("failed to read pidfile {pidfile:?}"))?; + let pid: i32 = pid_str + .parse() + .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?; + if pid < 1 { + bail!("pidfile {pidfile:?} contained bad value '{pid}'"); + } + Ok(Pid::from_raw(pid)) +} + +fn process_has_stopped(pid: Pid) -> anyhow::Result { + match kill(pid, None) { + // Process exists, keep waiting + Ok(_) => Ok(false), + // Process not found, we're done + Err(Errno::ESRCH) => Ok(true), + Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"), + } +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs new file mode 100644 index 0000000000..42a9199037 --- /dev/null +++ b/control_plane/src/bin/neon_local.rs @@ -0,0 +1,1082 @@ +//! +//! `neon_local` is an executable that can be used to create a local +//! Neon environment, for testing purposes. The local environment is +//! quite different from the cloud environment with Kubernetes, but it +//! easier to work with locally. The python tests in `test_runner` +//! rely on `neon_local` to set up the environment for each test. +//! +use anyhow::{anyhow, bail, Context, Result}; +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use control_plane::compute::ComputeControlPlane; +use control_plane::local_env::{EtcdBroker, LocalEnv}; +use control_plane::pageserver::PageServerNode; +use control_plane::safekeeper::SafekeeperNode; +use control_plane::{etcd, local_env}; +use pageserver_api::models::TimelineInfo; +use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, + DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, +}; +use safekeeper_api::{ + DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, +}; +use std::collections::{BTreeSet, HashMap}; +use std::path::{Path, PathBuf}; +use std::process::exit; +use std::str::FromStr; +use utils::{ + auth::{Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, + postgres_backend::AuthType, + project_git_version, +}; + +// Default id of a safekeeper node, if not specified on the command line. +const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); +const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); +const DEFAULT_BRANCH_NAME: &str = "main"; +project_git_version!(GIT_VERSION); + +const DEFAULT_PG_VERSION: &str = "14"; + +fn default_conf(etcd_binary_path: &Path) -> String { + format!( + r#" +# Default built-in configuration, defined in main.rs +[etcd_broker] +broker_endpoints = ['http://localhost:2379'] +etcd_binary_path = '{etcd_binary_path}' + +[pageserver] +id = {DEFAULT_PAGESERVER_ID} +listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}' +listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}' +auth_type = '{pageserver_auth_type}' + +[[safekeepers]] +id = {DEFAULT_SAFEKEEPER_ID} +pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} +http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} +"#, + etcd_binary_path = etcd_binary_path.display(), + pageserver_auth_type = AuthType::Trust, + ) +} + +/// +/// Timelines tree element used as a value in the HashMap. +/// +struct TimelineTreeEl { + /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. + pub info: TimelineInfo, + /// Name, recovered from neon config mappings + pub name: Option, + /// Holds all direct children of this timeline referenced using `timeline_id`. 
+ pub children: BTreeSet, +} + +// Main entry point for the 'neon_local' CLI utility +// +// This utility helps to manage neon installation. That includes following: +// * Management of local postgres installations running on top of the +// pageserver. +// * Providing CLI api to the pageserver +// * TODO: export/import to/from usual postgres +fn main() -> Result<()> { + let matches = cli().get_matches(); + + let (sub_name, sub_args) = match matches.subcommand() { + Some(subcommand_data) => subcommand_data, + None => bail!("no subcommand provided"), + }; + + // Check for 'neon init' command first. + let subcommand_result = if sub_name == "init" { + handle_init(sub_args).map(Some) + } else { + // all other commands need an existing config + let mut env = LocalEnv::load_config().context("Error loading config")?; + let original_env = env.clone(); + + let subcommand_result = match sub_name { + "tenant" => handle_tenant(sub_args, &mut env), + "timeline" => handle_timeline(sub_args, &mut env), + "start" => handle_start_all(sub_args, &env), + "stop" => handle_stop_all(sub_args, &env), + "pageserver" => handle_pageserver(sub_args, &env), + "pg" => handle_pg(sub_args, &env), + "safekeeper" => handle_safekeeper(sub_args, &env), + _ => bail!("unexpected subcommand {sub_name}"), + }; + + if original_env != env { + subcommand_result.map(|()| Some(env)) + } else { + subcommand_result.map(|()| None) + } + }; + + match subcommand_result { + Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(None) => (), + Err(e) => { + eprintln!("command failed: {e:?}"); + exit(1); + } + } + Ok(()) +} + +/// +/// Prints timelines list as a tree-like structure. +/// +fn print_timelines_tree( + timelines: Vec, + mut timeline_name_mappings: HashMap, +) -> Result<()> { + let mut timelines_hash = timelines + .iter() + .map(|t| { + ( + t.timeline_id, + TimelineTreeEl { + info: t.clone(), + children: BTreeSet::new(), + name: timeline_name_mappings + .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), + }, + ) + }) + .collect::>(); + + // Memorize all direct children of each timeline. + for timeline in timelines.iter() { + if let Some(ancestor_timeline_id) = timeline.ancestor_timeline_id { + timelines_hash + .get_mut(&ancestor_timeline_id) + .context("missing timeline info in the HashMap")? + .children + .insert(timeline.timeline_id); + } + } + + for timeline in timelines_hash.values() { + // Start with root local timelines (no ancestors) first. + if timeline.info.ancestor_timeline_id.is_none() { + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; + } + } + + Ok(()) +} + +/// +/// Recursively prints timeline info with all its children. +/// +fn print_timeline( + nesting_level: usize, + is_last: &[bool], + timeline: &TimelineTreeEl, + timelines: &HashMap, +) -> Result<()> { + if nesting_level > 0 { + let ancestor_lsn = match timeline.info.ancestor_lsn { + Some(lsn) => lsn.to_string(), + None => "Unknown Lsn".to_string(), + }; + + let mut br_sym = "┣━"; + + // Draw each nesting padding with proper style + // depending on whether its timeline ended or not. 
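+ // The resulting line for a child looks roughly like
+ //   ┣━ @<ancestor_lsn>: <branch name> [<timeline id>]
+ // with ┗━ used for the last child on its level.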
+ if nesting_level > 1 { + for l in &is_last[1..is_last.len() - 1] { + if *l { + print!(" "); + } else { + print!("┃ "); + } + } + } + + // We are the last in this sub-timeline + if *is_last.last().unwrap() { + br_sym = "┗━"; + } + + print!("{} @{}: ", br_sym, ancestor_lsn); + } + + // Finally print a timeline id and name with new line + println!( + "{} [{}]", + timeline.name.as_deref().unwrap_or("_no_name_"), + timeline.info.timeline_id + ); + + let len = timeline.children.len(); + let mut i: usize = 0; + let mut is_last_new = Vec::from(is_last); + is_last_new.push(false); + + for child in &timeline.children { + i += 1; + + // Mark that the last padding is the end of the timeline + if i == len { + if let Some(last) = is_last_new.last_mut() { + *last = true; + } + } + + print_timeline( + nesting_level + 1, + &is_last_new, + timelines + .get(child) + .context("missing timeline info in the HashMap")?, + timelines, + )?; + } + + Ok(()) +} + +/// Returns a map of timeline IDs to timeline_id@lsn strings. +/// Connects to the pageserver to query this information. +fn get_timeline_infos( + env: &local_env::LocalEnv, + tenant_id: &TenantId, +) -> Result> { + Ok(PageServerNode::from_env(env) + .timeline_list(tenant_id)? + .into_iter() + .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) + .collect()) +} + +// Helper function to parse --tenant_id option, or get the default from config file +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { + tenant_id_from_arguments + } else if let Some(default_id) = env.default_tenant_id { + Ok(default_id) + } else { + bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); + } +} + +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .get_one::("tenant-id") + .map(|tenant_id| TenantId::from_str(tenant_id)) + .transpose() + .context("Failed to parse tenant id from the argument string") +} + +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .get_one::("timeline-id") + .map(|timeline_id| TimelineId::from_str(timeline_id)) + .transpose() + .context("Failed to parse timeline id from the argument string") +} + +fn handle_init(init_match: &ArgMatches) -> anyhow::Result { + let initial_timeline_id_arg = parse_timeline_id(init_match)?; + + // Create config file + let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + // load and parse the file + std::fs::read_to_string(config_path).with_context(|| { + format!( + "Could not read configuration file '{}'", + config_path.display() + ) + })? + } else { + // Built-in default config + default_conf(&EtcdBroker::locate_etcd()?) + }; + + let pg_version = init_match + .get_one::("pg-version") + .copied() + .context("Failed to parse postgres version from the argument string")?; + + let mut env = + LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; + env.init(pg_version) + .context("Failed to initialize neon repository")?; + let initial_tenant_id = env + .default_tenant_id + .expect("default_tenant_id should be generated by the `env.init()` call above"); + + // Initialize pageserver, create initial tenant and timeline. 
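+ // The initial timeline is registered below under DEFAULT_BRANCH_NAME ("main"),
+ // so subsequent `timeline` / `pg` subcommands can refer to it by branch name.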
+ let pageserver = PageServerNode::from_env(&env); + let initial_timeline_id = pageserver + .initialize( + Some(initial_tenant_id), + initial_timeline_id_arg, + &pageserver_config_overrides(init_match), + pg_version, + ) + .unwrap_or_else(|e| { + eprintln!("pageserver init failed: {e}"); + exit(1); + }); + + env.register_branch_mapping( + DEFAULT_BRANCH_NAME.to_owned(), + initial_tenant_id, + initial_timeline_id, + )?; + + Ok(env) +} + +fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { + init_match + .get_many::("pageserver-config-override") + .into_iter() + .flatten() + .map(|s| s.as_str()) + .collect() +} + +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> { + let pageserver = PageServerNode::from_env(env); + match tenant_match.subcommand() { + Some(("list", _)) => { + for t in pageserver.tenant_list()? { + println!("{} {:?}", t.id, t.state); + } + } + Some(("create", create_match)) => { + let initial_tenant_id = parse_tenant_id(create_match)?; + let tenant_conf: HashMap<_, _> = create_match + .get_many::("config") + .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) + .unwrap_or_default(); + let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?; + println!("tenant {new_tenant_id} successfully created on the pageserver"); + + // Create an initial timeline for the new tenant + let new_timeline_id = parse_timeline_id(create_match)?; + let pg_version = create_match + .get_one::("pg-version") + .copied() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = pageserver.timeline_create( + new_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; + let new_timeline_id = timeline_info.timeline_id; + let last_record_lsn = timeline_info.last_record_lsn; + + env.register_branch_mapping( + DEFAULT_BRANCH_NAME.to_string(), + new_tenant_id, + new_timeline_id, + )?; + + println!( + "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}", + ); + } + Some(("config", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let tenant_conf: HashMap<_, _> = create_match + .get_many::("config") + .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) + .unwrap_or_default(); + + pageserver + .tenant_config(tenant_id, tenant_conf) + .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; + println!("tenant {tenant_id} successfully configured on the pageserver"); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + None => bail!("no tenant subcommand provided"), + } + Ok(()) +} + +fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { + let pageserver = PageServerNode::from_env(env); + + match timeline_match.subcommand() { + Some(("list", list_match)) => { + let tenant_id = get_tenant_id(list_match, env)?; + let timelines = pageserver.timeline_list(&tenant_id)?; + print_timelines_tree(timelines, env.timeline_name_mappings())?; + } + Some(("create", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let new_branch_name = create_match + .get_one::("branch-name") + .ok_or_else(|| anyhow!("No branch name provided"))?; + + let pg_version = create_match + .get_one::("pg-version") + .copied() + .context("Failed to parse postgres version from the argument string")?; + + let timeline_info = + pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?; + 
let new_timeline_id = timeline_info.timeline_id; + + let last_record_lsn = timeline_info.last_record_lsn; + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}", + timeline_info.timeline_id + ); + } + Some(("import", import_match)) => { + let tenant_id = get_tenant_id(import_match, env)?; + let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); + let name = import_match + .get_one::("node-name") + .ok_or_else(|| anyhow!("No node name provided"))?; + + // Parse base inputs + let base_tarfile = import_match + .get_one::("base-tarfile") + .ok_or_else(|| anyhow!("No base-tarfile provided"))? + .to_owned(); + let base_lsn = Lsn::from_str( + import_match + .get_one::("base-lsn") + .ok_or_else(|| anyhow!("No base-lsn provided"))?, + )?; + let base = (base_lsn, base_tarfile); + + // Parse pg_wal inputs + let wal_tarfile = import_match.get_one::("wal-tarfile").cloned(); + let end_lsn = import_match + .get_one::("end-lsn") + .map(|s| Lsn::from_str(s).unwrap()); + // TODO validate both or none are provided + let pg_wal = end_lsn.zip(wal_tarfile); + + let pg_version = import_match + .get_one::("pg-version") + .copied() + .context("Failed to parse postgres version from the argument string")?; + + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + + cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?; + println!("Done"); + } + Some(("branch", branch_match)) => { + let tenant_id = get_tenant_id(branch_match, env)?; + let new_branch_name = branch_match + .get_one::("branch-name") + .ok_or_else(|| anyhow!("No branch name provided"))?; + let ancestor_branch_name = branch_match + .get_one::("ancestor-branch-name") + .map(|s| s.as_str()) + .unwrap_or(DEFAULT_BRANCH_NAME); + let ancestor_timeline_id = env + .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .ok_or_else(|| { + anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'") + })?; + + let start_lsn = branch_match + .get_one::("ancestor-start-lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) + .transpose() + .context("Failed to parse ancestor start Lsn from the request")?; + let timeline_info = pageserver.timeline_create( + tenant_id, + None, + start_lsn, + Some(ancestor_timeline_id), + None, + )?; + let new_timeline_id = timeline_info.timeline_id; + + let last_record_lsn = timeline_info.last_record_lsn; + + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. 
Ancestor timeline: '{ancestor_branch_name}'", + timeline_info.timeline_id + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"), + None => bail!("no tenant subcommand provided"), + } + + Ok(()) +} + +fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match pg_match.subcommand() { + Some(pg_subcommand_data) => pg_subcommand_data, + None => bail!("no pg subcommand provided"), + }; + + let mut cplane = ComputeControlPlane::load(env.clone())?; + + // All subcommands take an optional --tenant-id option + let tenant_id = get_tenant_id(sub_args, env)?; + + match sub_name { + "list" => { + let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); + HashMap::new() + }); + + let timeline_name_mappings = env.timeline_name_mappings(); + + let mut table = comfy_table::Table::new(); + + table.load_preset(comfy_table::presets::NOTHING); + + table.set_header(&[ + "NODE", + "ADDRESS", + "TIMELINE", + "BRANCH NAME", + "LSN", + "STATUS", + ]); + + for ((_, node_name), node) in cplane + .nodes + .iter() + .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) + { + let lsn_str = match node.lsn { + None => { + // -> primary node + // Use the LSN at the end of the timeline. + timeline_infos + .get(&node.timeline_id) + .map(|bi| bi.last_record_lsn.to_string()) + .unwrap_or_else(|| "?".to_string()) + } + Some(lsn) => { + // -> read-only node + // Use the node's LSN. + lsn.to_string() + } + }; + + let branch_name = timeline_name_mappings + .get(&TenantTimelineId::new(tenant_id, node.timeline_id)) + .map(|name| name.as_str()) + .unwrap_or("?"); + + table.add_row(&[ + node_name.as_str(), + &node.address.to_string(), + &node.timeline_id.to_string(), + branch_name, + lsn_str.as_str(), + node.status(), + ]); + } + + println!("{table}"); + } + "create" => { + let branch_name = sub_args + .get_one::("branch-name") + .map(|s| s.as_str()) + .unwrap_or(DEFAULT_BRANCH_NAME); + let node_name = sub_args + .get_one::("node") + .map(|node_name| node_name.to_string()) + .unwrap_or_else(|| format!("{branch_name}_node")); + + let lsn = sub_args + .get_one::("lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) + .transpose() + .context("Failed to parse Lsn from the request")?; + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; + + let port: Option = sub_args.get_one::("port").copied(); + + let pg_version = sub_args + .get_one::("pg-version") + .copied() + .context("Failed to parse postgres version from the argument string")?; + + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?; + } + "start" => { + let port: Option = sub_args.get_one::("port").copied(); + let node_name = sub_args + .get_one::("node") + .ok_or_else(|| anyhow!("No node name was provided to start"))?; + + let node = cplane.nodes.get(&(tenant_id, node_name.to_string())); + + let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) { + let claims = Claims::new(Some(tenant_id), Scope::Tenant); + + Some(env.generate_auth_token(&claims)?) 
+ } else { + None + }; + + if let Some(node) = node { + println!("Starting existing postgres {node_name}..."); + node.start(&auth_token)?; + } else { + let branch_name = sub_args + .get_one::("branch-name") + .map(|s| s.as_str()) + .unwrap_or(DEFAULT_BRANCH_NAME); + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| { + anyhow!("Found no timeline id for branch name '{branch_name}'") + })?; + let lsn = sub_args + .get_one::("lsn") + .map(|lsn_str| Lsn::from_str(lsn_str)) + .transpose() + .context("Failed to parse Lsn from the request")?; + let pg_version = sub_args + .get_one::("pg-version") + .copied() + .context("Failed to `pg-version` from the argument string")?; + // when used with custom port this results in non obvious behaviour + // port is remembered from first start command, i e + // start --port X + // stop + // start <-- will also use port X even without explicit port argument + println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ..."); + + let node = + cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?; + node.start(&auth_token)?; + } + } + "stop" => { + let node_name = sub_args + .get_one::("node") + .ok_or_else(|| anyhow!("No node name was provided to stop"))?; + let destroy = sub_args.get_flag("destroy"); + + let node = cplane + .nodes + .get(&(tenant_id, node_name.to_string())) + .with_context(|| format!("postgres {node_name} is not found"))?; + node.stop(destroy)?; + } + + _ => bail!("Unexpected pg subcommand '{sub_name}'"), + } + + Ok(()) +} + +fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let pageserver = PageServerNode::from_env(env); + + match sub_match.subcommand() { + Some(("start", start_match)) => { + if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) { + eprintln!("pageserver start failed: {e}"); + exit(1); + } + } + + Some(("stop", stop_match)) => { + let immediate = stop_match + .get_one::("stop-mode") + .map(|s| s.as_str()) + == Some("immediate"); + + if let Err(e) = pageserver.stop(immediate) { + eprintln!("pageserver stop failed: {}", e); + exit(1); + } + } + + Some(("restart", restart_match)) => { + //TODO what shutdown strategy should we use here? 
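+ // For now restart always stops gracefully (immediate = false), i.e. repository
+ // data is flushed before shutdown.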
+ if let Err(e) = pageserver.stop(false) { + eprintln!("pageserver stop failed: {}", e); + exit(1); + } + + if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) { + eprintln!("pageserver start failed: {e}"); + exit(1); + } + } + + Some(("status", _)) => match PageServerNode::from_env(env).check_status() { + Ok(_) => println!("Page server is up and running"), + Err(err) => { + eprintln!("Page server is not available: {}", err); + exit(1); + } + }, + + Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), + None => bail!("no pageserver subcommand provided"), + } + Ok(()) +} + +fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { + if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { + Ok(SafekeeperNode::from_env(env, node)) + } else { + bail!("could not find safekeeper '{}'", id) + } +} + +fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let (sub_name, sub_args) = match sub_match.subcommand() { + Some(safekeeper_command_data) => safekeeper_command_data, + None => bail!("no safekeeper subcommand provided"), + }; + + // All the commands take an optional safekeeper name argument + let sk_id = if let Some(id_str) = sub_args.get_one::("id") { + NodeId(id_str.parse().context("while parsing safekeeper id")?) + } else { + DEFAULT_SAFEKEEPER_ID + }; + let safekeeper = get_safekeeper(env, sk_id)?; + + match sub_name { + "start" => { + if let Err(e) = safekeeper.start() { + eprintln!("safekeeper start failed: {}", e); + exit(1); + } + } + + "stop" => { + let immediate = + sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + if let Err(e) = safekeeper.stop(immediate) { + eprintln!("safekeeper stop failed: {}", e); + exit(1); + } + } + + "restart" => { + let immediate = + sub_args.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + if let Err(e) = safekeeper.stop(immediate) { + eprintln!("safekeeper stop failed: {}", e); + exit(1); + } + + if let Err(e) = safekeeper.start() { + eprintln!("safekeeper start failed: {}", e); + exit(1); + } + } + + _ => { + bail!("Unexpected safekeeper subcommand '{}'", sub_name) + } + } + Ok(()) +} + +fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { + etcd::start_etcd_process(env)?; + let pageserver = PageServerNode::from_env(env); + + // Postgres nodes are not started automatically + + if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { + eprintln!("pageserver start failed: {e}"); + try_stop_etcd_process(env); + exit(1); + } + + for node in env.safekeepers.iter() { + let safekeeper = SafekeeperNode::from_env(env, node); + if let Err(e) = safekeeper.start() { + eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); + try_stop_etcd_process(env); + exit(1); + } + } + Ok(()) +} + +fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { + let immediate = + sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + + let pageserver = PageServerNode::from_env(env); + + // Stop all compute nodes + let cplane = ComputeControlPlane::load(env.clone())?; + for (_k, node) in cplane.nodes { + if let Err(e) = node.stop(false) { + eprintln!("postgres stop failed: {}", e); + } + } + + if let Err(e) = pageserver.stop(immediate) { + eprintln!("pageserver stop failed: {}", e); + } + + for node in env.safekeepers.iter() { + let safekeeper = SafekeeperNode::from_env(env, node); + if let Err(e) = 
safekeeper.stop(immediate) { + eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); + } + } + + try_stop_etcd_process(env); + + Ok(()) +} + +fn try_stop_etcd_process(env: &local_env::LocalEnv) { + if let Err(e) = etcd::stop_etcd_process(env) { + eprintln!("etcd stop failed: {e}"); + } +} + +fn cli() -> Command { + let branch_name_arg = Arg::new("branch-name") + .long("branch-name") + .help("Name of the branch to be created or used as an alias for other services") + .required(false); + + let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); + + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + + let tenant_id_arg = Arg::new("tenant-id") + .long("tenant-id") + .help("Tenant id. Represented as a hexadecimal string 32 symbols length") + .required(false); + + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. Represented as a hexadecimal string 32 symbols length") + .required(false); + + let pg_version_arg = Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .required(false) + .value_parser(value_parser!(u32)) + .default_value(DEFAULT_PG_VERSION); + + let port_arg = Arg::new("port") + .long("port") + .required(false) + .value_parser(value_parser!(u16)) + .value_name("port"); + + let stop_mode_arg = Arg::new("stop-mode") + .short('m') + .value_parser(["fast", "immediate"]) + .help("If 'immediate', don't flush repository data at shutdown") + .required(false) + .value_name("stop-mode"); + + let pageserver_config_args = Arg::new("pageserver-config-override") + .long("pageserver-config-override") + .num_args(1) + .action(ArgAction::Append) + .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") + .required(false); + + let lsn_arg = Arg::new("lsn") + .long("lsn") + .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") + .required(false); + + Command::new("Neon CLI") + .arg_required_else_help(true) + .version(GIT_VERSION) + .subcommand( + Command::new("init") + .about("Initialize a new Neon repository") + .arg(pageserver_config_args.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + .arg( + Arg::new("config") + .long("config") + .required(false) + .value_parser(value_parser!(PathBuf)) + .value_name("config"), + ) + .arg(pg_version_arg.clone()) + ) + .subcommand( + Command::new("timeline") + .about("Manage timelines") + .subcommand(Command::new("list") + .about("List all timelines, available to this pageserver") + .arg(tenant_id_arg.clone())) + .subcommand(Command::new("branch") + .about("Create a new timeline, using another timeline as a base, copying its data") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name") + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. 
The timeline gets resolved by its branch name.").required(false)) + .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn") + .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) + .subcommand(Command::new("create") + .about("Create a new blank timeline") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("import") + .about("Import timeline from basebackup directory") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(Arg::new("node-name").long("node-name") + .help("Name to assign to the imported timeline")) + .arg(Arg::new("base-tarfile") + .long("base-tarfile") + .value_parser(value_parser!(PathBuf)) + .help("Basebackup tarfile to import") + ) + .arg(Arg::new("base-lsn").long("base-lsn") + .help("Lsn the basebackup starts at")) + .arg(Arg::new("wal-tarfile") + .long("wal-tarfile") + .value_parser(value_parser!(PathBuf)) + .help("Wal to add after base") + ) + .arg(Arg::new("end-lsn").long("end-lsn") + .help("Lsn the basebackup ends at")) + .arg(pg_version_arg.clone()) + ) + ).subcommand( + Command::new("tenant") + .arg_required_else_help(true) + .about("Manage tenants") + .subcommand(Command::new("list")) + .subcommand(Command::new("create") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("config") + .arg(tenant_id_arg.clone()) + .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)) + ) + ) + .subcommand( + Command::new("pageserver") + .arg_required_else_help(true) + .about("Manage pageserver") + .subcommand(Command::new("status")) + .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) + .subcommand(Command::new("stop").about("Stop local pageserver") + .arg(stop_mode_arg.clone())) + .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) + ) + .subcommand( + Command::new("safekeeper") + .arg_required_else_help(true) + .about("Manage safekeepers") + .subcommand(Command::new("start") + .about("Start local safekeeper") + .arg(safekeeper_id_arg.clone()) + ) + .subcommand(Command::new("stop") + .about("Stop local safekeeper") + .arg(safekeeper_id_arg.clone()) + .arg(stop_mode_arg.clone()) + ) + .subcommand(Command::new("restart") + .about("Restart local safekeeper") + .arg(safekeeper_id_arg) + .arg(stop_mode_arg.clone()) + ) + ) + .subcommand( + Command::new("pg") + .arg_required_else_help(true) + .about("Manage postgres instances") + .subcommand(Command::new("list").arg(tenant_id_arg.clone())) + .subcommand(Command::new("create") + .about("Create a postgres compute node") + .arg(pg_node_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) + .arg(port_arg.clone()) + .arg( + Arg::new("config-only") + .help("Don't do basebackup, create compute node with only config files") + .long("config-only") + .required(false)) + .arg(pg_version_arg.clone()) + ) + .subcommand(Command::new("start") + .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") + .arg(pg_node_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg) + 
.arg(timeline_id_arg) + .arg(lsn_arg) + .arg(port_arg) + .arg(pg_version_arg) + ) + .subcommand( + Command::new("stop") + .arg(pg_node_arg) + .arg(tenant_id_arg) + .arg( + Arg::new("destroy") + .help("Also delete data directory (now optional, should be default in future)") + .long("destroy") + .action(ArgAction::SetTrue) + .required(false) + ) + ) + + ) + .subcommand( + Command::new("start") + .about("Start page server and safekeepers") + .arg(pageserver_config_args) + ) + .subcommand( + Command::new("stop") + .about("Stop page server and safekeepers") + .arg(stop_mode_arg) + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index a61191e7a4..359948a8c9 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -11,15 +11,15 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; -use zenith_utils::connstring::connection_host_port; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, + postgres_backend::AuthType, +}; -use crate::local_env::LocalEnv; +use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION}; +use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; -use crate::storage::PageServerNode; // // ComputeControlPlane @@ -27,7 +27,7 @@ use crate::storage::PageServerNode; pub struct ComputeControlPlane { base_port: u16, pageserver: Arc, - pub nodes: BTreeMap<(ZTenantId, String), Arc>, + pub nodes: BTreeMap<(TenantId, String), Arc>, env: LocalEnv, } @@ -37,7 +37,7 @@ impl ComputeControlPlane { // pgdatadirs // |- tenants // | |- - // | | |- + // | | |- pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); @@ -52,7 +52,7 @@ impl ComputeControlPlane { .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? { let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenantid, node.name.clone()), Arc::new(node)); + nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); } } @@ -73,40 +73,15 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - // FIXME: see also parse_point_in_time in branches.rs. - fn parse_point_in_time( - &self, - tenantid: ZTenantId, - s: &str, - ) -> Result<(ZTimelineId, Option)> { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Resolve the timeline ID, given the human-readable branch name - let timeline_id = self - .pageserver - .branch_get_by_name(&tenantid, name)? 
- .timeline_id; - - Ok((timeline_id, lsn)) - } - pub fn new_node( &mut self, - tenantid: ZTenantId, + tenant_id: TenantId, name: &str, - timeline_spec: &str, + timeline_id: TimelineId, + lsn: Option, port: Option, + pg_version: u32, ) -> Result> { - // Resolve the human-readable timeline spec into timeline ID and LSN - let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?; - let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { name: name.to_owned(), @@ -114,17 +89,18 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test: false, - timelineid, + timeline_id, lsn, - tenantid, + tenant_id, uses_wal_proposer: false, + pg_version, }); node.create_pgdata()?; node.setup_pg_conf(self.env.pageserver.auth_type)?; self.nodes - .insert((tenantid, node.name.clone()), Arc::clone(&node)); + .insert((tenant_id, node.name.clone()), Arc::clone(&node)); Ok(node) } @@ -139,10 +115,11 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timelineid: ZTimelineId, + pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. None for primary - pub tenantid: ZTenantId, + pub tenant_id: TenantId, uses_wal_proposer: bool, + pg_version: u32, } impl PostgresNode { @@ -173,9 +150,17 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; - let uses_wal_proposer = conf.get("wal_acceptors").is_some(); + let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; + let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); + + // Read postgres version from PG_VERSION file to determine which postgres version binary to use. + // If it doesn't exist, assume broken data directory and use default pg version. 
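+        // (PG_VERSION contains just the major version number as text, e.g. "14".)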
+ let pg_version_path = entry.path().join("PG_VERSION"); + + let pg_version_str = + fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string()); + let pg_version = u32::from_str(&pg_version_str)?; // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -188,21 +173,28 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timelineid, + timeline_id, lsn: recovery_target_lsn, - tenantid, + tenant_id, uses_wal_proposer, + pg_version, }) } - fn sync_safekeepers(&self, auth_token: &Option) -> Result { - let pg_path = self.env.pg_bin_dir().join("postgres"); + fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { + let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); let mut cmd = Command::new(&pg_path); cmd.arg("--sync-safekeepers") .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), + ) .env("PGDATA", self.pgdata().to_str().unwrap()) .stdout(Stdio::piped()) // Comment this to avoid capturing stderr (useful if command hangs) @@ -241,9 +233,9 @@ impl PostgresNode { ); let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn) + format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) } else { - format!("basebackup {} {}", self.tenantid, self.timelineid) + format!("basebackup {} {}", self.tenant_id, self.timeline_id) }; let mut client = self @@ -256,8 +248,13 @@ impl PostgresNode { .context("page server 'basebackup' command failed")?; // Read the archive directly from the `CopyOutReader` - tar::Archive::new(copyreader) - .unpack(&self.pgdata()) + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata()) .context("extracting base backup failed")?; Ok(()) @@ -279,14 +276,12 @@ impl PostgresNode { }) } - // Connect to a page server, get base backup, and untar it to initialize a - // new data directory + // Write postgresql.conf with default configuration + // and PG_VERSION file to the data directory of a new node. fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); - // wal_log_hints is mandatory when running against pageserver (see gh issue#192) - // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE? - conf.append("wal_log_hints", "on"); + conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); conf.append("shared_buffers", "1MB"); @@ -298,16 +293,14 @@ impl PostgresNode { conf.append("wal_sender_timeout", "5s"); conf.append("listen_addresses", &self.address.ip().to_string()); conf.append("port", &self.address.port().to_string()); - - // Never clean up old WAL. TODO: We should use a replication - // slot or something proper, to prevent the compute node - // from removing WAL that hasn't been streamed to the safekeeper or - // page server yet. 
(gh issue #349) - conf.append("wal_keep_size", "10TB"); + conf.append("wal_keep_size", "0"); + // walproposer panics when basebackup is invalid, it is pointless to restart in this case. + conf.append("restart_after_crash", "off"); // Configure the node to fetch pages from pageserver let pageserver_connstr = { - let (host, port) = connection_host_port(&self.pageserver.pg_connection_config); + let config = &self.pageserver.pg_connection_config; + let (host, port) = (config.host(), config.port()); // Set up authentication // @@ -315,7 +308,7 @@ impl PostgresNode { // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings - let password = if let AuthType::ZenithJWT = auth_type { + let password = if let AuthType::NeonJWT = auth_type { "$ZENITH_AUTH_TOKEN" } else { "" @@ -324,13 +317,13 @@ impl PostgresNode { // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. - format!("postgresql://no_user:{}@{}:{}", password, host, port) + format!("postgresql://no_user:{password}@{host}:{port}") }; - conf.append("shared_preload_libraries", "zenith"); + conf.append("shared_preload_libraries", "neon"); conf.append_line(""); - conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); - conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + conf.append("neon.pageserver_connstring", &pageserver_connstr); + conf.append("neon.tenant_id", &self.tenant_id.to_string()); + conf.append("neon.timeline_id", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -357,14 +350,14 @@ impl PostgresNode { // Configure the node to connect to the safekeepers conf.append("synchronous_standby_names", "walproposer"); - let wal_acceptors = self + let safekeepers = self .env .safekeepers .iter() .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("wal_acceptors", &wal_acceptors); + conf.append("neon.safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, @@ -375,12 +368,14 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. conf.append("synchronous_standby_names", "pageserver"); - conf.append("zenith.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; file.write_all(conf.to_string().as_bytes())?; + let mut file = File::create(self.pgdata().join("PG_VERSION"))?; + file.write_all(self.pg_version.to_string().as_bytes())?; + Ok(()) } @@ -392,7 +387,7 @@ impl PostgresNode { // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token)?; + let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; if lsn == Lsn(0) { None } else { @@ -408,7 +403,7 @@ impl PostgresNode { } pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenantid, &self.name) + self.env.pg_data_dir(&self.tenant_id, &self.name) } pub fn status(&self) -> &str { @@ -425,7 +420,7 @@ impl PostgresNode { } fn pg_ctl(&self, args: &[&str], auth_token: &Option) -> Result<()> { - let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl"); + let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl"); let mut cmd = Command::new(pg_ctl_path); cmd.args( [ @@ -441,15 +436,26 @@ impl PostgresNode { .concat(), ) .env_clear() - .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap()); + .env( + "LD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), + ) + .env( + "DYLD_LIBRARY_PATH", + self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), + ); if let Some(token) = auth_token { cmd.env("ZENITH_AUTH_TOKEN", token); } - let pg_ctl = cmd.status().context("pg_ctl failed")?; - if !pg_ctl.success() { - anyhow::bail!("pg_ctl failed"); + let pg_ctl = cmd.output().context("pg_ctl failed")?; + if !pg_ctl.status.success() { + anyhow::bail!( + "pg_ctl failed, exit code: {}, stdout: {}, stderr: {}", + pg_ctl.status, + String::from_utf8_lossy(&pg_ctl.stdout), + String::from_utf8_lossy(&pg_ctl.stderr), + ); } Ok(()) } @@ -517,7 +523,7 @@ impl PostgresNode { "host={} port={} user={} dbname={}", self.address.ip(), self.address.port(), - "zenith_admin", + "cloud_admin", "postgres" ) } diff --git a/control_plane/src/connection.rs b/control_plane/src/connection.rs new file mode 100644 index 0000000000..cca837de6e --- /dev/null +++ b/control_plane/src/connection.rs @@ -0,0 +1,57 @@ +use url::Url; + +#[derive(Debug)] +pub struct PgConnectionConfig { + url: Url, +} + +impl PgConnectionConfig { + pub fn host(&self) -> &str { + self.url.host_str().expect("BUG: no host") + } + + pub fn port(&self) -> u16 { + self.url.port().expect("BUG: no port") + } + + /// Return a `:` string. + pub fn raw_address(&self) -> String { + format!("{}:{}", self.host(), self.port()) + } + + /// Connect using postgres protocol with TLS disabled. + pub fn connect_no_tls(&self) -> Result { + postgres::Client::connect(self.url.as_str(), postgres::NoTls) + } +} + +impl std::str::FromStr for PgConnectionConfig { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let mut url: Url = s.parse()?; + + match url.scheme() { + "postgres" | "postgresql" => {} + other => anyhow::bail!("invalid scheme: {other}"), + } + + // It's not a valid connection url if host is unavailable. + if url.host().is_none() { + anyhow::bail!(url::ParseError::EmptyHost); + } + + // E.g. `postgres:bar`. + if url.cannot_be_a_base() { + anyhow::bail!("URL cannot be a base"); + } + + // Set the default PG port if it's missing. 
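+        // e.g. "postgresql://no_user@127.0.0.1" is treated as "postgresql://no_user@127.0.0.1:5432".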
+ if url.port().is_none() { + url.set_port(Some(5432)) + .expect("BUG: couldn't set the default port"); + } + + Ok(Self { url }) + } +} diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs new file mode 100644 index 0000000000..4c15914e24 --- /dev/null +++ b/control_plane/src/etcd.rs @@ -0,0 +1,77 @@ +use std::{fs, path::PathBuf}; + +use anyhow::Context; + +use crate::{background_process, local_env}; + +pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let etcd_broker = &env.etcd_broker; + println!( + "Starting etcd broker using {:?}", + etcd_broker.etcd_binary_path + ); + + let etcd_data_dir = env.base_data_dir.join("etcd"); + fs::create_dir_all(&etcd_data_dir) + .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?; + + let client_urls = etcd_broker.comma_separated_endpoints(); + let args = [ + format!("--data-dir={}", etcd_data_dir.display()), + format!("--listen-client-urls={client_urls}"), + format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. + "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), + ]; + + let pid_file_path = etcd_pid_file_path(env); + + let client = reqwest::blocking::Client::new(); + + background_process::start_process( + "etcd", + &etcd_data_dir, + &etcd_broker.etcd_binary_path, + &args, + background_process::InitialPidFile::Create(&pid_file_path), + || { + for broker_endpoint in &etcd_broker.broker_endpoints { + let request = broker_endpoint + .join("health") + .with_context(|| { + format!( + "Failed to append /health path to broker endopint {}", + broker_endpoint + ) + }) + .and_then(|url| { + client.get(&url.to_string()).build().with_context(|| { + format!("Failed to construct request to etcd endpoint {url}") + }) + })?; + if client.execute(request).is_ok() { + return Ok(true); + } + } + + Ok(false) + }, + ) + .context("Failed to spawn etcd subprocess")?; + + Ok(()) +} + +pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + background_process::stop_process(true, "etcd", &etcd_pid_file_path(env)) +} + +fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { + env.base_data_dir.join("etcd.pid") +} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index a2ecdd3d64..c3b47fe81b 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -6,45 +6,12 @@ // Intended to be used in integration tests and in CLI tools for // local installations. // -use anyhow::{anyhow, bail, Context, Result}; -use std::fs; -use std::path::Path; -use std::process::Command; +mod background_process; pub mod compute; +pub mod connection; +pub mod etcd; pub mod local_env; +pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; -pub mod storage; - -/// Read a PID file -/// -/// We expect a file that contains a single integer. -/// We return an i32 for compatibility with libc and nix. 
-pub fn read_pidfile(pidfile: &Path) -> Result { - let pid_str = fs::read_to_string(pidfile) - .with_context(|| format!("failed to read pidfile {:?}", pidfile))?; - let pid: i32 = pid_str - .parse() - .map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?; - if pid < 1 { - bail!("pidfile {:?} contained bad value '{}'", pidfile, pid); - } - Ok(pid) -} - -fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { - let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1"); - - let var = "LLVM_PROFILE_FILE"; - if let Some(val) = std::env::var_os(var) { - cmd.env(var, val); - } - - const RUST_LOG_KEY: &str = "RUST_LOG"; - if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) { - cmd.env(RUST_LOG_KEY, rust_log_value) - } else { - cmd - } -} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b80e137cb9..ac4ebd0d1e 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,32 +3,41 @@ //! Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, Context}; +use anyhow::{bail, ensure, Context}; +use reqwest::Url; use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use std::collections::HashMap; use std::env; -use std::fmt::Write; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; -use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{opt_display_serde, ZTenantId}; +use utils::{ + auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + postgres_backend::AuthType, +}; + +use crate::safekeeper::SafekeeperNode; + +pub const DEFAULT_PG_VERSION: u32 = 14; // -// This data structures represents zenith CLI config +// This data structures represents neon_local CLI config // -// It is deserialized from the .zenith/config file, or the config file passed -// to 'zenith init --config=' option. See control_plane/simple.conf for +// It is deserialized from the .neon/config file, or the config file passed +// to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, Clone, Debug)] +#[serde_as] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute nodes). // // This is not stored in the config file. Rather, this is the path where the - // config file itself is. It is read from the ZENITH_REPO_DIR env variable or - // '.zenith' if not given. + // config file itself is. It is read from the NEON_REPO_DIR env variable or + // '.neon' if not given. #[serde(skip)] pub base_data_dir: PathBuf, @@ -41,27 +50,103 @@ pub struct LocalEnv { // Path to pageserver binary. #[serde(default)] - pub zenith_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, - // Default tenant ID to use with the 'zenith' command line utility, when - // --tenantid is not explicitly specified. - #[serde(with = "opt_display_serde")] + // Default tenant ID to use with the 'neon_local' command line utility, when + // --tenant_id is not explicitly specified. 
 #[serde(default)]
-    pub default_tenantid: Option<ZTenantId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub default_tenant_id: Option<TenantId>,

     // used to issue tokens during e.g pg start
     #[serde(default)]
     pub private_key_path: PathBuf,

+    pub etcd_broker: EtcdBroker,
+
     pub pageserver: PageServerConf,

     #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,
+
+    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
+    #[serde(default)]
+    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
+    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
+    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
+    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+/// Etcd broker config for cluster internal communication.
+#[serde_as]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+pub struct EtcdBroker {
+    /// A prefix to add to any key when pushing to/polling etcd from a node.
+    #[serde(default)]
+    pub broker_etcd_prefix: Option<String>,
+
+    /// Broker (etcd) endpoints for storage node coordination, e.g. 'http://127.0.0.1:2379'.
+    #[serde(default)]
+    #[serde_as(as = "Vec<DisplayFromStr>")]
+    pub broker_endpoints: Vec<Url>,
+
+    /// Etcd binary path to use.
+    #[serde(default)]
+    pub etcd_binary_path: PathBuf,
+}
+
+impl EtcdBroker {
+    pub fn locate_etcd() -> anyhow::Result<PathBuf> {
+        let which_output = Command::new("which")
+            .arg("etcd")
+            .output()
+            .context("Failed to run 'which etcd' command")?;
+        let stdout = String::from_utf8_lossy(&which_output.stdout);
+        ensure!(
+            which_output.status.success(),
+            "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
+            which_output.status,
+            String::from_utf8_lossy(&which_output.stderr)
+        );
+
+        let etcd_path = PathBuf::from(stdout.trim());
+        ensure!(
+            etcd_path.is_file(),
+            "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
+            etcd_path.display()
+        );
+
+        Ok(etcd_path)
+    }
+
+    pub fn comma_separated_endpoints(&self) -> String {
+        self.broker_endpoints
+            .iter()
+            .map(|url| {
+                // URL by default adds a '/' path at the end, which is not what etcd CLI wants.
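+                // e.g. the parsed Url for "http://127.0.0.1:2379" prints back as "http://127.0.0.1:2379/",
+                // so the trailing slash is trimmed off below.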
+ let url_string = url.as_str(); + if url_string.ends_with('/') { + &url_string[0..url_string.len() - 1] + } else { + url_string + } + }) + .fold(String::new(), |mut comma_separated_urls, url| { + if !comma_separated_urls.is_empty() { + comma_separated_urls.push(','); + } + comma_separated_urls.push_str(url); + comma_separated_urls + }) + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { + // node id + pub id: NodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -76,6 +161,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { + id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -84,50 +170,77 @@ impl Default for PageServerConf { } } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub name: String, + pub id: NodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, + pub remote_storage: Option, + pub backup_threads: Option, + pub auth_enabled: bool, } impl Default for SafekeeperConf { fn default() -> Self { Self { - name: String::new(), + id: NodeId(0), pg_port: 0, http_port: 0, sync: true, + remote_storage: None, + backup_threads: None, + auth_enabled: false, } } } impl LocalEnv { - // postgres installation paths - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") - } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_distrib_dir_raw(&self) -> PathBuf { + self.pg_distrib_dir.clone() } - pub fn pageserver_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("pageserver")) + pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { + let path = self.pg_distrib_dir.clone(); + + match pg_version { + 14 => Ok(path.join(format!("v{pg_version}"))), + 15 => Ok(path.join(format!("v{pg_version}"))), + _ => bail!("Unsupported postgres version: {}", pg_version), + } } - pub fn safekeeper_bin(&self) -> anyhow::Result { - Ok(self.zenith_distrib_dir.join("safekeeper")) + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + match pg_version { + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + _ => bail!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { + match pg_version { + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + _ => bail!("Unsupported postgres version: {}", pg_version), + } + } + + pub fn pageserver_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("pageserver") + } + + pub fn safekeeper_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("safekeeper") } pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } - pub fn pg_data_dir(&self, tenantid: &ZTenantId, branch_name: &str) -> PathBuf { + pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf { self.pg_data_dirs_path() - .join(tenantid.to_string()) + .join(tenant_id.to_string()) .join(branch_name) } @@ -136,51 +249,89 @@ impl LocalEnv { self.base_data_dir.clone() } - pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf { - self.base_data_dir.join("safekeepers").join(node_name) + pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf 
{ + self.base_data_dir.join("safekeepers").join(data_dir_name) + } + + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let existing_values = self + .branch_name_mappings + .entry(branch_name.clone()) + .or_default(); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}"); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: TenantId, + ) -> Option { + self.branch_name_mappings + .get(branch_name)? + .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + .map(TimelineId::from) + } + + pub fn timeline_name_mappings(&self) -> HashMap { + self.branch_name_mappings + .iter() + .flat_map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + (TenantTimelineId::new(tenant_id, timeline_id), name.clone()) + }) + }) + .collect() } /// Create a LocalEnv from a config file. /// /// Unlike 'load_config', this function fills in any defaults that are missing /// from the config file. - pub fn create_config(toml: &str) -> anyhow::Result { + pub fn parse_config(toml: &str) -> anyhow::Result { let mut env: LocalEnv = toml::from_str(toml)?; // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install". + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. if env.pg_distrib_dir == Path::new("") { if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { env.pg_distrib_dir = postgres_bin.into(); } else { let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("tmp_install") + env.pg_distrib_dir = cwd.join("pg_install") } } - if !env.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - env.pg_distrib_dir.display() - ); - } - // Find zenith binaries. - if env.zenith_distrib_dir == Path::new("") { - env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); - } - for binary in ["pageserver", "safekeeper"] { - if !env.zenith_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{}' in zenith distrib dir '{}'", - binary, - env.zenith_distrib_dir.display() - ); - } + // Find neon binaries. + if env.neon_distrib_dir == Path::new("") { + env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); } // If no initial tenant ID was given, generate it. - if env.default_tenantid.is_none() { - env.default_tenantid = Some(ZTenantId::generate()); + if env.default_tenant_id.is_none() { + env.default_tenant_id = Some(TenantId::generate()); } env.base_data_dir = base_path(); @@ -194,12 +345,12 @@ impl LocalEnv { if !repopath.exists() { bail!( - "Zenith config is not found in {}. You need to run 'zenith init' first", + "Neon config is not found in {}. 
You need to run 'neon_local init' first", repopath.to_str().unwrap() ); } - // TODO: check that it looks like a zenith repository + // TODO: check that it looks like a neon repository // load and parse file let config = fs::read_to_string(repopath.join("config"))?; @@ -210,6 +361,39 @@ impl LocalEnv { Ok(env) } + pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { + // Currently, the user first passes a config file with 'neon_local init --config=' + // We read that in, in `create_config`, and fill any missing defaults. Then it's saved + // to .neon/config. TODO: We lose any formatting and comments along the way, which is + // a bit sad. + let mut conf_content = r#"# This file describes a locale deployment of the page server +# and safekeeeper node. It is read by the 'neon_local' command-line +# utility. +"# + .to_string(); + + // Convert the LocalEnv to a toml file. + // + // This could be as simple as this: + // + // conf_content += &toml::to_string_pretty(env)?; + // + // But it results in a "values must be emitted before tables". I'm not sure + // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. + // Maybe rust reorders the fields to squeeze avoid padding or something? + // In any case, converting to toml::Value first, and serializing that, works. + // See https://github.com/alexcrichton/toml-rs/issues/142 + conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + + let target_config_path = base_path.join("config"); + fs::write(&target_config_path, conf_content).with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + }) + } + // this function is used only for testing purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { let private_key_path = if self.private_key_path.is_absolute() { @@ -223,20 +407,35 @@ impl LocalEnv { } // - // Initialize a new Zenith repository + // Initialize a new Neon repository // - pub fn init(&mut self) -> anyhow::Result<()> { + pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> { // check if config already exists let base_path = &self.base_data_dir; - if base_path == Path::new("") { - bail!("repository base path is missing"); - } - if base_path.exists() { + ensure!( + base_path != Path::new(""), + "repository base path is missing" + ); + + ensure!( + !base_path.exists(), + "directory '{}' already exists. Perhaps already initialized?", + base_path.display() + ); + if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { bail!( - "directory '{}' already exists. Perhaps already initialized?", - base_path.to_str().unwrap() + "Can't find postgres binary at {}", + self.pg_bin_dir(pg_version)?.display() ); } + for binary in ["pageserver", "safekeeper"] { + if !self.neon_distrib_dir.join(binary).exists() { + bail!( + "Can't find binary '{binary}' in neon distrib dir '{}'", + self.neon_distrib_dir.display() + ); + } + } fs::create_dir(&base_path)?; @@ -285,45 +484,44 @@ impl LocalEnv { fs::create_dir_all(self.pg_data_dirs_path())?; for safekeeper in &self.safekeepers { - fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?; + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; } - let mut conf_content = String::new(); - - // Currently, the user first passes a config file with 'zenith init --config=' - // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .zenith/config. 
TODO: We lose any formatting and comments along the way, which is - // a bit sad. - write!( - &mut conf_content, - r#"# This file describes a locale deployment of the page server -# and safekeeeper node. It is read by the 'zenith' command-line -# utility. -"# - )?; - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?; - - fs::write(base_path.join("config"), conf_content)?; - - Ok(()) + self.persist_config(base_path) } } fn base_path() -> PathBuf { - match std::env::var_os("ZENITH_REPO_DIR") { - Some(val) => PathBuf::from(val.to_str().unwrap()), - None => ".zenith".into(), + match std::env::var_os("NEON_REPO_DIR") { + Some(val) => PathBuf::from(val), + None => PathBuf::from(".neon"), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_conf_parsing() { + let simple_conf_toml = include_str!("../simple.conf"); + let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); + assert!( + simple_conf_parse_result.is_ok(), + "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" + ); + + let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; + let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; + let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); + assert!( + spoiled_url_toml.contains(spoiled_url_str), + "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" + ); + let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); + assert!( + spoiled_url_parse_result.is_err(), + "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}" + ); } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs new file mode 100644 index 0000000000..18d6aee68d --- /dev/null +++ b/control_plane/src/pageserver.rs @@ -0,0 +1,539 @@ +use std::collections::HashMap; +use std::fs::{self, File}; +use std::io::{BufReader, Write}; +use std::num::NonZeroU64; +use std::path::{Path, PathBuf}; +use std::process::Child; +use std::{io, result}; + +use crate::connection::PgConnectionConfig; +use anyhow::{bail, Context}; +use pageserver_api::models::{ + TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo, +}; +use reqwest::blocking::{Client, RequestBuilder, Response}; +use reqwest::{IntoUrl, Method}; +use thiserror::Error; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, + lsn::Lsn, + postgres_backend::AuthType, +}; + +use crate::{background_process, local_env::LocalEnv}; + +#[derive(Error, Debug)] +pub enum PageserverHttpError { + #[error("Reqwest error: {0}")] + Transport(#[from] reqwest::Error), + + #[error("Error: {0}")] + Response(String), +} + +impl From for PageserverHttpError { + fn from(e: anyhow::Error) -> Self { + Self::Response(e.to_string()) + } +} + +type Result = result::Result; + +pub trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> Result; +} + +impl ResponseErrorMessageExt for 
Response { + fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + // reqwest does not export its error construction utility functions, so let's craft the message ourselves + let url = self.url().to_owned(); + Err(PageserverHttpError::Response( + match self.json::() { + Ok(err_body) => format!("Error: {}", err_body.msg), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + }, + )) + } +} + +// +// Control routines for pageserver. +// +// Used in CLI and tests. +// +#[derive(Debug)] +pub struct PageServerNode { + pub pg_connection_config: PgConnectionConfig, + pub env: LocalEnv, + pub http_client: Client, + pub http_base_url: String, +} + +impl PageServerNode { + pub fn from_env(env: &LocalEnv) -> PageServerNode { + let password = if env.pageserver.auth_type == AuthType::NeonJWT { + &env.pageserver.auth_token + } else { + "" + }; + + Self { + pg_connection_config: Self::pageserver_connection_config( + password, + &env.pageserver.listen_pg_addr, + ), + env: env.clone(), + http_client: Client::new(), + http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr), + } + } + + /// Construct libpq connection string for connecting to the pageserver. + fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig { + format!("postgresql://no_user:{password}@{listen_addr}/no_db") + .parse() + .unwrap() + } + + pub fn initialize( + &self, + create_tenant: Option, + initial_timeline_id: Option, + config_overrides: &[&str], + pg_version: u32, + ) -> anyhow::Result { + let id = format!("id={}", self.env.pageserver.id); + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. + let pg_distrib_dir_param = format!( + "pg_distrib_dir='{}'", + self.env.pg_distrib_dir_raw().display() + ); + + let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); + let listen_http_addr_param = format!( + "listen_http_addr='{}'", + self.env.pageserver.listen_http_addr + ); + let listen_pg_addr_param = + format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); + let broker_endpoints_param = format!( + "broker_endpoints=[{}]", + self.env + .etcd_broker + .broker_endpoints + .iter() + .map(|url| format!("'{url}'")) + .collect::>() + .join(",") + ); + let broker_etcd_prefix_param = self + .env + .etcd_broker + .broker_etcd_prefix + .as_ref() + .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); + + let mut init_config_overrides = config_overrides.to_vec(); + init_config_overrides.push(&id); + init_config_overrides.push(&pg_distrib_dir_param); + init_config_overrides.push(&authg_type_param); + init_config_overrides.push(&listen_http_addr_param); + init_config_overrides.push(&listen_pg_addr_param); + init_config_overrides.push(&broker_endpoints_param); + + if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { + init_config_overrides.push(broker_etcd_prefix_param); + } + + if self.env.pageserver.auth_type != AuthType::Trust { + init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); + } + + let mut pageserver_process = self + .start_node(&init_config_overrides, &self.env.base_data_dir, true) + .with_context(|| { + format!( + "Failed to start a process for pageserver {}", + self.env.pageserver.id, + ) + })?; + + let init_result = self + .try_init_timeline(create_tenant, initial_timeline_id, pg_version) + .context("Failed to create initial tenant and 
timeline for pageserver"); + match &init_result { + Ok(initial_timeline_id) => { + println!("Successfully initialized timeline {initial_timeline_id}") + } + Err(e) => eprintln!("{e:#}"), + } + match pageserver_process.kill() { + Err(e) => { + eprintln!( + "Failed to stop pageserver {} process with pid {}: {e:#}", + self.env.pageserver.id, + pageserver_process.id(), + ) + } + Ok(()) => { + println!( + "Stopped pageserver {} process with pid {}", + self.env.pageserver.id, + pageserver_process.id(), + ); + // cleanup after pageserver startup, since we do not call regular `stop_process` during init + let pid_file = self.pid_file(); + if let Err(e) = fs::remove_file(&pid_file) { + if e.kind() != io::ErrorKind::NotFound { + eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}"); + } + } + } + } + init_result + } + + fn try_init_timeline( + &self, + new_tenant_id: Option, + new_timeline_id: Option, + pg_version: u32, + ) -> anyhow::Result { + let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?; + let initial_timeline_info = self.timeline_create( + initial_tenant_id, + new_timeline_id, + None, + None, + Some(pg_version), + )?; + Ok(initial_timeline_info.timeline_id) + } + + pub fn repo_path(&self) -> PathBuf { + self.env.pageserver_data_dir() + } + + /// The pid file is created by the pageserver process, with its pid stored inside. + /// Other pageservers cannot lock the same file and overwrite it for as long as the current + /// pageserver runs. (Unless someone removes the file manually; never do that!) + fn pid_file(&self) -> PathBuf { + self.repo_path().join("pageserver.pid") + } + + pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { + self.start_node(config_overrides, &self.repo_path(), false) + } + + fn start_node( + &self, + config_overrides: &[&str], + datadir: &Path, + update_config: bool, + ) -> anyhow::Result { + println!( + "Starting pageserver at '{}' in '{}'", + self.pg_connection_config.raw_address(), + datadir.display() + ); + io::stdout().flush()?; + + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!("Datadir path {datadir:?} cannot be represented as a unicode string") + })?, + ]; + + if update_config { + args.push("--update-config"); + } + + for config_override in config_overrides { + args.extend(["-c", config_override]); + } + + background_process::start_process( + "pageserver", + datadir, + &self.env.pageserver_bin(), + &args, + background_process::InitialPidFile::Expect(&self.pid_file()), + || match self.check_status() { + Ok(()) => Ok(true), + Err(PageserverHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + }, + ) + } + + /// + /// Stop the server. + /// + /// If 'immediate' is true, we use SIGQUIT, killing the process immediately. 
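+    /// That skips flushing repository data, matching the CLI's '-m immediate' stop mode.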
+ /// Otherwise we use SIGTERM, triggering a clean shutdown + /// + /// If the server is not running, returns success + /// + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, "pageserver", &self.pid_file()) + } + + pub fn page_server_psql(&self, sql: &str) -> Vec { + let mut client = self.pg_connection_config.connect_no_tls().unwrap(); + + println!("Pageserver query: '{sql}'"); + client.simple_query(sql).unwrap() + } + + pub fn page_server_psql_client(&self) -> result::Result { + self.pg_connection_config.connect_no_tls() + } + + fn http_request(&self, method: Method, url: U) -> RequestBuilder { + let mut builder = self.http_client.request(method, url); + if self.env.pageserver.auth_type == AuthType::NeonJWT { + builder = builder.bearer_auth(&self.env.pageserver.auth_token) + } + builder + } + + pub fn check_status(&self) -> Result<()> { + self.http_request(Method::GET, format!("{}/status", self.http_base_url)) + .send()? + .error_from_body()?; + Ok(()) + } + + pub fn tenant_list(&self) -> Result> { + Ok(self + .http_request(Method::GET, format!("{}/tenant", self.http_base_url)) + .send()? + .error_from_body()? + .json()?) + } + + pub fn tenant_create( + &self, + new_tenant_id: Option, + settings: HashMap<&str, &str>, + ) -> anyhow::Result { + let mut settings = settings.clone(); + let request = TenantCreateRequest { + new_tenant_id, + checkpoint_distance: settings + .remove("checkpoint_distance") + .map(|x| x.parse::()) + .transpose()?, + checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + compaction_target_size: settings + .remove("compaction_target_size") + .map(|x| x.parse::()) + .transpose()?, + compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .remove("compaction_threshold") + .map(|x| x.parse::()) + .transpose()?, + gc_horizon: settings + .remove("gc_horizon") + .map(|x| x.parse::()) + .transpose()?, + gc_period: settings.remove("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .remove("image_creation_threshold") + .map(|x| x.parse::()) + .transpose()?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .remove("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings + .remove("lagging_wal_timeout") + .map(|x| x.to_string()), + max_lsn_wal_lag: settings + .remove("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + trace_read_requests: settings + .remove("trace_read_requests") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'trace_read_requests' as bool")?, + }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } + self.http_request(Method::POST, format!("{}/tenant", self.http_base_url)) + .json(&request) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}") + })? 
+ .context("No tenant id was found in the tenant creation response") + .and_then(|tenant_id_string| { + tenant_id_string.parse().with_context(|| { + format!("Failed to parse response string as tenant id: '{tenant_id_string}'") + }) + }) + } + + pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> { + self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url)) + .json(&TenantConfigRequest { + tenant_id, + checkpoint_distance: settings + .get("checkpoint_distance") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'checkpoint_distance' as an integer")?, + checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()), + compaction_target_size: settings + .get("compaction_target_size") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_target_size' as an integer")?, + compaction_period: settings.get("compaction_period").map(|x| x.to_string()), + compaction_threshold: settings + .get("compaction_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_threshold' as an integer")?, + gc_horizon: settings + .get("gc_horizon") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_horizon' as an integer")?, + gc_period: settings.get("gc_period").map(|x| x.to_string()), + image_creation_threshold: settings + .get("image_creation_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, + trace_read_requests: settings + .get("trace_read_requests") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'trace_read_requests' as bool")?, + }) + .send()? + .error_from_body()?; + + Ok(()) + } + + pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { + let timeline_infos: Vec = self + .http_request( + Method::GET, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .send()? + .error_from_body()? + .json()?; + + Ok(timeline_infos) + } + + pub fn timeline_create( + &self, + tenant_id: TenantId, + new_timeline_id: Option, + ancestor_start_lsn: Option, + ancestor_timeline_id: Option, + pg_version: Option, + ) -> anyhow::Result { + self.http_request( + Method::POST, + format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), + ) + .json(&TimelineCreateRequest { + new_timeline_id, + ancestor_start_lsn, + ancestor_timeline_id, + pg_version, + }) + .send()? + .error_from_body()? + .json::>() + .with_context(|| { + format!("Failed to parse timeline creation response for tenant id: {tenant_id}") + })? + .with_context(|| { + format!( + "No timeline id was found in the timeline creation response for tenant {tenant_id}" + ) + }) + } + + /// Import a basebackup prepared using either: + /// a) `pg_basebackup -F tar`, or + /// b) The `fullbackup` pageserver endpoint + /// + /// # Arguments + /// * `tenant_id` - tenant to import into. 
Created if not exists + /// * `timeline_id` - id to assign to imported timeline + /// * `base` - (start lsn of basebackup, path to `base.tar` file) + /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) + pub fn timeline_import( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base: (Lsn, PathBuf), + pg_wal: Option<(Lsn, PathBuf)>, + pg_version: u32, + ) -> anyhow::Result<()> { + let mut client = self.pg_connection_config.connect_no_tls().unwrap(); + + // Init base reader + let (start_lsn, base_tarfile_path) = base; + let base_tarfile = File::open(base_tarfile_path)?; + let mut base_reader = BufReader::new(base_tarfile); + + // Init wal reader if necessary + let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { + let wal_tarfile = File::open(wal_tarfile_path)?; + let wal_reader = BufReader::new(wal_tarfile); + (end_lsn, Some(wal_reader)) + } else { + (start_lsn, None) + }; + + // Import base + let import_cmd = format!( + "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" + ); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut base_reader, &mut writer)?; + writer.finish()?; + + // Import wal if necessary + if let Some(mut wal_reader) = wal_reader { + let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut wal_reader, &mut writer)?; + writer.finish()?; + } + + Ok(()) + } +} diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs index 83765b2c95..34dc769e78 100644 --- a/control_plane/src/postgresql_conf.rs +++ b/control_plane/src/postgresql_conf.rs @@ -2,10 +2,10 @@ /// Module for parsing postgresql.conf file. /// /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just -/// enough to extract a few settings we need in Zenith, assuming you don't do +/// enough to extract a few settings we need in Neon, assuming you don't do /// funny stuff like include-directives or funny escaping. use anyhow::{bail, Context, Result}; -use lazy_static::lazy_static; +use once_cell::sync::Lazy; use regex::Regex; use std::collections::HashMap; use std::fmt; @@ -19,9 +19,7 @@ pub struct PostgresConf { hash: HashMap, } -lazy_static! { - static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap(); -} +static CONF_LINE_RE: Lazy = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap()); impl PostgresConf { pub fn new() -> PostgresConf { @@ -139,10 +137,10 @@ fn escape_str(s: &str) -> String { // // This regex is a bit more conservative than the rules in guc-file.l, so we quote some // strings that PostgreSQL would accept without quoting, but that's OK. - lazy_static! 
{ - static ref UNQUOTED_RE: Regex = - Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap(); - } + + static UNQUOTED_RE: Lazy = + Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap()); + if UNQUOTED_RE.is_match(s) { s.to_string() } else { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f5478b5922..0bc35b3680 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,25 +1,21 @@ use std::io::Write; -use std::net::TcpStream; use std::path::PathBuf; -use std::process::Command; +use std::process::Child; use std::sync::Arc; -use std::time::Duration; -use std::{io, result, thread}; +use std::{io, result}; -use anyhow::bail; -use nix::errno::Errno; -use nix::sys::signal::{kill, Signal}; -use nix::unistd::Pid; -use postgres::Config; +use anyhow::Context; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; -use zenith_utils::http::error::HttpErrorBody; +use utils::{http::error::HttpErrorBody, id::NodeId}; -use crate::local_env::{LocalEnv, SafekeeperConf}; -use crate::storage::PageServerNode; -use crate::{fill_rust_env_vars, read_pidfile}; -use zenith_utils::connstring::connection_address; +use crate::connection::PgConnectionConfig; +use crate::pageserver::PageServerNode; +use crate::{ + background_process, + local_env::{LocalEnv, SafekeeperConf}, +}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -43,7 +39,7 @@ impl ResponseErrorMessageExt for Response { return Ok(self); } - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves + // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = self.url().to_owned(); Err(SafekeeperHttpError::Response( match self.json::() { @@ -61,11 +57,11 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub name: String, + pub id: NodeId, pub conf: SafekeeperConf, - pub pg_connection_config: Config, + pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: Client, pub http_base_url: String, @@ -77,10 +73,8 @@ impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for {} for {}", conf.name, conf.http_port); - SafekeeperNode { - name: conf.name.clone(), + id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), @@ -91,85 +85,97 @@ impl SafekeeperNode { } /// Construct libpq connection string for connecting to this safekeeper. 
- fn safekeeper_connection_config(port: u16) -> Config { + fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { // TODO safekeeper authentication not implemented yet - format!("postgresql://no_user@127.0.0.1:{}/no_db", port) + format!("postgresql://no_user@127.0.0.1:{port}/no_db") .parse() .unwrap() } + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { + env.safekeeper_data_dir(&format!("sk{sk_id}")) + } + pub fn datadir_path(&self) -> PathBuf { - self.env.safekeeper_data_dir(&self.name) + SafekeeperNode::datadir_path_by_id(&self.env, self.id) } pub fn pid_file(&self) -> PathBuf { self.datadir_path().join("safekeeper.pid") } - pub fn start(&self) -> anyhow::Result<()> { + pub fn start(&self) -> anyhow::Result { print!( "Starting safekeeper at '{}' in '{}'", - connection_address(&self.pg_connection_config), + self.pg_connection_config.raw_address(), self.datadir_path().display() ); io::stdout().flush().unwrap(); let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let id = self.id; + let datadir = self.datadir_path(); - let mut cmd = Command::new(self.env.safekeeper_bin()?); - fill_rust_env_vars( - cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) - .args(&["--listen-pg", &listen_pg]) - .args(&["--listen-http", &listen_http]) - .args(&["--recall", "1 second"]) - .arg("--daemonize"), - ); + let id_string = id.to_string(); + let mut args = vec![ + "-D", + datadir.to_str().with_context(|| { + format!("Datadir path {datadir:?} cannot be represented as a unicode string") + })?, + "--id", + &id_string, + "--listen-pg", + &listen_pg, + "--listen-http", + &listen_http, + ]; if !self.conf.sync { - cmd.arg("--no-sync"); + args.push("--no-sync"); } - if !cmd.status()?.success() { - bail!( - "Safekeeper failed to start. See '{}' for details.", - self.datadir_path().join("safekeeper.log").display() - ); + let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); + if !comma_separated_endpoints.is_empty() { + args.extend(["--broker-endpoints", &comma_separated_endpoints]); + } + if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { + args.extend(["--broker-etcd-prefix", prefix]); } - // It takes a while for the safekeeper to start up. Wait until it is - // open for business. 
- const RETRIES: i8 = 15; - for retries in 1..RETRIES { - match self.check_status() { - Ok(_) => { - println!("\nSafekeeper started"); - return Ok(()); - } - Err(err) => { - match err { - SafekeeperHttpError::Transport(err) => { - if err.is_connect() && retries < 5 { - print!("."); - io::stdout().flush().unwrap(); - } else { - if retries == 5 { - println!() // put a line break after dots for second message - } - println!( - "Safekeeper not responding yet, err {} retrying ({})...", - err, retries - ); - } - } - SafekeeperHttpError::Response(msg) => { - bail!("safekeeper failed to start: {} ", msg) - } - } - thread::sleep(Duration::from_secs(1)); - } - } + let mut backup_threads = String::new(); + if let Some(threads) = self.conf.backup_threads { + backup_threads = threads.to_string(); + args.extend(["--backup-threads", &backup_threads]); + } else { + drop(backup_threads); } - bail!("safekeeper failed to start in {} seconds", RETRIES); + + if let Some(ref remote_storage) = self.conf.remote_storage { + args.extend(["--remote-storage", remote_storage]); + } + + let key_path = self.env.base_data_dir.join("auth_public_key.pem"); + if self.conf.auth_enabled { + args.extend([ + "--auth-validation-public-key-path", + key_path.to_str().with_context(|| { + format!("Key path {key_path:?} cannot be represented as a unicode string") + })?, + ]); + } + + background_process::start_process( + &format!("safekeeper {id}"), + &datadir, + &self.env.safekeeper_bin(), + &args, + background_process::InitialPidFile::Expect(&self.pid_file()), + || match self.check_status() { + Ok(()) => Ok(true), + Err(SafekeeperHttpError::Transport(_)) => Ok(false), + Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")), + }, + ) } /// @@ -181,69 +187,16 @@ impl SafekeeperNode { /// If the server is not running, returns success /// pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - let pid_file = self.pid_file(); - if !pid_file.exists() { - println!("Safekeeper {} is already stopped", self.name); - return Ok(()); - } - let pid = read_pidfile(&pid_file)?; - let pid = Pid::from_raw(pid); - - let sig = if immediate { - println!("Stop safekeeper immediately"); - Signal::SIGQUIT - } else { - println!("Stop safekeeper gracefully"); - Signal::SIGTERM - }; - match kill(pid, sig) { - Ok(_) => (), - Err(Errno::ESRCH) => { - println!( - "Safekeeper with pid {} does not exist, but a PID file was found", - pid - ); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to safekeeper with pid {}: {}", - pid, - err.desc() - ), - } - - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if safekeeper flushes a lot of data - for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Safekeeper stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Safekeeper status is OK. 
Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Safekeeper status is: {}", err); - return Ok(()); - } - } - } else { - println!("Safekeeper still receives connections"); - thread::sleep(Duration::from_secs(1)); - } - } - - bail!("Failed to stop safekeeper with pid {}", pid); + background_process::stop_process( + immediate, + &format!("safekeeper {}", self.id), + &self.pid_file(), + ) } fn http_request(&self, method: Method, url: U) -> RequestBuilder { // TODO: authentication - //if self.env.auth_type == AuthType::ZenithJWT { + //if self.env.auth_type == AuthType::NeonJWT { // builder = builder.bearer_auth(&self.env.safekeeper_auth_token) //} self.http_client.request(method, url) diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs deleted file mode 100644 index be594889ab..0000000000 --- a/control_plane/src/storage.rs +++ /dev/null @@ -1,378 +0,0 @@ -use std::io::Write; -use std::net::TcpStream; -use std::path::PathBuf; -use std::process::Command; -use std::time::Duration; -use std::{io, result, thread}; - -use anyhow::bail; -use nix::errno::Errno; -use nix::sys::signal::{kill, Signal}; -use nix::unistd::Pid; -use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest}; -use postgres::{Config, NoTls}; -use reqwest::blocking::{Client, RequestBuilder, Response}; -use reqwest::{IntoUrl, Method}; -use thiserror::Error; -use zenith_utils::http::error::HttpErrorBody; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantId; - -use crate::local_env::LocalEnv; -use crate::{fill_rust_env_vars, read_pidfile}; -use pageserver::branches::BranchInfo; -use pageserver::tenant_mgr::TenantInfo; -use zenith_utils::connstring::connection_address; - -#[derive(Error, Debug)] -pub enum PageserverHttpError { - #[error("Reqwest error: {0}")] - Transport(#[from] reqwest::Error), - - #[error("Error: {0}")] - Response(String), -} - -type Result = result::Result; - -pub trait ResponseErrorMessageExt: Sized { - fn error_from_body(self) -> Result; -} - -impl ResponseErrorMessageExt for Response { - fn error_from_body(self) -> Result { - let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { - return Ok(self); - } - - // reqwest do not export it's error construction utility functions, so lets craft the message ourselves - let url = self.url().to_owned(); - Err(PageserverHttpError::Response( - match self.json::() { - Ok(err_body) => format!("Error: {}", err_body.msg), - Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), - }, - )) - } -} - -// -// Control routines for pageserver. -// -// Used in CLI and tests. -// -#[derive(Debug)] -pub struct PageServerNode { - pub pg_connection_config: Config, - pub env: LocalEnv, - pub http_client: Client, - pub http_base_url: String, -} - -impl PageServerNode { - pub fn from_env(env: &LocalEnv) -> PageServerNode { - let password = if env.pageserver.auth_type == AuthType::ZenithJWT { - &env.pageserver.auth_token - } else { - "" - }; - - Self { - pg_connection_config: Self::pageserver_connection_config( - password, - &env.pageserver.listen_pg_addr, - ), - env: env.clone(), - http_client: Client::new(), - http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr), - } - } - - /// Construct libpq connection string for connecting to the pageserver. 
- fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config { - format!("postgresql://no_user:{}@{}/no_db", password, listen_addr) - .parse() - .unwrap() - } - - pub fn init( - &self, - create_tenant: Option<&str>, - config_overrides: &[&str], - ) -> anyhow::Result<()> { - let mut cmd = Command::new(self.env.pageserver_bin()?); - - // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. - let base_data_dir_param = self.env.base_data_dir.display().to_string(); - let pg_distrib_dir_param = - format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display()); - let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type); - let listen_http_addr_param = format!( - "listen_http_addr='{}'", - self.env.pageserver.listen_http_addr - ); - let listen_pg_addr_param = - format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); - let mut args = Vec::with_capacity(20); - - args.push("--init"); - args.extend(["-D", &base_data_dir_param]); - args.extend(["-c", &pg_distrib_dir_param]); - args.extend(["-c", &authg_type_param]); - args.extend(["-c", &listen_http_addr_param]); - args.extend(["-c", &listen_pg_addr_param]); - - for config_override in config_overrides { - args.extend(["-c", config_override]); - } - - if self.env.pageserver.auth_type != AuthType::Trust { - args.extend([ - "-c", - "auth_validation_public_key_path='auth_public_key.pem'", - ]); - } - - if let Some(tenantid) = create_tenant { - args.extend(["--create-tenant", tenantid]) - } - - let status = fill_rust_env_vars(cmd.args(args)) - .status() - .expect("pageserver init failed"); - - if !status.success() { - bail!("pageserver init failed"); - } - - Ok(()) - } - - pub fn repo_path(&self) -> PathBuf { - self.env.pageserver_data_dir() - } - - pub fn pid_file(&self) -> PathBuf { - self.repo_path().join("pageserver.pid") - } - - pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - print!( - "Starting pageserver at '{}' in '{}'", - connection_address(&self.pg_connection_config), - self.repo_path().display() - ); - io::stdout().flush().unwrap(); - - let mut cmd = Command::new(self.env.pageserver_bin()?); - - let repo_path = self.repo_path(); - let mut args = vec!["-D", repo_path.to_str().unwrap()]; - - for config_override in config_overrides { - args.extend(["-c", config_override]); - } - - fill_rust_env_vars(cmd.args(&args).arg("--daemonize")); - - if !cmd.status()?.success() { - bail!( - "Pageserver failed to start. See '{}' for details.", - self.repo_path().join("pageserver.log").display() - ); - } - - // It takes a while for the page server to start up. Wait until it is - // open for business. - const RETRIES: i8 = 15; - for retries in 1..RETRIES { - match self.check_status() { - Ok(_) => { - println!("\nPageserver started"); - return Ok(()); - } - Err(err) => { - match err { - PageserverHttpError::Transport(err) => { - if err.is_connect() && retries < 5 { - print!("."); - io::stdout().flush().unwrap(); - } else { - if retries == 5 { - println!() // put a line break after dots for second message - } - println!( - "Pageserver not responding yet, err {} retrying ({})...", - err, retries - ); - } - } - PageserverHttpError::Response(msg) => { - bail!("pageserver failed to start: {} ", msg) - } - } - thread::sleep(Duration::from_secs(1)); - } - } - } - bail!("pageserver failed to start in {} seconds", RETRIES); - } - - /// - /// Stop the server. - /// - /// If 'immediate' is true, we use SIGQUIT, killing the process immediately. 
- /// Otherwise we use SIGTERM, triggering a clean shutdown - /// - /// If the server is not running, returns success - /// - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - let pid_file = self.pid_file(); - if !pid_file.exists() { - println!("Pageserver is already stopped"); - return Ok(()); - } - let pid = Pid::from_raw(read_pidfile(&pid_file)?); - - let sig = if immediate { - println!("Stop pageserver immediately"); - Signal::SIGQUIT - } else { - println!("Stop pageserver gracefully"); - Signal::SIGTERM - }; - match kill(pid, sig) { - Ok(_) => (), - Err(Errno::ESRCH) => { - println!( - "Pageserver with pid {} does not exist, but a PID file was found", - pid - ); - return Ok(()); - } - Err(err) => bail!( - "Failed to send signal to pageserver with pid {}: {}", - pid, - err.desc() - ), - } - - let address = connection_address(&self.pg_connection_config); - - // TODO Remove this "timeout" and handle it on caller side instead. - // Shutting down may take a long time, - // if pageserver checkpoints a lot of data - for _ in 0..100 { - if let Err(_e) = TcpStream::connect(&address) { - println!("Pageserver stopped receiving connections"); - - //Now check status - match self.check_status() { - Ok(_) => { - println!("Pageserver status is OK. Wait a bit."); - thread::sleep(Duration::from_secs(1)); - } - Err(err) => { - println!("Pageserver status is: {}", err); - return Ok(()); - } - } - } else { - println!("Pageserver still receives connections"); - thread::sleep(Duration::from_secs(1)); - } - } - - bail!("Failed to stop pageserver with pid {}", pid); - } - - pub fn page_server_psql(&self, sql: &str) -> Vec { - let mut client = self.pg_connection_config.connect(NoTls).unwrap(); - - println!("Pageserver query: '{}'", sql); - client.simple_query(sql).unwrap() - } - - pub fn page_server_psql_client(&self) -> result::Result { - self.pg_connection_config.connect(NoTls) - } - - fn http_request(&self, method: Method, url: U) -> RequestBuilder { - let mut builder = self.http_client.request(method, url); - if self.env.pageserver.auth_type == AuthType::ZenithJWT { - builder = builder.bearer_auth(&self.env.pageserver.auth_token) - } - builder - } - - pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) - .send()? - .error_from_body()?; - Ok(()) - } - - pub fn tenant_list(&self) -> Result> { - Ok(self - .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant")) - .send()? - .error_from_body()? - .json()?) - } - - pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> { - Ok(self - .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant")) - .json(&TenantCreateRequest { - tenant_id: tenantid, - }) - .send()? - .error_from_body()? - .json()?) - } - - pub fn branch_list(&self, tenantid: &ZTenantId) -> Result> { - Ok(self - .http_request( - Method::GET, - format!("{}/branch/{}", self.http_base_url, tenantid), - ) - .send()? - .error_from_body()? - .json()?) - } - - pub fn branch_create( - &self, - branch_name: &str, - startpoint: &str, - tenantid: &ZTenantId, - ) -> Result { - Ok(self - .http_request(Method::POST, format!("{}/branch", self.http_base_url)) - .json(&BranchCreateRequest { - tenant_id: tenantid.to_owned(), - name: branch_name.to_owned(), - start_point: startpoint.to_owned(), - }) - .send()? - .error_from_body()? - .json()?) 
- } - - pub fn branch_get_by_name( - &self, - tenantid: &ZTenantId, - branch_name: &str, - ) -> Result { - Ok(self - .http_request( - Method::GET, - format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name), - ) - .send()? - .error_for_status()? - .json()?) - } -} diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile new file mode 100644 index 0000000000..f1b1986072 --- /dev/null +++ b/docker-compose/compute_wrapper/Dockerfile @@ -0,0 +1,13 @@ +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG COMPUTE_IMAGE=compute-node-v14 +ARG TAG=latest + +FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG + +USER root +RUN apt-get update && \ + apt-get install -y curl \ + jq \ + netcat + +USER postgres diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh new file mode 100755 index 0000000000..cef2b485f3 --- /dev/null +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eux + +PG_VERSION=${PG_VERSION:-14} + +SPEC_FILE_ORG=/var/db/postgres/specs/spec.json +SPEC_FILE=/tmp/spec.json + +echo "Waiting pageserver become ready." +while ! nc -z pageserver 6400; do + sleep 1; +done +echo "Page server is ready." + +echo "Create a tenant and timeline" +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{}" + http://pageserver:9898/v1/tenant/ +) +tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') + +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" +) +result=$(curl "${PARAMS[@]}") +echo $result | jq . + +echo "Overwrite tenant id and timeline id in spec file" +tenant_id=$(echo ${result} | jq -r .tenant_id) +timeline_id=$(echo ${result} | jq -r .timeline_id) + +sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} + +cat ${SPEC_FILE} + +echo "Start compute node" +/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ + -C "postgresql://cloud_admin@localhost:55433/postgres" \ + -b /usr/local/bin/postgres \ + -S ${SPEC_FILE} diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json new file mode 100644 index 0000000000..10ae0b0ecf --- /dev/null +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -0,0 +1,141 @@ +{ + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": 
"0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] +} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml new file mode 100644 index 0000000000..61b53dba41 --- /dev/null +++ b/docker-compose/docker-compose.yml @@ -0,0 +1,209 @@ +version: '3' + +services: + etcd: + restart: always + image: quay.io/coreos/etcd:v3.5.4 + ports: + - 2379:2379 + - 2380:2380 + environment: + # This signifficantly speeds up etcd and we anyway don't data persistency there. + ETCD_UNSAFE_NO_FSYNC: "1" + command: + - "etcd" + - "--auto-compaction-mode=revision" + - "--auto-compaction-retention=1" + - "--name=etcd-cluster" + - "--initial-cluster-state=new" + - "--initial-cluster-token=etcd-cluster-1" + - "--initial-cluster=etcd-cluster=http://etcd:2380" + - "--initial-advertise-peer-urls=http://etcd:2380" + - "--advertise-client-urls=http://etcd:2379" + - "--listen-client-urls=http://0.0.0.0:2379" + - "--listen-peer-urls=http://0.0.0.0:2380" + - "--quota-backend-bytes=134217728" # 128 MB + + minio: + restart: always + image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + command: server /data --address :9000 --console-address ":9001" + + minio_create_buckets: + image: minio/mc + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + entrypoint: + - "/bin/sh" + - "-c" + command: + - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do + echo 'Waiting to start minio...' 
&& sleep 1; + done; + /usr/bin/mc mb minio/neon --region=eu-north-1; + exit 0;" + depends_on: + - minio + + pageserver: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + environment: + - BROKER_ENDPOINT='http://etcd:2379' + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 6400:6400 # pg protocol handler + - 9898:9898 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "/usr/local/bin/pageserver -D /data/.neon/ + -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"listen_pg_addr='0.0.0.0:6400'\" + -c \"listen_http_addr='0.0.0.0:9898'\" + -c \"remote_storage={endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/pageserver/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper1: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 + - SAFEKEEPER_ID=1 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7676:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper2: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 + - SAFEKEEPER_ID=2 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7677:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper3: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 + - SAFEKEEPER_ID=3 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7678:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + compute: + restart: always + build: + context: ./compute_wrapper/ + args: + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - TAG=${TAG:-latest} + - http_proxy=$http_proxy + - https_proxy=$https_proxy + environment: + - PG_VERSION=${PG_VERSION:-14} + #- RUST_BACKTRACE=1 + # Mount the test files directly, for faster editing cycle. 
+    volumes:
+      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/shell/:/shell/
+    ports:
+      - 55433:55433 # pg protocol handler
+      - 3080:3080 # http endpoints
+    entrypoint:
+      - "/shell/compute.sh"
+    depends_on:
+      - safekeeper1
+      - safekeeper2
+      - safekeeper3
+      - pageserver
+
+  compute_is_ready:
+    image: postgres:latest
+    entrypoint:
+      - "/bin/bash"
+      - "-c"
+    command:
+      - "until pg_isready -h compute -p 55433 ; do
+           echo 'Waiting to start compute...' && sleep 1;
+         done"
+    depends_on:
+      - compute
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
new file mode 100755
index 0000000000..9de5277bf1
--- /dev/null
+++ b/docker-compose/docker_compose_test.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+# A basic test to ensure Docker images are built correctly.
+# Builds a wrapper around the compute, starts all services, and runs a simple SQL query.
+# Repeats the process for all currently supported Postgres versions.
+
+# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file.
+# Their defaults point at the DockerHub `neondatabase/neon:latest` image;
+# set them to verify custom image builds (e.g. pre-published ones).
+
+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
+set -eux -o pipefail
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
+
+COMPUTE_CONTAINER_NAME=docker-compose-compute-1
+SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+
+cleanup() {
+    echo "show container information"
+    docker ps
+    docker compose -f $COMPOSE_FILE logs
+    echo "stop containers..."
+    docker compose -f $COMPOSE_FILE down
+}
+
+echo "clean up containers if they exist"
+cleanup
+
+for pg_version in 14 15; do
+    echo "start containers (pg_version=$pg_version)."
+    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
+
+    echo "wait until the compute is ready. timeout after 60s. "
+    cnt=0
+    while sleep 1; do
+        # check timeout
+        cnt=`expr $cnt + 1`
+        if [ $cnt -gt 60 ]; then
+            echo "timeout before the compute is ready."
+            cleanup
+            exit 1
+        fi
+
+        # check if the compute is ready
+        set +o pipefail
+        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
+        set -o pipefail
+        if [ $result -eq 1 ]; then
+            echo "OK. The compute is ready to connect."
+            echo "execute simple queries."
+            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            cleanup
+            break
+        fi
+    done
+done
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
deleted file mode 100755
index 45c41b4c19..0000000000
--- a/docker-entrypoint.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-set -eux
-
-if [ "$1" = 'pageserver' ]; then
-    if [ !
-d "/data/tenants" ]; then - echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" - fi - echo "Staring pageserver at 0.0.0.0:6400" - pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data -else - "$@" -fi diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000..7585238efe --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +book diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 0558fa24a8..0000000000 --- a/docs/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Zenith documentation - -## Table of contents - -- [authentication.md](authentication.md) — pageserver JWT authentication. -- [docker.md](docker.md) — Docker images and building pipeline. -- [glossary.md](glossary.md) — Glossary of all the terms used in codebase. -- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. -- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. -- [pageserver/README](/pageserver/README) — pageserver overview. -- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview. -- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [walkeeper/README](/walkeeper/README) — WAL service overview. -- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md new file mode 100644 index 0000000000..faf2b2336f --- /dev/null +++ b/docs/SUMMARY.md @@ -0,0 +1,85 @@ +# Summary + +[Introduction]() +- [Separation of Compute and Storage](./separation-compute-storage.md) + +# Architecture + +- [Compute]() + - [WAL proposer]() + - [WAL Backpressure]() + - [Postgres changes](./core_changes.md) + +- [Pageserver](./pageserver.md) + - [Services](./pageserver-services.md) + - [Thread management](./pageserver-thread-mgmt.md) + - [WAL Redo](./pageserver-walredo.md) + - [Page cache](./pageserver-pagecache.md) + - [Storage](./pageserver-storage.md) + - [Datadir mapping]() + - [Layer files]() + - [Branching]() + - [Garbage collection]() + - [Cloud Storage]() + - [Processing a GetPage request](./pageserver-processing-getpage.md) + - [Processing WAL](./pageserver-processing-wal.md) + - [Management API]() + - [Tenant Rebalancing]() + +- [WAL Service](walservice.md) + - [Consensus protocol](safekeeper-protocol.md) + - [Management API]() + - [Rebalancing]() + +- [Control Plane]() + +- [Proxy]() + +- [Source view](./sourcetree.md) + - [docker.md](./docker.md) — Docker images and building pipeline. + - [Error handling and logging]() + - [Testing]() + - [Unit testing]() + - [Integration testing]() + - [Benchmarks]() + + +- [Glossary](./glossary.md) + +# Uncategorized + +- [authentication.md](./authentication.md) +- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. 
+- [settings.md](./settings.md) +#FIXME: move these under sourcetree.md +#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) +#- [test_runner/README.md](/test_runner/README.md) + + +# RFCs + +- [RFCs](./rfcs/README.md) + +- [002-storage](rfcs/002-storage.md) +- [003-laptop-cli](rfcs/003-laptop-cli.md) +- [004-durability](rfcs/004-durability.md) +- [005-zenith_local](rfcs/005-zenith_local.md) +- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) +- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) +- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) +- [008-push-pull](rfcs/008-push-pull.md) +- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) +- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) +- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) +- [010-storage_details](rfcs/010-storage_details.md) +- [011-retention-policy](rfcs/011-retention-policy.md) +- [012-background-tasks](rfcs/012-background-tasks.md) +- [013-term-history](rfcs/013-term-history.md) +- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) +- [014-storage-lsm](rfcs/014-storage-lsm.md) +- [015-storage-messaging](rfcs/015-storage-messaging.md) +- [016-connection-routing](rfcs/016-connection-routing.md) +- [017-timeline-data-management](rfcs/017-timeline-data-management.md) +- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md) +- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md) +- [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/authentication.md b/docs/authentication.md index de408624ae..9748a7ab0d 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,14 +2,14 @@ ### Overview -Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `zenith init`. Using following openssl commands: +Current state of authentication includes usage of JWT tokens in communication between compute and pageserver and between CLI and pageserver. JWT token is signed using RSA keys. CLI generates a key pair during call to `neon_local init`. Using following openssl commands: ```bash openssl genrsa -out private_key.pem 2048 openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem ``` -CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `ZenithJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. +CLI also generates signed token and saves it in the config for later access to pageserver. Now authentication is optional. Pageserver has two variables in config: `auth_validation_public_key_path` and `auth_type`, so when auth type present and set to `NeonJWT` pageserver will require authentication for connections. Actual JWT is passed in password field of connection string. 
There is a caveat for psql, it silently truncates passwords to 100 symbols, so to correctly pass JWT via psql you have to either use PGPASSWORD environment variable, or store password in psql config file. Currently there is no authentication between compute and safekeepers, because this communication layer is under heavy refactoring. After this refactoring support for authentication will be added there too. Now safekeeper supports "hardcoded" token passed via environment variable to be able to use callmemaybe command in pageserver. @@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256") ``` -Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs +Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs diff --git a/docs/book.toml b/docs/book.toml new file mode 100644 index 0000000000..f83ac2a6aa --- /dev/null +++ b/docs/book.toml @@ -0,0 +1,5 @@ +[book] +language = "en" +multilingual = false +src = "." +title = "Neon architecture" diff --git a/docs/core_changes.md b/docs/core_changes.md index db311e3667..ea219adae9 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -1,202 +1,494 @@ -1. Add t_cid to XLOG record -- Why? - The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax. +# Postgres core changes - To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore. +This lists all the changes that have been made to the PostgreSQL +source tree, as a somewhat logical set of patches. The long-term goal +is to eliminate all these changes, by submitting patches to upstream +and refactoring code into extensions, so that you can run unmodified +PostgreSQL against Neon storage. -- Alternatives? - I don't know +In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the +page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in +the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for +the WAL redo process. -2. Add PD_WAL_LOGGED. -- Why? - Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function. +In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the +smgr interface. Once all the core changes have been submitted to upstream or eliminated some other +way, the extension could live outside the postgres repository and build against vanilla PostgreSQL. - There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY,VACUUM and when record hint bits are set. 
+Below is a list of all the PostgreSQL source code changes, categorized into changes needed for +compute, and changes needed for the WAL redo process: -- Discussion: - https://discord.com/channels/869525774699462656/882681420986851359 +# Changes for Compute node -- Alternatives: - Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead. +## Add t_cid to heap WAL records + +``` + src/backend/access/heap/heapam.c | 26 +- + src/include/access/heapam_xlog.h | 6 +- +``` + +We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL! + +### Problem we're trying to solve + +The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running. + +### How to get rid of the patch + +Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. -3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo. -- Why? - XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres. +### Alternatives +Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. -- Alternatives? - No +## ginfast.c + +``` +diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c +index e0d9940946..2d964c02e9 100644 +--- a/src/backend/access/gin/ginfast.c ++++ b/src/backend/access/gin/ginfast.c +@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) + memset(&sublist, 0, sizeof(GinMetaPageData)); + makeSublist(index, collector->tuples, collector->ntuples, &sublist); + ++ if (metadata->head != InvalidBlockNumber) ++ { ++ /* ++ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call ++ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from ++ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() ++ * will try to WAL-log an image of the page. 
++ */ ++ buffer = ReadBuffer(index, metadata->tail); ++ } ++ + if (needWal) + XLogBeginInsert(); + +@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) + data.prevTail = metadata->tail; + data.newRightlink = sublist.head; + +- buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); +``` + +The problem is explained in the comment above + +### How to get rid of the patch + +Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical +section or something. + +Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images? -4. Eliminate reporting of some warnings related with hint bits, for example -"page is not marked all-visible but visibility map bit is set in relation". -- Why? - Hint bit may be not WAL logged. +## Mark index builds that use buffer manager without logging explicitly -- Alternative? - Always wal log any page changes. +``` + src/backend/access/gin/gininsert.c | 7 + + src/backend/access/gist/gistbuild.c | 15 +- + src/backend/access/spgist/spginsert.c | 8 +- + +also some changes in src/backend/storage/smgr/smgr.c +``` + +When a GIN index is built, for example, it is built by inserting the entries into the index more or +less normally, but without WAL-logging anything. After the index has been built, we iterate through +all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged +and is evicted from the buffer cache, it is lost. We have an check to catch that in the Neon +extension. To fix that, we've added a few functions to track explicitly when we're performing such +an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and +`smgr_end_unlogged_build`. -5. Maintain last written LSN. -- Why? - When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN - of WAL record performing last update of this pages. But we do not know it, because we do not have page. - We can use current WAL flush position, but in this case there is high probability that page server - will be blocked until this peace of WAL is delivered. - As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages, - but SMGR API doesn't provide such knowledge. +### How to get rid of the patch -- Alternatives? - Maintain map of LSNs of evicted pages. +I think it would make sense to be more explicit about that in PostgreSQL too. So extract these +changes to a patch and post to pgsql-hackers. -6. Launching Postgres without WAL. -- Why? - According to Zenith architecture compute node is stateless. So when we are launching - compute node, we need to provide some dummy PG_DATADIR. Relation pages - can be requested on demand from page server. But Postgres still need some non-relational data: - control and configuration files, SLRUs,... - It is currently implemented using basebackup (do not mix with pg_basebackup) which is created - by pageserver. It includes in this tarball config/control files, SLRUs and required directories. - As far as pageserver do not have original (non-scattered) WAL segments, it includes in - this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment, - which redo field points to the end of wal. 
It allows to load checkpoint record in more or less - standard way with minimal changes of Postgres, but then some special handling is needed, - including restoring previous record position from zenith.signal file. - Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo) - to pass checks performed by XLogReader. +## Track last-written page LSN -- Alternatives? - We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record - in special way. But it may only increase number of changes in xlog.c +``` + src/backend/commands/dbcommands.c | 17 +- -7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended -- Why? - We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver. - So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE - which means that recovery for them is not needed. +Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too +``` -- Alternatives? - No +Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same +LSN in the GetPage@LSN request when reading the page back from the page server. The value is +conservative: it would be correct to always use the last-inserted LSN, but it would be slow because +then the page server would need to wait for the recent WAL to be streamed and processed, before +responding to any GetPage@LSN request. -8. Enforce WAL logging of sequence updates. -- Why? - Due to performance reasons Postgres don't want to log each fetching of a value from a sequence, - so we pre-log a few fetches in advance. In the event of crash we can lose - (skip over) as many values as we pre-logged. - But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache - and we will get a gap in sequence values even without crash. +The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes, +but there are a few exceptions where we've had to add explicit calls to the Neon-specific +SetLastWrittenPageLSN() function. -- Alternatives: - Do not try to preserve sequential order but avoid performance penalty. +There's an open PR to track the LSN in a more-fine grained fashion: +https://github.com/neondatabase/postgres/pull/177 + +PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of +relying copying files and checkpoint. With that method, we probably won't need any special handling. +The old method is still available, though. + +### How to get rid of the patch + +Wait until v15? -9. Treat unlogged tables as normal (permanent) tables. -- Why? - Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables). - But as far as compute node is stateless, we need to persist their data to storage node. - And it can only be done through the WAL. +## Cache relation sizes -- Alternatives? - * Store unlogged tables locally (violates requirement of stateless compute nodes). - * Prohibit unlogged tables at all. +The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going +to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the +relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for +Neon) -10. Support start Postgres in wal-redo mode -- Why? - To be able to apply WAL record and reconstruct pages at page server. 
+## Use buffer manager when extending VM or FSM -- Alternatives? - * Rewrite redo handlers in Rust - * Do not reconstruct pages at page server at all and do it at compute node. +``` + src/backend/storage/freespace/freespace.c | 14 +- + src/backend/access/heap/visibilitymap.c | 15 +- + +diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c +index e198df65d8..addfe93eac 100644 +--- a/src/backend/access/heap/visibilitymap.c ++++ b/src/backend/access/heap/visibilitymap.c +@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) + /* Now extend the file */ + while (vm_nblocks_now < vm_nblocks) + { +- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); ++ /* ++ * ZENITH: Initialize VM pages through buffer cache to prevent loading ++ * them from pageserver. ++ */ ++ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, ++ RBM_ZERO_AND_LOCK, NULL); ++ Page page = BufferGetPage(buffer); ++ ++ PageInit((Page) page, BLCKSZ, 0); ++ PageSetChecksumInplace(page, vm_nblocks_now); ++ MarkBufferDirty(buffer); ++ UnlockReleaseBuffer(buffer); + +- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, +- pg.data, false); + vm_nblocks_now++; + } +``` + +### Problem we're trying to solve + +??? + +### How to get rid of the patch + +Maybe this would be a reasonable change in PostgreSQL too? -11. WAL proposer -- Why? - WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. - It is currently implemented as patch to standard WAL sender. +## Allow startup without reading checkpoint record -- Alternatives? - Can be moved to extension if some extra callbacks will be added to wal sender code. +In Neon, the compute node is stateless. So when we are launching compute node, we need to provide +some dummy PG_DATADIR. Relation pages can be requested on demand from page server. But Postgres +still need some non-relational data: control and configuration files, SLRUs,... It is currently +implemented using basebackup (do not mix with pg_basebackup) which is created by pageserver. It +includes in this tarball config/control files, SLRUs and required directories. + +As pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL +segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record. There are some +changes in xlog.c, to allow starting the compute node without reading the last checkpoint record +from WAL. + +This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start +at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last +checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo. -12. Secure Computing BPF API wrapper. -- Why? - Pageserver delegates complex WAL decoding duties to Postgres, - which means that the latter might fall victim to carefully designed - malicious WAL records and start doing harmful things to the system. - To prevent this, it has been decided to limit possible interactions - with the outside world using the Secure Computing BPF mode. +### How to get rid of the patch -- Alternatives: - * Rewrite redo handlers in Rust. - * Add more checks to guarantee correctness of WAL records. - * Move seccomp.c to extension - * Many other discussed approaches to neutralize incorrect WAL records vulnerabilities. +??? -13. Callbacks for replica feedbacks -- Why? - Allowing waproposer to interact with walsender code. 
+### Alternatives -- Alternatives - Copy walsender code to walproposer. +Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm +afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL. + +## Disable sequence caching + +``` +diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c +index 0415df9ccb..9f9db3c8bc 100644 +--- a/src/backend/commands/sequence.c ++++ b/src/backend/commands/sequence.c +@@ -53,7 +53,9 @@ + * so we pre-log a few fetches in advance. In the event of + * crash we can lose (skip over) as many values as we pre-logged. + */ +-#define SEQ_LOG_VALS 32 ++/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ ++/* #define SEQ_LOG_VALS 32 */ ++#define SEQ_LOG_VALS 0 +``` + +Due to performance reasons Postgres don't want to log each fetching of a value from a sequence, so +it pre-logs a few fetches in advance. In the event of crash we can lose (skip over) as many values +as we pre-logged. But with Neon, because page with sequence value can be evicted from buffer cache, +we can get a gap in sequence values even without crash. + +### How to get rid of the patch + +Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence +relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It +would be weird if the sequence moved backwards though, think of PITR. + +Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon. -14. Support multiple SMGR implementations. -- Why? - Postgres provides abstract API for storage manager but it has only one implementation - and provides no way to replace it with custom storage manager. +## Walproposer -- Alternatives? - None. +``` + src/Makefile | 1 + + src/backend/replication/libpqwalproposer/Makefile | 37 + + src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++ + src/backend/postmaster/bgworker.c | 4 + + src/backend/postmaster/postmaster.c | 6 + + src/backend/replication/Makefile | 4 +- + src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + src/backend/replication/walproposer_utils.c | 402 +++++++++++ + src/backend/replication/walreceiver.c | 7 + + src/backend/replication/walsender.c | 320 ++++++--- + src/backend/storage/ipc/ipci.c | 6 + + src/include/replication/walproposer.h | 565 ++++++++++++++++ +``` + +WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is +currently implemented as patch to standard WAL sender. + +### How to get rid of the patch + +Refactor into an extension. Submit hooks or APIs into upstream if necessary. + +@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96 + +## Ignore unexpected data beyond EOF in bufmgr.c + +``` +@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + */ + bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); + if (!PageIsNew((Page) bufBlock)) +- ereport(ERROR, ++ { ++ // XXX-ZENITH ++ MemSet((char *) bufBlock, 0, BLCKSZ); ++ ereport(DEBUG1, + (errmsg("unexpected data beyond EOF in block %u of relation %s", + blockNum, relpath(smgr->smgr_rnode, forkNum)), + errhint("This has been seen to occur with buggy kernels; consider updating your system."))); +- ++ } + /* + * We *must* do smgrextend before succeeding, else the page will not + * be reserved by the kernel, and the next P_NEW call will decide to +``` + +PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros +first, then the page is filled, and finally the new page is WAL-logged. But if multiple backends extend +a relation at the same time, the pages can be WAL-logged in a different order. + +I'm not sure exactly what scenario required this change in Neon, though. + +### How to get rid of the patch + +Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit +confusing even in PostgreSQL. Maybe WAL-log the intention to extend first, then extend the relation, +and finally WAL-log that the extension succeeded. + +## Make smgr interface available to extensions + +``` + src/backend/storage/smgr/smgr.c | 203 +++--- + src/include/storage/smgr.h | 72 +- +``` + +### How to get rid of the patch + +Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression. -15. Calculate database size as sum of all database relations. -- Why? - Postgres is calculating database size by traversing data directory - but as far as Zenith compute node is stateless we can not do it. +## Added relpersistence argument to smgropen() -- Alternatives? - Send this request directly to pageserver and calculate real (physical) size - of Zenith representation of database/timeline, rather than sum logical size of all relations. +``` + src/backend/access/heap/heapam_handler.c | 2 +- + src/backend/catalog/storage.c | 10 +- + src/backend/commands/tablecmds.c | 2 +- + src/backend/storage/smgr/md.c | 4 +- + src/include/utils/rel.h | 3 +- +``` + +Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc. +implementations need to know the 'relpersistence' of the relation. To get that information where +it's needed, we added a 'relpersistence' argument to smgropen(). + +### How to get rid of the patch + +Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the +benefit of extensions like Neon. This should be considered as part of the patch to make the smgr API +usable to extensions. + +### Alternatives + +Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on +compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially +ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the +relpersistence argument to handle index builds? See the item on "Mark index builds that use buffer +manager without logging explicitly". + +## Use smgr and dbsize_hook for size calculations + +``` + src/backend/utils/adt/dbsize.c | 61 +- +``` + +In PostgreSQL, the relation- and database-size functions scan the data directory directly. That won't work in Neon. + +### How to get rid of the patch + +Send a patch to PostgreSQL to use smgr API functions for relation size calculation instead. Maybe as +part of the general smgr API patch.
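+As a rough illustration of that direction, a relation's size can be derived through the smgr API instead of
+stat()ing segment files on disk. This is only a sketch, not the actual Neon patch: it uses PostgreSQL's
+existing `RelationOpenSmgr()`, `smgrexists()` and `smgrnblocks()` calls, and the helper name is made up for
+this example.
+
+```
+#include "postgres.h"
+
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+/*
+ * Sketch: compute the size of one relation fork by asking the storage
+ * manager for its length in blocks, instead of stat()ing segment files.
+ * Hypothetical helper, for illustration only.
+ */
+static int64
+relation_fork_size_via_smgr(Relation rel, ForkNumber forknum)
+{
+	RelationOpenSmgr(rel);
+
+	if (!smgrexists(rel->rd_smgr, forknum))
+		return 0;
+
+	/* smgrnblocks() lets a non-file-based smgr (like Neon's) answer this. */
+	return (int64) smgrnblocks(rel->rd_smgr, forknum) * BLCKSZ;
+}
+```
+
+A database-size function would then sum this over all relations and forks, instead of walking the data directory.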
------------------------------------------------ -Not currently committed but proposed: -1. Disable ring buffer buffer manager strategies -- Why? - Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...). - Even if there are free space in buffer cache, pages may be evicted. - Negative effect of it can be somehow compensated by file system cache, but in case of Zenith - cost of requesting page from page server is much higher. +# WAL redo process changes -- Alternatives? - Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy, - for example copy evicted page from ring buffer to some other buffer if there is free space - in buffer cache. +The pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall +victim to carefully designed malicious WAL records and start doing harmful things to the system. To +prevent this, the redo functions are executed in a separate process that is sandboxed with Linux +Secure Computing mode (see the seccomp(2) man page). -2. Disable marking page as dirty when hint bits are set. -- Why? - Postgres has to modify page twice: first time when some tuple is updated and second time when - hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL. +As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust. +This is infeasible, however: it would take a lot of effort to rewrite them and to ensure that the +rewrite is correct, and once that's done, it would be a lot of ongoing maintenance effort to +keep the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to +leverage the PostgreSQL code. -- Alternatives? - Add special WAL record for setting page hints. +Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be +safe to call them directly from Rust code, without needing the security sandbox. That's not feasible +for much the same reasons as rewriting them in Rust. -3. Prefetching -- Why? - As far as pages in Zenith are loaded on demand, to reduce node startup time - and also sppedup some massive queries we need some mechanism for bulk loading to - reduce page request round-trip overhead. - Currently Postgres is supporting prefetching only for bitmap scan. - In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch - some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs. +## Don't replay changes in XLogReadBufferForRedo that are not for the target page we're replaying -4. Prewarming. -- Why? - Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. - But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. - We can capture state of compute node buffer cache and send bulk request for this pages at startup.
+``` + src/backend/access/gin/ginxlog.c | 19 +- + +Also some changes in xlog.c and xlogutils.c + +Example: + +@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) + if (!isLeaf) + ginRedoClearIncompleteSplit(record, 3); + +- if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) ++ action = XLogReadBufferForRedo(record, 0, &lbuffer); ++ if (action != BLK_RESTORED && action != BLK_DONE) + elog(ERROR, "GIN split record did not contain a full-page image of left page"); +``` + +### Problem we're trying to solve + +In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page +image, it always succeeds. However, the Neon WAL redo process is only concerned with replaying changes +to a single page, so replaying changes for any other pages is a waste of cycles. We have modified +XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is +unexpected by code like the above. + +### How to get rid of the patch + +Submit the changes to upstream and hope the community accepts them. There's no harm to PostgreSQL from +these changes, although they don't have any benefit there either. + +To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead in the +WAL, and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes +to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or +BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream. + +### Alternatives + +Maybe we could revert this optimization, and restore pages other than the target page too. + +## Add predefined_sysidentifier flag to initdb + +``` + src/backend/bootstrap/bootstrap.c | 13 +- + src/bin/initdb/initdb.c | 4 + + +And some changes in xlog.c +``` + +This is used to help with restoring a database when you have all the WAL, all the way back to +initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same +sysidentifier. + + +### How to get rid of the patch + +Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres +patches, we can just keep it around as a patch or as a separate branch in a repo. + + +# Not currently committed but proposed + +## Disable ring buffer buffer manager strategies + +### Why? + +Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...). +Even if there is free space in the buffer cache, pages may be evicted. +The negative effect of this can be somewhat compensated by the file system cache, but in Neon +the cost of requesting a page from the page server is much higher. + +### Alternatives? + +Instead of just prohibiting the ring buffer, we may try to implement a more flexible eviction policy, +for example copying an evicted page from the ring buffer to some other buffer if there is free space +in the buffer cache. + +## Disable marking page as dirty when hint bits are set. + +### Why? + +Postgres has to modify the page twice: the first time when some tuple is updated and a second time when +hint bits are set. WAL-logging hint bit updates requires FPIs, which significantly increases the size of the WAL. + +### Alternatives? + +Add a special WAL record for setting page hints. + +## Prefetching + +### Why? + +Because pages in Neon are loaded on demand, we need some mechanism for bulk loading in order to reduce +node startup time, speed up some massive queries, and reduce page-request round-trip overhead. + +Currently Postgres supports prefetching only for bitmap scans.
+In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us. +For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages +of heap relation addressed by TIDs. + +## Prewarming + +### Why? + +Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. +But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. +We can capture state of compute node buffer cache and send bulk request for this pages at startup. diff --git a/docs/docker.md b/docs/docker.md index 14ba2146cb..42f0048e6f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -1,38 +1,84 @@ -# Docker images of Zenith +# Docker images of Neon ## Images Currently we build two main images: -- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). -And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos: +And additional intermediate image: -- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build). -- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. +- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. ## Building pipeline -1. Image `zenithdb/compute-tools` is re-built automatically. +We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub. +1. `neondatabase/compute-tools` and `neondatabase/compute-node` -Build: -```sh -docker build -t zenithdb/build:buster -f Dockerfile.build . +2. `neondatabase/neon` + +## Docker Compose example + +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. + +- etcd x 1 +- pageserver x 1 +- safekeeper x 3 +- compute x 1 +- MinIO x 1 # This is Amazon S3 compatible object storage + +### How to use + +1. create containers + +You can specify version of neon cluster using following environment values. 
+- PG_VERSION: postgres version for compute (default is 14) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +``` +$ cd docker-compose/docker-compose.yml +$ docker-compose down # remove the conainers if exists +$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +Creating network "dockercompose_default" with the default driver +Creating dockercompose_etcd3_1 ... +(...omit...) ``` -Login: -```sh -docker login +2. connect compute node +``` +$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass +$ psql -h localhost -p 55433 -U cloud_admin +postgres=# CREATE TABLE t(key int primary key, value text); +CREATE TABLE +postgres=# insert into t values(1,1); +INSERT 0 1 +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) ``` -Push to Docker Hub: -```sh -docker push zenithdb/build:buster +3. If you want to see the log, you can use `docker-compose logs` command. +``` +# check the container name you want to see +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +(...omit...) + +$ docker logs -f dockercompose_compute_1 +2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql +2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' +(...omit...) ``` -3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. +4. If you want to see durable data in MinIO which is s3 compatible storage -4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +Access http://localhost:9001 and sign in. + +- Username: `minio` +- Password: `password` + +You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file diff --git a/docs/glossary.md b/docs/glossary.md index 159a078e30..25c66828c0 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -2,7 +2,7 @@ ### Authentication -### Backpresssure +### Backpressure Backpressure is used to limit the lag between pageserver and compute node or WAL service. @@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup. ### Branch -We can create branch at certain LSN using `zenith branch` command. +We can create branch at certain LSN using `neon_local timeline branch` command. Each Branch lives in a corresponding timeline[] and has an ancestor[]. @@ -29,24 +29,32 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[]. NOTE: This is an overloaded term. -A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint; +A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint; ### Checkpoint (Layered repository) NOTE: This is an overloaded term. Whenever enough WAL has been accumulated in memory, the page server [] -writes out the changes from in-memory layers into new layer files[]. 
This process -is called "checkpointing". The page server only creates layer files for -relations that have been modified since the last checkpoint. +writes out the changes from the in-memory layer into a new delta layer file. This process +is called "checkpointing". Configuration parameter `checkpoint_distance` defines the distance from current LSN to perform checkpoint of in-memory layers. Default is `DEFAULT_CHECKPOINT_DISTANCE`. -Set this parameter to `0` to force checkpoint of every layer. -Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations. -Default is `DEFAULT_CHECKPOINT_PERIOD`. +### Compaction + +A background operation on layer files. Compaction takes a number of L0 +layer files, each of which covers the whole key space and a range of +LSN, and reshuffles the data in them into L1 files so that each file +covers the whole LSN range, but only part of the key space. + +Compaction should also opportunistically leave obsolete page versions +from the L1 files, and materialize other page versions for faster +access. That hasn't been implemented as of this writing, though. + + ### Compute node Stateless Postgres node that stores data in pageserver. @@ -54,10 +62,10 @@ Stateless Postgres node that stores data in pageserver. ### Garbage collection The process of removing old on-disk layers that are not needed by any timeline anymore. + ### Fork Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map. -Each PostgreSQL fork is considered a separate relish. ### Layer @@ -67,23 +75,24 @@ layer's Segment and range of LSNs. There are two kinds of layers, in-memory and on-disk layers. In-memory layers are used to ingest incoming WAL, and provide fast access to the recent page versions. On-disk layers are stored as files on disk, and -are immutable. See pageserver/src/layered_repository/README.md for more. +are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more. ### Layer file (on-disk layer) Layered repository on-disk format is based on immutable files. The -files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE -segment of a PostgreSQL relation fork. There are two kinds of layer -files: image files and delta files. An image file contains a -"snapshot" of the segment at a particular LSN, and a delta file -contains WAL records applicable to the segment, in a range of LSNs. +files are called "layer files". There are two kinds of layer files: +image files and delta files. An image file contains a "snapshot" of a +range of keys at a particular LSN, and a delta file contains WAL +records applicable to a range of keys, in a range of LSNs. ### Layer map -The layer map tracks what layers exist for all the relishes in a timeline. +The layer map tracks what layers exist in a timeline. + ### Layered repository -Zenith repository implementation that keeps data in layers. +Neon repository implementation that keeps data in layers. + ### LSN The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log. 
@@ -93,23 +102,23 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html) Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery. -In postgres and Zenith lsns are used to describe certain points in WAL handling. +In Postgres and Neon LSNs are used to describe certain points in WAL handling. PostgreSQL LSNs and functions to monitor them: * `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location. * `pg_current_wal_lsn()` - Returns the current write-ahead log write location. * `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location. * `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically. -* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. +* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically. [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html): -Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md) +Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information. * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. -Zenith pageserver LSNs: +Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. * `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN. * `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash. @@ -117,6 +126,26 @@ TODO: use this name consistently in remote storage code. Now `disk_consistent_ls * `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created) TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs. + +### Logical size + +The pageserver tracks the "logical size" of a timeline. It is the +total size of all relations in all Postgres databases on the +timeline. It includes all user and system tables, including their FSM +and VM forks. But it does not include SLRUs, twophase files or any +other such data or metadata that lives outside relations. + +The logical size is calculated by the pageserver, and is sent to +PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses +the logical size to enforce the size limit in the free tier. The +logical size is also shown to users in the web console. + +The logical size is not affected by branches or the physical layout of +layer files in the pageserver. 
If you have a database with 1 GB +logical size and you create a branch of it, both branches will have 1 +GB logical size, even though the branch is copy-on-write and won't +consume any extra physical disk space until you make changes to it. + ### Page (block) The basic structure used to store relation data. All pages are of the same size. @@ -124,7 +153,7 @@ This is the unit of data exchange between compute node and pageserver. ### Pageserver -Zenith storage engine: repositories + wal receiver + page service + wal redo. +Neon storage engine: repositories + wal receiver + page service + wal redo. ### Page service @@ -149,14 +178,6 @@ and create new databases and accounts (control plane API in our case). The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order. -### Relish - -We call each relation and other file that is stored in the -repository a "relish". It comes from "rel"-ish, as in "kind of a -rel", because it covers relations as well as other things that are -not relations, but are treated similarly for the purposes of the -storage layer. - ### Replication slot @@ -173,33 +194,24 @@ One repository corresponds to one Tenant. How much history do we need to keep around for PITR and read-only nodes? -### Segment (PostgreSQL) - -NOTE: This is an overloaded term. +### Segment A physical file that stores data for a given relation. File segments are limited in size by a compile-time setting (1 gigabyte by default), so if a relation exceeds that size, it is split into multiple segments. -### Segment (Layered Repository) - -NOTE: This is an overloaded term. - -Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag). - ### SLRU SLRUs include pg_clog, pg_multixact/members, and pg_multixact/offsets. There are other SLRUs in PostgreSQL, but they don't need to be stored permanently (e.g. pg_subtrans), -or we do not support them in zenith yet (pg_commit_ts). -Each SLRU segment is considered a separate relish[]. +or we do not support them in neon yet (pg_commit_ts). ### Tenant (Multitenancy) -Tenant represents a single customer, interacting with Zenith. +Tenant represents a single customer, interacting with Neon. Wal redo[] activity, timelines[], layers[] are managed for each tenant independently. One pageserver[] can serve multiple tenants at once. -One safekeeper +One safekeeper See `docs/multitenancy.md` for more. diff --git a/docs/multitenancy.md b/docs/multitenancy.md index 4f1d45e970..35c69e69a1 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -2,26 +2,26 @@ ### Overview -Zenith supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via zenith CLI. During page server setup tenant can be created using ```zenith init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. This can be done using the following cli command: ```zenith tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So zenith tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. +Neon supports multitenancy. One pageserver can serve multiple tenants at once. Tenants can be managed via neon_local CLI. During page server setup tenant can be created using ```neon_local init --create-tenant``` Also tenants can be added into the system on the fly without pageserver restart. 
This can be done using the following cli command: ```neon_local tenant create``` Tenants use random identifiers which can be represented as a 32 symbols hexadecimal string. So neon_local tenant create accepts desired tenant id as an optional argument. The concept of timelines/branches is working independently per tenant. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `neon_local init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenant_id=` is provided. So generally tenant_id more frequently appears in internal pageserver interface. Its commands take tenant_id argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. Examples for cli: ```sh -zenith tenant list +neon_local tenant list -zenith tenant create // generates new id +neon_local tenant create // generates new id -zenith tenant create ee6016ec31116c1b7c33dfdfca38892f +neon_local tenant create ee6016ec31116c1b7c33dfdfca38892f -zenith pg create main // default tenant from zenith init +neon_local pg create main // default tenant from neon init -zenith pg create main --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local pg create main --tenant_id=ee6016ec31116c1b7c33dfdfca38892f -zenith branch --tenantid=ee6016ec31116c1b7c33dfdfca38892f +neon_local branch --tenant_id=ee6016ec31116c1b7c33dfdfca38892f ``` ### Data layout @@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id ### Safety -For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline). +For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenant_id, timeline_id) pair so there can only be one writer for particular (tenant_id, timeline_id). diff --git a/docs/pageserver-page-service.md b/docs/pageserver-page-service.md new file mode 100644 index 0000000000..cea9e5a637 --- /dev/null +++ b/docs/pageserver-page-service.md @@ -0,0 +1,9 @@ +# Page Service + +The Page Service listens for GetPage@LSN requests from the Compute Nodes, +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository function + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance. 
diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md new file mode 100644 index 0000000000..d9b120bbb9 --- /dev/null +++ b/docs/pageserver-pagecache.md @@ -0,0 +1,8 @@ +# Page cache + +TODO: + +- shared across tenants +- store pages from layer files +- store pages from "in-memory layer" +- store materialized pages diff --git a/docs/pageserver-processing-getpage.md b/docs/pageserver-processing-getpage.md new file mode 100644 index 0000000000..be99ab82d4 --- /dev/null +++ b/docs/pageserver-processing-getpage.md @@ -0,0 +1,4 @@ +# Processing a GetPage request + +TODO: +- sequence diagram that shows how a GetPage@LSN request is processed diff --git a/docs/pageserver-processing-wal.md b/docs/pageserver-processing-wal.md new file mode 100644 index 0000000000..f8c43b6085 --- /dev/null +++ b/docs/pageserver-processing-wal.md @@ -0,0 +1,5 @@ +# Processing WAL + +TODO: +- diagram that shows how incoming WAL is processed +- explain durability, what is fsync'd when, disk_consistent_lsn diff --git a/pageserver/README.md b/docs/pageserver-services.md similarity index 66% rename from pageserver/README.md rename to docs/pageserver-services.md index 69080a16cc..fc259c8a5f 100644 --- a/pageserver/README.md +++ b/docs/pageserver-services.md @@ -1,19 +1,8 @@ -## Page server architecture - -The Page Server has a few different duties: - -- Respond to GetPage@LSN requests from the Compute Nodes -- Receive WAL from WAL safekeeper -- Replay WAL that's applicable to the chunks that the Page Server maintains -- Backup to S3 - -S3 is the main fault-tolerant storage of all data, as there are no Page Server -replicas. We use a separate fault-tolerant WAL service to reduce latency. It -keeps track of WAL records which are not synced to S3 yet. +# Services The Page Server consists of multiple threads that operate on a shared repository of page versions: - +``` | WAL V +--------------+ @@ -21,18 +10,22 @@ repository of page versions: | WAL receiver | | | +--------------+ - +----+ - +---------+ .......... | | - | | . . | | - GetPage@LSN | | . backup . -------> | S3 | --------------> | Page | repository . . | | - | Service | .......... | | - page | | +----+ + ...... + +---------+ +--------+ . . + | | | | . . + GetPage@LSN | | | backup | -------> . S3 . +-------------> | Page | repository | | . . + | Service | +--------+ . . + page | | ...... <------------- | | - +---------+ +--------------------+ - | Checkpointing / | - | Garbage collection | - +--------------------+ + +---------+ +-----------+ +--------------------+ + | WAL redo | | Checkpointing, | + +----------+ | processes | | Garbage collection | + | | +-----------+ +--------------------+ + | HTTP | + | mgmt API | + | | + +----------+ Legend: @@ -40,28 +33,75 @@ Legend: | | A thread or multi-threaded service +--+ -.... -. . Component at its early development phase. -.... - ---> Data flow <--- +``` - -Page Service ------------- +## Page Service The Page Service listens for GetPage@LSN requests from the Compute Nodes, -and responds with pages from the repository. +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository function + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance. + +## WAL Receiver + +The WAL receiver connects to the external WAL safekeeping service +using PostgreSQL physical streaming replication, and continuously +receives WAL. 
It decodes the WAL records, and stores them to the +repository. -WAL Receiver ------------- +## Backup service -The WAL receiver connects to the external WAL safekeeping service (or -directly to the primary) using PostgreSQL physical streaming -replication, and continuously receives WAL. It decodes the WAL records, -and stores them to the repository. +The backup service, responsible for storing pageserver recovery data externally. + +Currently, pageserver stores its files in a filesystem directory it's pointed to. +That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached". +Therefore, the server interacts with external, more reliable storage to back up and restore its state. + +The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait. +There are the following implementations present: +* local filesystem — to use in tests mainly +* AWS S3 - to use in production + +The backup service is disabled by default and can be enabled to interact with a single remote storage. + +CLI examples: +* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` +* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"` + +For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. +For local S3 installations, refer to the their documentation for name format and credentials. + +Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. +Required sections are: + +```toml +[remote_storage] +local_path = '/Users/someonetoignore/Downloads/tmp_dir/' +``` + +or + +```toml +[remote_storage] +bucket_name = 'some-sample-bucket' +bucket_region = 'eu-north-1' +prefix_in_bucket = '/test_prefix/' +``` + +`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. + + +## Repository background tasks + +The Repository also has a few different background threads and tokio tasks that perform +background duties like dumping accumulated WAL data from memory to disk, reorganizing +files for performance (compaction), and garbage collecting old files. Repository @@ -69,14 +109,14 @@ Repository The repository stores all the page versions, or WAL records needed to reconstruct them. Each tenant has a separate Repository, which is -stored in the .zenith/tenants/ directory. +stored in the .neon/tenants/ directory. Repository is an abstract trait, defined in `repository.rs`. It is implemented by the LayeredRepository object in `layered_repository.rs`. There is only that one implementation of the Repository trait, but it's still a useful abstraction that keeps the interface for the low-level storage functionality clean. The layered -storage format is described in layered_repository/README.md. +storage format is described in [pageserver-storage.md](./pageserver-storage.md). Each repository consists of multiple Timelines. 
Timeline is a workhorse that accepts page changes from the WAL, and serves @@ -92,7 +132,7 @@ Each repository also has a WAL redo manager associated with it, see records, whenever we need to reconstruct a page version from WAL to satisfy a GetPage@LSN request, or to avoid accumulating too much WAL for a page. The WAL redo manager uses a Postgres process running in -special zenith wal-redo mode to do the actual WAL redo, and +special Neon wal-redo mode to do the actual WAL redo, and communicates with the process using a pipe. @@ -116,50 +156,6 @@ Remove old on-disk layer files that are no longer needed according to the PITR retention policy -### Backup service - -The backup service, responsible for storing pageserver recovery data externally. - -Currently, pageserver stores its files in a filesystem directory it's pointed to. -That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached". -Therefore, the server interacts with external, more reliable storage to back up and restore its state. - -The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait. -There are the following implementations present: -* local filesystem — to use in tests mainly -* AWS S3 - to use in production - -Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md). - -The backup service is disabled by default and can be enabled to interact with a single remote storage. - -CLI examples: -* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"` -* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"` - -For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS. -For local S3 installations, refer to the their documentation for name format and credentials. - -Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets. -Required sections are: - -```toml -[remote_storage] -local_path = '/Users/someonetoignore/Downloads/tmp_dir/' -``` - -or - -```toml -[remote_storage] -bucket_name = 'some-sample-bucket' -bucket_region = 'eu-north-1' -prefix_in_bucket = '/test_prefix/' -access_key_id = 'SOMEKEYAAAAASADSAH*#' -secret_access_key = 'SOMEsEcReTsd292v' -``` - -Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above. TODO: Sharding -------------------- diff --git a/pageserver/src/layered_repository/README.md b/docs/pageserver-storage.md similarity index 67% rename from pageserver/src/layered_repository/README.md rename to docs/pageserver-storage.md index 20f89ddc70..77e7ff35bc 100644 --- a/pageserver/src/layered_repository/README.md +++ b/docs/pageserver-storage.md @@ -1,40 +1,44 @@ -# Overview - -The on-disk format is based on immutable files. The page server receives a -stream of incoming WAL, parses the WAL records to determine which pages they -apply to, and accumulates the incoming changes in memory. 
Every now and then, -the accumulated changes are written out to new immutable files. This process is -called checkpointing. Old versions of on-disk files that are not needed by any -timeline are removed by GC process. +# Pageserver storage The main responsibility of the Page Server is to process the incoming WAL, and reprocess it into a format that allows reasonably quick access to any page -version. +version. The page server slices the incoming WAL per relation and page, and +packages the sliced WAL into suitably-sized "layer files". The layer files +contain all the history of the database, back to some reasonable retention +period. This system replaces the base backups and the WAL archive used in a +traditional PostgreSQL installation. The layer files are immutable, they are not +modified in-place after creation. New layer files are created for new incoming +WAL, and old layer files are removed when they are no longer needed. + +The on-disk format is based on immutable files. The page server receives a +stream of incoming WAL, parses the WAL records to determine which pages they +apply to, and accumulates the incoming changes in memory. Whenever enough WAL +has been accumulated in memory, it is written out to a new immutable file. That +process accumulates "L0 delta files" on disk. When enough L0 files have been +accumulated, they are merged and re-partitioned into L1 files, and old files +that are no longer needed are removed by Garbage Collection (GC). The incoming WAL contains updates to arbitrary pages in the system. The distribution depends on the workload: the updates could be totally random, or there could be a long stream of updates to a single relation when data is bulk -loaded, for example, or something in between. The page server slices the -incoming WAL per relation and page, and packages the sliced WAL into -suitably-sized "layer files". The layer files contain all the history of the -database, back to some reasonable retention period. This system replaces the -base backups and the WAL archive used in a traditional PostgreSQL -installation. The layer files are immutable, they are not modified in-place -after creation. New layer files are created for new incoming WAL, and old layer -files are removed when they are no longer needed. We could also replace layer -files with new files that contain the same information, merging small files for -example, but that hasn't been implemented yet. +loaded, for example, or something in between. +``` +Cloud Storage Page Server Safekeeper + L1 L0 Memory WAL -Cloud Storage Page Server Safekeeper - Local disk Memory WAL - -|AAAA| |AAAA|AAAA| |AA -|BBBB| |BBBB|BBBB| | -|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED -|DDDD|DDDD| |DDDD|DDDD| |DDD -|EEEE| |EEEE|EEEE|EEEE| |E - ++----+ +----+----+ +|AAAA| |AAAA|AAAA| +---+-----+ | ++----+ +----+----+ | | | |AA +|BBBB| |BBBB|BBBB| |BB | AA | |BB ++----+----+ +----+----+ |C | BB | |CC +|CCCC|CCCC| <---- |CCCC|CCCC| <--- |D | CC | <--- |DDD <---- ADEBAABED ++----+----+ +----+----+ | | DDD | |E +|DDDD|DDDD| |DDDD|DDDD| |E | | | ++----+----+ +----+----+ | | | +|EEEE| |EEEE|EEEE| +---+-----+ ++----+ +----+----+ +``` In this illustration, WAL is received as a stream from the Safekeeper, from the right. It is immediately captured by the page server and stored quickly in @@ -42,39 +46,29 @@ memory. The page server memory can be thought of as a quick "reorder buffer", used to hold the incoming WAL and reorder it so that we keep the WAL records for the same page and relation close to each other. 
-From the page server memory, whenever enough WAL has been accumulated for one -relation segment, it is moved to local disk, as a new layer file, and the memory -is released. +From the page server memory, whenever enough WAL has been accumulated, it is flushed +to disk into a new L0 layer file, and the memory is released. + +When enough L0 files have been accumulated, they are merged together and sliced +per key-space, producing a new set of files where each file contains a more +narrow key range, but larger LSN range. From the local disk, the layers are further copied to Cloud Storage, for long-term archival. After a layer has been copied to Cloud Storage, it can be removed from local disk, although we currently keep everything locally for fast access. If a layer is needed that isn't found locally, it is fetched from Cloud -Storage and stored in local disk. - -# Terms used in layered repository - -- Relish - one PostgreSQL relation or similarly treated file. -- Segment - one slice of a Relish that is stored in a LayeredTimeline. -- Layer - specific version of a relish Segment in a range of LSNs. +Storage and stored in local disk. L0 and L1 files are both uploaded to Cloud +Storage. # Layer map -The LayerMap tracks what layers exist for all the relishes in a timeline. - -LayerMap consists of two data structures: -- segs - All the layers keyed by segment tag -- open_layers - data structure that hold all open layers ordered by oldest_pending_lsn for quick access during checkpointing. oldest_pending_lsn is the LSN of the oldest page version stored in this layer. - -All operations that update InMemory Layers should update both structures to keep them up-to-date. - -- LayeredTimeline - implements Timeline interface. - -All methods of LayeredTimeline are aware of its ancestors and return data taking them into account. -TODO: Are there any exceptions to this? -For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN, -including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap. +The LayerMap tracks what layers exist in a timeline. +Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +other read request, the layer map scans through the array to find the right layer +that contains the data for the requested page. The read-code in LayeredTimeline +is aware of the ancestor, and returns data from the ancestor timeline if it's +not found on the current timeline. # Different kinds of layers @@ -92,11 +86,11 @@ To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file. TODO: Clarify the difference between Closed, Historic and Frozen. There are two kinds of OnDisk layers: -- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN. -- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one - relish segment. - -Dropped segments are always represented on disk by DeltaLayer. +- ImageLayer represents a snapshot of all the keys in a particular range, at one + particular LSN. Any keys that are not present in the ImageLayer are known not + to exist at that LSN. +- DeltaLayer represents a collection of WAL records or page images in a range of + LSNs, for a range of keys. # Layer life cycle @@ -109,74 +103,83 @@ layer or a delta layer, it is a valid end bound. 
An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 Every layer starts its life as an Open In-Memory layer. When the page server -receives the first WAL record for a segment, it creates a new In-Memory layer -for it, and puts it to the layer map. Later, the layer is old enough, its -contents are written to disk, as On-Disk layers. This process is called -"evicting" a layer. +receives the first WAL record for a timeline, it creates a new In-Memory layer +for it, and puts it to the layer map. Later, when the layer becomes full, its +contents are written to disk, as an on-disk layer. -Layer eviction is a two-step process: First, the layer is marked as closed, so -that it no longer accepts new WAL records, and the layer map is updated -accordingly. If a new WAL record for that segment arrives after this step, a new -Open layer is created to hold it. After this first step, the layer is a Closed +Flushing a layer is a two-step process: First, the layer is marked as closed, so +that it no longer accepts new WAL records, and a new in-memory layer is created +to hold any WAL after that point. After this first step, the layer is a Closed InMemory state. This first step is called "freezing" the layer. -In the second step, new Delta and Image layers are created, containing all the -data in the Frozen InMemory layer. When the new layers are ready, the original -frozen layer is replaced with the new layers in the layer map, and the original -frozen layer is dropped, releasing the memory. +In the second step, a new Delta layer is created, containing all the data from +the Frozen InMemory layer. When it has been created and flushed to disk, the +original frozen layer is replaced with the new layer in the layer map, and the +original frozen layer is dropped, releasing the memory. # Layer files (On-disk layers) -The files are called "layer files". Each layer file corresponds -to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or -non-rel file in a range of LSNs. The layer files -for each timeline are stored in the timeline's subdirectory under -.zenith/tenants//timelines. +The files are called "layer files". Each layer file covers a range of keys, and +a range of LSNs (or a single LSN, in case of image layers). You can think of it +as a rectangle in the two-dimensional key-LSN space. The layer files for each +timeline are stored in the timeline's subdirectory under +`.neon/tenants//timelines`. -There are two kind of layer file: base images, and deltas. A base -image file contains a layer of a segment as it was at one LSN, -whereas a delta file contains modifications to a segment - mostly in -the form of WAL records - in a range of LSN +There are two kinds of layer files: images, and delta layers. An image file +contains a snapshot of all keys at a particular LSN, whereas a delta file +contains modifications to a segment - mostly in the form of WAL records - in a +range of LSNs. -base image file: - rel______ +image file: + +``` + 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 + start key end key LSN +``` + + +The first parts define the key range that the layer covers. See +pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
delta file: - rel_______ +Delta files are named similarly, but they cover a range of LSNs: -For example: +``` + 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 + start key end key start LSN end LSN +``` - rel_1663_13990_2609_0_10_000000000169C348 - rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +A delta file contains all the key-values in the key-range that were updated in +the LSN range. If a key has not been modified, there is no trace of it in the +delta layer. -In addition to the relations, with "rel_*" prefix, we use the same -format for storing various smaller files from the PostgreSQL data -directory. They will use different suffixes and the naming scheme up -to the LSNs vary. The Zenith source code uses the term "relish" to -mean "a relation, or other file that's treated like a relation in the -storage" For example, a base image of a CLOG segment would be named -like this: - pg_xact_0000_0_00000000198B06B0 +A delta layer file can cover a part of the overall key space, as in the previous +example, or the whole key range like this: -There is no difference in how the relation and non-relation files are -managed, except that the first part of file names is different. -Internally, the relations and non-relation files that are managed in -the versioned store are together called "relishes". +``` + 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051 +``` -If a file has been dropped, the last layer file for it is created -with the _DROPPED suffix, e.g. - - rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED +A file that covers the whole key range is called a L0 file (Level 0), while a +file that covers only part of the key range is called a L1 file. The "level" of +a file is not explicitly stored anywhere, you can only distinguish them by +looking at the key range that a file covers. The read-path doesn't need to +treat L0 and L1 files any differently. ## Notation used in this document +FIXME: This is somewhat obsolete, the layer files cover a key-range rather than +a particular relation nowadays. However, the description on how you find a page +version, and how branching and GC works is still valid. + The full path of a delta file looks like this: - .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +``` + .neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 +``` For simplicity, the examples below use a simplified notation for the paths. The tenant ID is left out, the timeline ID is replaced with @@ -185,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For example, a base image file at LSN 100 and a delta file between 100-200 for 'orders' table on 'main' branch is represented like this: +``` main/orders_100 main/orders_100_200 +``` # Creating layer files @@ -196,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end of WAL is currently at LSN 250. In this starting situation, you would have these files on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 main/customers_100 main/customers_100_200 main/customers_200 +``` In addition to those files, the recent changes between LSN 200 and the end of WAL at 250 are kept in memory. 
If the page server crashes, the @@ -232,6 +239,7 @@ If the customers table is modified later, a new file is created for it at the next checkpoint. The new file will cover the "gap" from the last layer file, so the LSN ranges are always contiguous: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -244,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous: main/customers_200 main/customers_200_500 main/customers_500 +``` ## Reading page versions @@ -251,7 +260,7 @@ Whenever a GetPage@LSN request comes in from the compute node, the page server needs to reconstruct the requested page, as it was at the requested LSN. To do that, the page server first checks the recent in-memory layer; if the requested page version is found there, it can -be returned immediatedly without looking at the files on +be returned immediately without looking at the files on disk. Otherwise the page server needs to locate the layer file that contains the requested page version. @@ -267,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs Imagine that a child branch is created at LSN 250: +``` @250 ----main--+--------------------------> \ +---child--------------> +``` Then, the 'orders' table is updated differently on the 'main' and 'child' branches. You now have this situation on disk: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -290,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and child/orders_300 child/orders_300_400 child/orders_400 +``` Because the 'customers' table hasn't been modified on the child branch, there is no file for it there. If you request a page for it on @@ -302,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file you need to look at. For example, the history for the 'orders' table on the 'main' branch consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -309,10 +323,12 @@ on the 'main' branch consists of these files: main/orders_300 main/orders_300_400 main/orders_400 +``` And from the 'child' branch's point of view, it consists of these files: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -321,6 +337,7 @@ files: child/orders_300 child/orders_300_400 child/orders_400 +``` The branch metadata includes the point where the child branch was created, LSN 250. If a page request comes with LSN 275, we read the @@ -334,7 +351,7 @@ branch. Note: It doesn't make any difference if the child branch is created when the end of the main branch was at LSN 250, or later when the tip of the main branch had already moved on. The latter case, creating a -branch at a historic LSN, is how we support PITR in Zenith. +branch at a historic LSN, is how we support PITR in Neon. # Garbage collection @@ -353,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end of the branch is LSN 525, so that the GC horizon is currently at 525-150 = 375 +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -365,33 +383,37 @@ of the branch is LSN 525, so that the GC horizon is currently at main/customers_100 main/customers_100_200 main/customers_200 +``` We can remove the following files because the end LSNs of those files are older than GC horizon 375, and there are more recent layer files for the table: +``` main/orders_100 DELETE main/orders_100_200 DELETE main/orders_200 DELETE main/orders_200_300 DELETE main/orders_300 STILL NEEDED BY orders_300_400 main/orders_300_400 KEEP, NEWER THAN GC HORIZON - main/orders_400 .. 
- main/orders_400_500 .. - main/orders_500 .. + main/orders_400 .. + main/orders_400_500 .. + main/orders_500 .. main/customers_100 DELETE main/customers_100_200 DELETE main/customers_200 KEEP, NO NEWER VERSION +``` -'main/customers_100_200' is old enough, but it cannot be +'main/customers_200' is old enough, but it cannot be removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of the above still holds, but in addition to recent files we must also -retain older shapshot files that are still needed by child branches. +retain older snapshot files that are still needed by child branches. For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: +``` main/orders_100 KEEP, NEEDED BY child BRANCH main/orders_100_200 KEEP, NEEDED BY child BRANCH main/orders_200 DELETE @@ -406,6 +428,7 @@ table is updated on the branch, you would have these files: main/customers_200 KEEP, NO NEWER VERSION child/customers_150_300 DELETE child/customers_300 KEEP, NO NEWER VERSION +``` In this situation, 'main/orders_100' and 'main/orders_100_200' cannot be removed, even though they are older than the GC horizon, because @@ -415,6 +438,7 @@ and 'main/orders_200_300' can still be removed. If 'orders' is modified later on the 'child' branch, we will create a new base image and delta file for it on the child: +``` main/orders_100 main/orders_100_200 @@ -427,6 +451,7 @@ new base image and delta file for it on the child: child/customers_300 child/orders_150_400 child/orders_400 +``` After this, the 'main/orders_100' and 'main/orders_100_200' file could be removed. It is no longer needed by the child branch, because there @@ -442,6 +467,7 @@ Describe GC and checkpoint interval settings. In principle, each relation can be checkpointed separately, i.e. the LSN ranges of the files don't need to line up. So this would be legal: +``` main/orders_100 main/orders_100_200 main/orders_200 @@ -454,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal: main/customers_250 main/customers_250_500 main/customers_500 +``` However, the code currently always checkpoints all relations together. So that situation doesn't arise in practice. @@ -476,11 +503,13 @@ does that. It could be useful, however, as a transient state when garbage collecting around branch points, or explicit recovery points. For example, if we start with this: +``` main/orders_100 main/orders_100_200 main/orders_200 main/orders_200_300 main/orders_300 +``` And there is a branch or explicit recovery point at LSN 150, we could replace 'main/orders_100_200' with 'main/orders_150' to keep a diff --git a/docs/pageserver-tenant-migration.md b/docs/pageserver-tenant-migration.md index a846213ab2..5fb2097030 100644 --- a/docs/pageserver-tenant-migration.md +++ b/docs/pageserver-tenant-migration.md @@ -9,7 +9,7 @@ This feature allows to migrate a timeline from one pageserver to another by util Pageserver implements two new http handlers: timeline attach and timeline detach. Timeline migration is performed in a following way: 1. Timeline attach is called on a target pageserver. This asks pageserver to download latest checkpoint uploaded to s3. -2. 
For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/zenithdb/zenith/issues/997)/[#1049](https://github.com/zenithdb/zenith/issues/1049)) +2. For now it is necessary to manually initialize replication stream via callmemaybe call so target pageserver initializes replication from safekeeper (it is desired to avoid this and initialize replication directly in attach handler, but this requires some refactoring (probably [#997](https://github.com/neondatabase/neon/issues/997)/[#1049](https://github.com/neondatabase/neon/issues/1049)) 3. Replication state can be tracked via timeline detail pageserver call. 4. Compute node should be restarted with new pageserver connection string. Issue with multiple compute nodes for one timeline is handled on the safekeeper consensus level. So this is not a problem here.Currently responsibility for rescheduling the compute with updated config lies on external coordinator (console). 5. Timeline is detached from old pageserver. On disk data is removed. @@ -18,5 +18,5 @@ Timeline migration is performed in a following way: ### Implementation details Now safekeeper needs to track which pageserver it is replicating to. This introduces complications into replication code: -* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/zenithdb/zenith/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). +* We need to distinguish different pageservers (now this is done by connection string which is imperfect and is covered here: https://github.com/neondatabase/neon/issues/1105). Callmemaybe subscription management also needs to track that (this is already implemented). * We need to track which pageserver is the primary. This is needed to avoid reconnections to non primary pageservers. Because we shouldn't reconnect to them when they decide to stop their walreceiver. I e this can appear when there is a load on the compute and we are trying to detach timeline from old pageserver. In this case callmemaybe will try to reconnect to it because replication termination condition is not met (page server with active compute could never catch up to the latest lsn, so there is always some wal tail) diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md new file mode 100644 index 0000000000..e351c972cb --- /dev/null +++ b/docs/pageserver-thread-mgmt.md @@ -0,0 +1,39 @@ +## Thread management + +The pageserver uses Tokio for handling concurrency. Everything runs in +Tokio tasks, although some parts are written in blocking style and use +spawn_blocking(). + +Each Tokio task is tracked by the `task_mgr` module. It maintains a +registry of tasks, and which tenant or timeline they are operating +on. + +### Handling shutdown + +When a tenant or timeline is deleted, we need to shut down all tasks +operating on it, before deleting the data on disk. There's a function, +`shutdown_tasks`, to request all tasks of a particular tenant or +timeline to shutdown. It will also wait for them to finish. + +A task registered in the task registry can check if it has been +requested to shut down, by calling `is_shutdown_requested()`. 
There's
+also a `shutdown_watcher()` Future that can be used with `tokio::select!`
+or similar, to wake up on shutdown.
+
+
+### Sync vs async
+
+We use async to wait for incoming data on network connections, and to
+perform other long-running operations. For example, each WAL receiver
+connection is handled by a tokio Task. Once a piece of WAL has been
+received from the network, the task calls the blocking functions in
+the Repository to process the WAL.
+
+The core storage code in `layered_repository/` is synchronous, with
+blocking locks and I/O calls. The current model is that we consider
+disk I/Os to be short enough that we perform them while running in a
+Tokio task. If that becomes a problem, we should use `spawn_blocking`
+before entering the synchronous parts of the code, or switch to using
+tokio I/O functions.
+
+Be very careful when mixing sync and async code!
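As a rough illustration of the shutdown pattern described above (a sketch only: `WalConnection`, `next_wal_chunk()` and `process_wal()` are hypothetical helpers, and the exact `task_mgr` signatures may differ):

```rust
// Sketch of a task loop that cooperates with `shutdown_tasks`.
// Purely blocking code can instead poll `task_mgr::is_shutdown_requested()`
// at convenient points.
async fn wal_receiver_loop(mut conn: WalConnection) -> anyhow::Result<()> {
    loop {
        tokio::select! {
            // Wake up when shutdown of this tenant/timeline is requested,
            // and return so the task registry sees this task finish.
            _ = task_mgr::shutdown_watcher() => return Ok(()),

            // Otherwise keep consuming WAL from the network; the blocking
            // Repository calls happen inside process_wal (see "Sync vs async").
            chunk = conn.next_wal_chunk() => process_wal(chunk?)?,
        }
    }
}
```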
diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md
new file mode 100644
index 0000000000..1de9c177cc
--- /dev/null
+++ b/docs/pageserver-walredo.md
@@ -0,0 +1,77 @@
+# WAL Redo
+
+To reconstruct a particular page version from an image of the page and
+some WAL records, the pageserver needs to replay the WAL records. This
+happens on-demand, when a GetPage@LSN request comes in, or as part of
+background jobs that reorganize data for faster access.
+
+It's important that data cannot leak from one tenant to another, and
+that a corrupt WAL record on one timeline doesn't affect other tenants
+or timelines.
+
+## Multi-tenant security
+
+If you have direct access to the WAL directory, or if you have
+superuser access to a running PostgreSQL server, it's easy to
+construct a malicious or corrupt WAL record that causes the WAL redo
+functions to crash, or to execute arbitrary code. That is not a
+security problem for PostgreSQL; if you have superuser access, you
+have full access to the system anyway.
+
+The Neon pageserver, however, is multi-tenant. It needs to execute WAL
+belonging to different tenants in the same system, and malicious WAL
+in one tenant must not affect other tenants.
+
+A separate WAL redo process is launched for each tenant, and the
+process uses the seccomp(2) system call to restrict its access to the
+bare minimum needed to replay WAL records. The process does not have
+access to the filesystem or network. It can only communicate with the
+parent pageserver process through a pipe.
+
+If an attacker creates a malicious WAL record and injects it into the
+WAL stream of a timeline, he can take control of the WAL redo process
+in the pageserver. However, the WAL redo process cannot access the
+rest of the system. And because there is a separate WAL redo process
+for each tenant, the hijacked WAL redo process can only see WAL and
+data belonging to the same tenant, which the attacker would have
+access to anyway.
+
+## WAL-redo process communication
+
+The WAL redo process runs the 'postgres' executable, launched with a
+Neon-specific command-line option to put it into WAL-redo process
+mode. The pageserver controls the lifetime of the WAL redo processes,
+launching them as needed. If a tenant is detached from the pageserver,
+any WAL redo processes for that tenant are killed.
+
+The pageserver communicates with each WAL redo process over its
+stdin/stdout/stderr. It works in a request-response model with a simple
+custom protocol, described in walredo.rs. To replay a set of WAL
+records for a page, the pageserver sends the "before" image of the
+page and the WAL records over 'stdin', followed by a command to
+perform the replay. The WAL redo process responds with an "after"
+image of the page.
+
+## Special handling of some records
+
+Some WAL record types are handled directly in the pageserver, by
+bespoke Rust code, and are not sent over to the WAL redo process.
+This includes SLRU-related WAL records, like commit records. SLRUs
+don't use the standard Postgres buffer manager, so dealing with them
+in the Neon WAL redo mode would require quite a few changes to
+Postgres code and special handling in the protocol anyway.
+
+Some record types that include a full-page image (e.g. XLOG_FPI) are
+also handled specially when the incoming WAL is first processed, and are
+stored as page images rather than WAL records.
+
+
+## Records that modify multiple pages
+
+Some Postgres WAL records modify multiple pages. Such WAL records are
+duplicated, so that a copy is stored for each affected page. This is
+somewhat wasteful, but because most WAL records only affect one page,
+the overhead is acceptable.
+
+The WAL redo always happens for one particular page. If the WAL record
+contains changes to other pages, they are ignored.
diff --git a/docs/pageserver.md b/docs/pageserver.md
new file mode 100644
index 0000000000..ee70032396
--- /dev/null
+++ b/docs/pageserver.md
@@ -0,0 +1,11 @@
+# Page server architecture
+
+The Page Server has a few different duties:
+
+- Respond to GetPage@LSN requests from the Compute Nodes
+- Receive WAL from WAL safekeeper, and store it
+- Upload data to S3 to make it durable, download files from S3 as needed
+
+S3 is the main fault-tolerant storage of all data, as there are no Page Server
+replicas. We use a separate fault-tolerant WAL service to reduce latency. It
+keeps track of WAL records which are not synced to S3 yet.
diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md
new file mode 100644
index 0000000000..f99683cf09
--- /dev/null
+++ b/docs/rfcs/002-storage.md
@@ -0,0 +1,186 @@
+# Zenith storage node — alternative
+
+## **Design considerations**
+
+Simplify storage operations for people => Gain adoption/installs on laptops and small private installations => Attract customers to DBaaS by seamless integration between our tooling and cloud.
+
+Proposed architecture addresses:
+
+- High availability -- tolerates n/2 - 1 failures
+- Multi-tenancy -- one storage for all databases
+- Elasticity -- increase storage size on the go by adding nodes
+- Snapshots / backups / PITR with S3 offload
+- Compression
+
+Minuses are:
+
+- Quite a lot of work
+- Single page access may touch a few disk pages
+- Some bloat in data — may slow down sequential scans
+
+## **Summary**
+
+The storage cluster is a sharded key-value store with ordered keys. The key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. The value is either a page or a page diff/WAL record. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when grows bigger then soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed.
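To make the ordered-key idea concrete, here is a minimal Rust sketch (the field names follow the tuple above; the field types and the `Chunk` helper are illustrative assumptions, not taken from any implementation). Deriving `Ord` compares fields in declaration order, which gives exactly the lexicographic ordering the sharding scheme relies on:

```rust
// Sketch only: a lexicographically ordered page_key and the chunk range check.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
struct PageKey {
    pg_id: u64,
    db_id: u32,
    timeline_id: u32,
    rel_id: u32,
    forkno: u8,
    segno: u32,
    pageno: u32,
    lsn: u64,
}

// A chunk owns a contiguous key range [lo, hi); routing a WAL record or a
// page request means finding the chunk whose range contains its PageKey.
struct Chunk { lo: PageKey, hi: PageKey }

impl Chunk {
    fn owns(&self, key: &PageKey) -> bool { *key >= self.lo && *key < self.hi }
}
```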
Chunk itself is a filesystem directory with following sub directories: + +``` + +|-chunk_42/ + |-store/ -- contains lsm with pages/pagediffs ranging from + | page_key_lo to page_key_hi + |-wal/ + | |- db_1234/ db-specific wal files with pages from page_key_lo + | to page_key_hi + | + |-chunk.meta -- small file with snapshot references + (page_key_prefix+lsn+name) + and PITR regions (page_key_start, page_key_end) +``` + +## **Chunk** + +Chunk is responsible for storing pages potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (****page_key****) with following fields: + +- `pg_id` -- unique id of given postgres instance (or postgres cluster as it is called in postgres docs) +- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance +- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later +- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so table indices were closer to table itself on our global key space. +- `(forkno, segno, pageno)` -- page coordinates in postgres data files +- `lsn_timeline` -- postgres feature, increments when PITR was done. +- `lsn` -- lsn of current page version. + +Chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. Processing node looks at page in wal record and sends record to a chunk responsible for this page range. When wal record arrives to a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then background process moves records from that wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records would be materialized into lsm memtable and when that memtable is flushed to SSTable on disk we may trim the wal. That way some not durably (in the distributed sense) committed pages may enter the tree -- here we rely on processing node behavior: page request from processing node should contain proper lsm horizons so that storage node may respond with proper page version. + +LSM here is a usual LSM for variable-length values: at first data is stored in memory (we hold incoming wal records to be able to regenerate it after restart) at some balanced tree. When this tree grows big enough we dump it into disk file (SSTable) sorting records by key. Then SStables are mergesorted in the background to a different files. All file operation are sequential and do not require WAL for durability. + +Content of SSTable can be following: + +```jsx +(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data) +(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data) +(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff) +(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff) +(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data) +``` + +So query for `pageno=42 up to lsn=260` would need to find closest entry less then this key, iterate back to the latest full page and iterate forward to apply diffs. How often page is materialized in lsn-version sequence is up to us -- let's say each 5th version should be a full page. + +### **Page deletion** + +To delete old pages we insert blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into a lsm tree. During merges such marker would indicate that all pages with smaller lsn should be discarded. 
Delete marker will travel down the tree levels hierarchy until it reaches last level. In non-PITR scenario where old page version are not needed at all such deletion marker would (in average) prevent old page versions propagation down the tree -- so all bloat would concentrate at higher tree layers without affecting bigger bottom layers. + +### **Recovery** + +Upon storage node restart recent WAL files are applied to appropriate pages and resulting pages stored in lsm memtable. So this should be fast since we are not writing anything to disk. + +### **Checkpointing** + +No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer. + +### **Full page writes (torn page protection)** + +Storage node never updates individual pages, only merges SSTable, so torn pages is not an issue. + +### **Snapshot** + +That is the part that I like about this design -- snapshot creation is instant and cheap operation that can have flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in `chunk.meta` file with lsn of this snapshot and key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits pages deletion within this range. Storage node may not know anything about page internals, but by changing number of fields in our prefix we may change snapshot granularity. + +It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that snapshot of relation would include it's indices. Also table snapshot would trickily interact with catalog. Probably all table snapshots should hold also a catalog snapshot. And when node is started with such snapshot it should check that only tables from snapshot are queried. I assume here that for snapshot reading one need to start a new postgres instance. + +Storage consumed by snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on cost of different storages) about when to offload old snapshot to s3. For example, if current database has more then 40% of changed pages with respect to previous snapshot then we may offload that snapshot to s3, and release this space. + +**Starting db from snapshot** + +When we are starting database from snapshot it can be done in two ways. First, we may create new db_id, move all the data from snapshot to a new db and start a database. Second option is to create Copy-on-Write (CoW) instance out of snapshot and read old pages from old snapshot and store new pages separately. That is why there is `db_timeline` key field near `db_id` -- CoW (🐮) database should create new `db_timeline` and remember old `db_timeline`. Such a database can have hashmap of pages that it is changed to query pages from proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by new instance would not bloat data of initial snapshot. It is not clear for whether it is possibly to effectively support "stacked" CoW snapshot, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find right one. But again that way we bloat snapshot with unrelated data and may slowdown full scans that are happening in different database). + +**Snapshot export/import** + +Once we may start CoW instances it is easy to run auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` and export data from the snapshot to some portable formats. 
Also we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via network. + +### **PITR area** + +In described scheme PITR is just a prohibition to delete any versions within some key prefix, either it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc. + +PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push same (or bigger) SSTables to s3 and maintain lsm structure there. + +### **Compression** + +Since we are storing page diffs of variable sizes there is no structural dependency on a page size and we may compress it. Again that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity. + +### **Chunk metadata** + +Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers. + +### **Chunk splitting** + +*(NB: following paragraph is about how to avoid page splitting)* + +When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: + +1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. + +2. Prohibit WAL deletion and old SSTables deletion on original chunk. + +3. On each lsm layer we would need to split only one SSTable, all other would fit within left or right range. Symlink/split that files to new chunks. + +4. Start WAL replay on new chunks. + +5. Update global metadata about new chunk boundaries. + +6. Eventually (metadata update should be pushed to processing node by metadata service) storage node will start sending WAL and page requests to the new nodes. + +7. New chunk may start serving read queries when following conditions are met: + +a) it receives at least on WAL record from processing node + +b) it replayed all WAL up to the new received one + +c) checked by downlinks that there were no WAL gaps. + +Chunk split as it is described here is quite fast operation when it is happening on the local disk -- vast majority of files will be just moved without copying anything. I suggest to keep split always local and not to mix it with chunk moving around cluster. So if we want to split some chunk but there is small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting. + +### Fixed chunks + +Alternative strategy is to not to split at all and have pageno-fixed chunk boundaries. When table is created we first materialize this chunk by storing first new pages only and chunks is small. Then chunk is growing while table is filled, but it can't grow substantially bigger then allowed pageno range, so at max it would be 1GB or whatever limit we want + some bloat due to snapshots and old page versions. + +### **Chunk lsm internals** + +So how to implement chunk's lsm? + +- Write from scratch and use RocksDB to prototype/benchmark, then switch to own lsm implementation. RocksDB can provide some sanity check for performance of home-brewed implementation and it would be easier to prototype. +- Use postgres as lego constructor. We may model memtable with postgres B-tree referencing some in-memory log of incoming records. 
SSTable merging may reuse postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't came up with idea how to fit that) -- is multi-tenancy. If we are storing pages from different databases we can't use postgres buffer pool, since there is no db_id in the page header. We can add new field there but IMO it would be no go for committing that to vanilla. + +Other possibility is to not to try to fit few databases in one storage node. But that way it is no go for multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, all with it own local page cache. So that would be much closer to ordinary managed RDS. + +Multi-tenant storage makes sense even on a laptop, when you work with different databases, running tests with temp database, etc. And when installation grows bigger it start to make more and more sense, so it seems important. + +# Storage fleet + +# **Storage fleet** + +- When database is smaller then a chunk size we naturally can store them in one chunk (since their page_key would fit in some chunk's [hi, lo) range). + +Screenshot_2021-02-22_at_16 49 17 + +Few databases are stored in one chunk, replicated three times + +- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster. + +Screenshot_2021-02-22_at_16 49 10 + +Here one big database occupies two set of nodes. Also some chunks were moved around to restore replication factor after disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel. + +## **Chunk placement strategies** + +There are few scenarios where we may want to move chunks around the cluster: + +- disk usage on some node is big +- some disk experienced a failure +- some node experienced a failure or need maintenance + +## **Chunk replication** + +Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating global metadata, waiting for WAL to come, replaying previous WAL and becoming online -- more or less like during chunk split. + diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md new file mode 100644 index 0000000000..1a549c2df5 --- /dev/null +++ b/docs/rfcs/003-laptop-cli.md @@ -0,0 +1,267 @@ +# Command line interface (end-user) + +Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. + +This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. + +The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots. 
+ +# Possible usage scenarios + +## Install zenith, run a postgres + +``` +> brew install pg-zenith +> zenith pg create # creates pgdata with default pattern pgdata$i +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 0G zenith-local localhost:5432 +``` + +## Import standalone postgres to zenith + +``` +> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +[====================------------] 60% | 20MB/s +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - + +> zenith pg create --snapshot oldpg +Started postgres on localhost:5432 + +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot destroy oldpg +Ok +``` + +Also, we may start snapshot import implicitly by looking at snapshot schema + +``` +> zenith pg create --snapshot basebackup://replication@localhost:5432/ +Downloading snapshot... Done. +Started postgres on localhost:5432 +Destroying snapshot... Done. +``` + +## Pull snapshot with some publicly shared database + +Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). + +``` +> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +``` + +## Create snapshot and push it to the cloud + +``` +> zenith snapshot create pgdata1@snap1 +> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +``` + +## Rollback database to the snapshot + +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. + +``` +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot create pgdata1@snap1 + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@CURRENT 6G - + +> zenith pg checkout pgdata1@snap1 +Stopping postgres on pgdata1. +Rolling back pgdata1@CURRENT to pgdata1@snap1. +Starting postgres on pgdata1. + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@HEAD{0} 6G - +pgdata1@CURRENT 6G - +``` + +Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state of the database in the data directory. When we are checking out some snapshot CURRENT will be set to this snapshot and the old CURRENT state will be named HEAD{0} (0 is the number of postgres timeline, it would be incremented after each such checkout). + +## Configure PITR area (Point In Time Recovery). + +PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). + +``` +> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +``` + +Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. + +# Manual + +## storage + +Storage is either zenith pagestore or s3. 
Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. + +**zenith storage attach** -t [native|s3] -c key=value -n name + +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. + + +**zenith storage list** + +Show currently attached storages. For example: + +``` +> zenith storage list +NAME USED TYPE OPTIONS PATH +local 5.1G zenith-local /opt/zenith/store/local +local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr +zcloud 60G zenith-remote zenith.tech/stas/mystore +s3tank 80G S3 +``` + +**zenith storage detach** + +**zenith storage show** + + + +## pg + +Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves. + +Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. + +**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata + +Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. + +--no-start: just init datadir without creating + +--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) + +--cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) + +**zenith pg destroy** + +**zenith pg start** [--replica] pgdata + +Start postgres with proper extensions preloaded/installed. + +**zenith pg checkout** + +Rollback data directory to some previous snapshot. + +**zenith pg stop** pg_id + +**zenith pg list** + +``` +ROLE PGDATA USED STORAGE ENDPOINT +primary my_pg 5.1G local localhost:5432 +replica-1 localhost:5433 +replica-2 localhost:5434 +primary my_pg2 3.2G local.compr localhost:5435 +- my_pg3 9.2G local.compr - +``` + +**zenith pg show** + +``` +my_pg: + storage: local + space used on local: 5.1G + space used on all storages: 15.1G + snapshots: + on local: + snap1: 1G + snap2: 1G + on zcloud: + snap2: 1G + on s3tank: + snap5: 2G + pitr: + on s3tank: + pitr_one_month: 45G + +``` + +**zenith pg start-rest/graphql** pgdata + +Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. + + +## snapshot + +Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. + +**zenith snapshot create** pgdata_name@snap_name + +Creates a new snapshot in the same storage where pgdata_name exists. + +**zenith snapshot push** --to url pgdata_name@snap_name + +Produces binary stream of a given snapshot. 
Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. + +**zenith snapshot recv** + +Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. + +**zenith snapshot pull** --from url or path + +Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. + +**zenith snapshot import** --from basebackup://<...> or path + +Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. + +**zenith snapshot export** + +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). + +**zenith snapshot diff** snap1 snap2 + +Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. + +**zenith snapshot destroy** + +## pitr + +Pitr represents wal stream and ttl policy for that stream + +XXX: any suggestions on a better name? + +**zenith pitr create** name + +--ttl = inf | period + +--size-limit = inf | limit + +--storage = storage_name + +**zenith pitr extract-snapshot** pitr_name --lsn xxx + +Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) + +**zenith pitr gc** pitr_name + +Force garbage collection on some PITR area. + +**zenith pitr list** + +**zenith pitr destroy** + + +## console + +**zenith console** + +Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md new file mode 100644 index 0000000000..d4716156d1 --- /dev/null +++ b/docs/rfcs/004-durability.md @@ -0,0 +1,218 @@ +Durability & Consensus +====================== + +When a transaction commits, a commit record is generated in the WAL. +When do we consider the WAL record as durable, so that we can +acknowledge the commit to the client and be reasonably certain that we +will not lose the transaction? + +Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +A WAL record is considered durable, when it has been written to a +majority of WAL safekeeper nodes. In this document, I use 5 +safekeepers, because I have five fingers. A WAL record is durable, +when at least 3 safekeepers have written it to disk. + +First, assume that only one primary node can be running at a +time. This can be achieved by Kubernetes or etcd or some +cloud-provider specific facility, or we can implement it +ourselves. These options are discussed in later chapters. For now, +assume that there is a Magic STONITH Fairy that ensures that. + +In addition to the WAL safekeeper nodes, the WAL is archived in +S3. WAL that has been archived to S3 can be removed from the +safekeepers, so the safekeepers don't need a lot of disk space. 
+ +``` + +----------------+ + +-----> | WAL safekeeper | + | +----------------+ + | +----------------+ + +-----> | WAL safekeeper | ++------------+ | +----------------+ +| Primary | | +----------------+ +| Processing | ---------+-----> | WAL safekeeper | +| Node | | +----------------+ ++------------+ | +----------------+ + \ +-----> | WAL safekeeper | + \ | +----------------+ + \ | +----------------+ + \ +-----> | WAL safekeeper | + \ +----------------+ + \ + \ + \ + \ + \ +--------+ + \ | | + +------> | S3 | + | | + +--------+ + +``` +Every WAL safekeeper holds a section of WAL, and a VCL value. +The WAL can be divided into three portions: + +``` + VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL +``` + +Note that all this WAL kept in a safekeeper is a contiguous section. +This is different from Aurora: In Aurora, there can be holes in the +WAL, and there is a Gossip protocol to fill the holes. That could be +implemented in the future, but let's keep it simple for now. WAL needs +to be written to a safekeeper in order. However, during crash +recovery, In-flight WAL that has already been stored in a safekeeper +can be truncated or overwritten. + +The Archived WAL has already been stored in S3, and can be removed from +the safekeeper. + +The Completed WAL has been written to at least three safekeepers. The +algorithm ensures that it is not lost, when at most two nodes fail at +the same time. + +The In-flight WAL has been persisted in the safekeeper, but if a crash +happens, it may still be overwritten or truncated. + + +The VCL point is determined in the Primary. It is not strictly +necessary to store it in the safekeepers, but it allows some +optimizations and sanity checks and is probably generally useful for +the system as whole. The VCL values stored in the safekeepers can lag +behind the VCL computed by the primary. + + +Primary node Normal operation +----------------------------- + +1. Generate some WAL. + +2. Send the WAL to all the safekeepers that you can reach. + +3. As soon as a quorum of safekeepers have acknowledged that they have + received and durably stored the WAL up to that LSN, update local VCL + value in memory, and acknowledge commits to the clients. + +4. Send the new VCL to all the safekeepers that were part of the quorum. + (Optional) + + +Primary Crash recovery +---------------------- + +When a new Primary node starts up, before it can generate any new WAL +it needs to contact a majority of the WAL safekeepers to compute the +VCL. Remember that there is a Magic STONITH fairy that ensures that +only node process can be doing this at a time. + +1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you + can reach. This is the Winner safekeeper, and its LSN becomes the new VCL. + +2. Update the other safekeepers you can reach, by copying all the WAL + from the Winner, starting from each safekeeper's old VCL point. Any old + In-Flight WAL from previous Epoch is truncated away. + +3. Increment Epoch, and send the new Epoch to the quorum of + safekeepers. (This ensures that if any of the safekeepers that we + could not reach later come back online, they will be considered as + older than this in any future recovery) + +You can now start generating new WAL, starting from the newly-computed +VCL. + +Optimizations +------------- + +As described, the Primary node sends all the WAL to all the WAL safekeepers. That +can be a lot of network traffic. 
Instead of sending the WAL directly from Primary, +some safekeepers can be daisy-chained off other safekeepers, or there can be a +broadcast mechanism among them. There should still be a direct connection from the +each safekeeper to the Primary for the acknowledgments though. + +Similarly, the responsibility for archiving WAL to S3 can be delegated to one of +the safekeepers, to reduce the load on the primary. + + +Magic STONITH fairy +------------------- + +Now that we have a system that works as long as only one primary node is running at a time, how +do we ensure that? + +1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary + when it's holding a valid lease. If the primary node dies, the lease expires after a timeout + period, and a new node is allowed to become the primary. + +2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you + cannot do this safely. In practice, it would probably be OK if you make the lease times and + timeouts long enough. This has the advantage that we don't need to introduce a new + component to the architecture. + +3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The + next chapter describes this option. + + +Built-in Paxos +-------------- + +The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes +as both Proposers and Learners. + +Each WAL safekeeper holds an Epoch value in addition to the VCL and +the WAL. Each request by the primary to safekeep WAL is accompanied by +an Epoch value. If a safekeeper receives a request with Epoch that +doesn't match its current Accepted Epoch, it must ignore (NACK) it. +(In different Paxos papers, Epochs are called "terms" or "round +numbers") + +When a node wants to become the primary, it generates a new Epoch +value that is higher than any previously observed Epoch value, and +globally unique. + + +Accepted Epoch: 555 VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Primary node startup: + +1. Contact all WAL safekeepers that you can reach (if you cannot + connect to a quorum of them, you can give up immediately). Find the + latest Epoch among them. + +2. Generate a new globally unique Epoch, greater than the latest Epoch + found in previous step. + +2. Send the new Epoch in a Prepare message to a quorum of + safekeepers. (PAXOS Prepare message) + +3. Each safekeeper responds with a Promise. If a safekeeper has + already made a promise with a higher Epoch, it doesn't respond (or + responds with a NACK). After making a promise, the safekeeper stops + responding to any write requests with earlier Epoch. + +4. Once you have received a majority of promises, you know that the + VCL cannot advance on the old Epoch anymore. This effectively kills + any old primary server. + +5. Find the highest written LSN among the quorum of safekeepers (these + can be included in the Promise messages already). This is the new + VCL. If a new node starts the election process after this point, + it will compute the same or higher VCL. + +6. Copy the WAL from the safekeeper with the highest LSN to the other + safekeepers in the quorum, using the new Epoch. (PAXOS Accept + phase) + +7. You can now start generating new WAL starting from the VCL. If + another process starts the election process after this point and + gains control of a majority of the safekeepers, we will no longer + be able to advance the VCL. 
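To spell out the quorum arithmetic used above (a sketch for illustration, not actual safekeeper code): during normal operation, the position the primary may acknowledge to clients is the highest LSN that at least a quorum of safekeepers have durably stored, i.e. the quorum-th largest acknowledged LSN.

```rust
/// Illustrative only: given the highest LSN each reachable safekeeper has
/// durably written, return the LSN up to which commits can be acknowledged.
fn commit_horizon(mut acked_lsns: Vec<u64>, quorum: usize) -> Option<u64> {
    if acked_lsns.len() < quorum {
        return None; // no quorum, nothing can be acknowledged
    }
    acked_lsns.sort_unstable_by(|a, b| b.cmp(a)); // sort descending
    Some(acked_lsns[quorum - 1]) // stored on at least `quorum` safekeepers
}

// With 5 safekeepers and quorum = 3:
// commit_horizon(vec![900, 850, 820, 400, 0], 3) == Some(820)
// LSN 820 is on three nodes; 850 and 900 are only on two and one, respectively.
```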
+ diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md new file mode 100644 index 0000000000..e36d0a9ae3 --- /dev/null +++ b/docs/rfcs/005-zenith_local.md @@ -0,0 +1,103 @@ +# Zenith local + +Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. + +#### Why do we need it? +- For distribution - this easy to use binary will help us to build adoption among developers. +- For internal use - to test all components together. + +In my understanding, we consider it to be just a mock-up version of zenith-cloud. +> Question: How much should we care about durability and security issues for a local setup? + + +#### Why is it better than a simple local postgres? + +- Easy one-line setup. As simple as `cargo install zenith && zenith start` + +- Quick and cheap creation of compute nodes over the same storage. +> Question: How can we describe a use-case for this feature? + +- Zenith-local can work with S3 directly. + +- Push and pull images (snapshots) to remote S3 to exchange data with other users. + +- Quick and cheap snapshot checkouts to switch back and forth in the database history. +> Question: Do we want it in the very first release? This feature seems quite complicated. + +#### Distribution: + +Ideally, just one binary that incorporates all elements we need. +> Question: Let's discuss pros and cons of having a separate package with modified PostgreSQL. + +#### Components: + +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli + +- **zenith-console** - WEB UI with same functionality as CLI. +>Note: not for the first release. + +- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. + +- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +> Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? + +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src + +- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? +> Question: Do we use it together with local page store or they are interchangeable? + +WIP code is ??? + +- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. 
+ +WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper + +- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. + + WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + +#### REST API: + +Service endpoint: `http://localhost:3000` + +Resources: +- /storages - Where data lives: zenith-pageserver or zenith-s3 +- /pgs - Postgres - zenith-computenode +- /snapshots - snapshots **TODO** + +>Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? + +Methods and their mapping to CLI: + +- /storages - zenith-pageserver or zenith-s3 + +CLI | REST API +------------- | ------------- +storage attach -n name --type [native\s3] --path=[datadir\URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages +storage detach -n name | DELETE /storages/:storage_name +storage list | GET /storages +storage show -n name | GET /storages/:storage_name + + +- /pgs - zenith-computenode + +CLI | REST API +------------- | ------------- +pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs +pg destroy -n name | DELETE /pgs/:pg_name +pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions +pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions +pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions +pg list | GET /pgs +pg show -n name | GET /pgs/:pg_name + +- /snapshots **TODO** + +CLI | REST API +------------- | ------------- + diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md new file mode 100644 index 0000000000..84dc932211 --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -0,0 +1,64 @@ +Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". + +# CLI v2 (after chatting with Carl) + +Zenith introduces the notion of a repository. 
+ +```bash +zenith init +zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +``` + +Once you have a cluster catalog you can explore it + +```bash +zenith log -- returns a list of commits +zenith status -- returns if there are changes in the catalog that can be committed +zenith commit -- commits the changes and generates a new commit hash +zenith branch experimental -- creates a branch called testdb based on a given commit hash +``` + +To make changes in the catalog you need to run compute nodes + +```bash +-- here is how you a compute node +zenith start /home/pipedpiper/northwind:main -- starts a compute instance +zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) + +-- After running some DML you can run +-- zenith status and see how there are two WAL streams one on top of +-- the main branch +zenith status +-- and another on top of the experimental branch +zenith status -b experimental + +-- you can commit each branch separately +zenith commit main +-- or +zenith commit -c /home/pipedpiper/northwind:experimental +``` + +Starting compute instances against cloud environments + +```bash +-- you can start a compute instance against the cloud environment +-- in this case all of the changes will be streamed into the cloud +zenith start https://zenith:tech/pipedpiper/northwind:main +zenith start https://zenith:tech/pipedpiper/northwind:main +zenith status -c https://zenith:tech/pipedpiper/northwind:main +zenith commit -c https://zenith:tech/pipedpiper/northwind:main +zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +``` + +Pushing data into the cloud + +```bash +-- pull all the commits from the cloud +zenith pull +-- push all the commits to the cloud +zenith push +``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md new file mode 100644 index 0000000000..e6e6e172ad --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -0,0 +1,140 @@ +# Repository format + +A Zenith repository is similar to a traditional PostgreSQL backup +archive, like a WAL-G bucket or pgbarman backup catalogue. It holds +multiple versions of a PostgreSQL database cluster. + +The distinguishing feature is that you can launch a Zenith Postgres +server directly against a branch in the repository, without having to +"restore" it first. Also, Zenith manages the storage automatically, +there is no separation between full and incremental backups nor WAL +archive. Zenith relies heavily on the WAL, and uses concepts similar +to incremental backups and WAL archiving internally, but it is hidden +from the user. + +## Directory structure, version 1 + +This first version is pretty straightforward but not very +efficient. Just something to get us started. 
+
+The repository directory looks like this:
+
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots//
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
+
+    .zenith/refs/branches/mybranch
+    .zenith/refs/tags/foo
+    .zenith/refs/tags/bar
+
+    .zenith/datadirs/
+
+### Timelines
+
+A timeline is similar to PostgreSQL's timeline, but is identified by a
+UUID instead of a 32-bit timeline Id. For user convenience, it can be
+given a name that refers to the UUID (called a branch).
+
+All WAL is generated on a timeline. You can launch a read-only node
+against a tag or arbitrary LSN on a timeline, but in order to write,
+you need to create a timeline.
+
+Each timeline is stored in a directory under .zenith/timelines. It
+consists of a WAL archive, containing all the WAL in the standard
+PostgreSQL format, under the wal/ subdirectory.
+
+The 'snapshots/' subdirectory contains "base backups" of the data
+directory at different LSNs. Each snapshot is simply a copy of the
+Postgres data directory.
+
+When a new timeline is forked from a previous timeline, the ancestor
+timeline's UUID is stored in the 'history' file.
+
+### Refs
+
+There are two kinds of named objects in the repository: branches and
+tags. A branch is a human-friendly name for a timeline UUID, and a
+tag is a human-friendly name for a specific LSN on a timeline
+(timeline UUID + LSN). Like in git, these are just for user
+convenience; you can also use timeline UUIDs and LSNs directly.
+
+Refs do have one additional purpose though: naming a timeline or LSN
+prevents it from being automatically garbage collected.
+
+The refs directory contains a small text file for each tag/branch. It
+contains the UUID of the timeline (and LSN, for tags).
+
+### Datadirs
+
+.zenith/datadirs contains PostgreSQL data directories. You can launch
+a Postgres instance on one of them with:
+
+```
+ postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
+```
+
+All the actual data is kept in the timeline directories, under
+.zenith/timelines. The data directories are only needed for active
+PostgreSQL instances. After an instance is stopped, the data directory
+can be safely removed. "zenith start" will recreate it quickly from
+the data in .zenith/timelines, if it's missing.
+
+## Version 2
+
+The format described above isn't very different from a traditional
+daily base backup + WAL archive configuration. The main difference is
+the nicer naming of branches and tags.
+
+That's not very efficient. For performance, we need something like
+incremental backups that don't require making a full copy of all
+data. So only store modified files or pages. And instead of having to
+replay all WAL from the last snapshot, "slice" the WAL into
+per-relation WAL files and only recover what's needed when a table is
+accessed.
+
+In version 2, the file format in the "snapshots" subdirectory gets
+more advanced. The exact format is TODO. But it should support:
+- storing WAL records of individual relations/pages
+- storing a delta from an older snapshot
+- compression
+
+
+## Operations
+
+### Garbage collection
+
+When you run "zenith gc", old timelines that are no longer needed are
+removed. That involves collecting the list of "unreachable" objects,
+starting from the named branches and tags.
+
+Also, if enough WAL has been generated on a timeline since the last
+snapshot, a new snapshot or delta is created.
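A sketch of the reachability walk that "zenith gc" implies (the types and helper below are hypothetical; the real implementation may differ): every timeline named by a branch or tag is kept, together with all ancestors reachable through the 'history' files, and anything else is a candidate for removal.

```rust
use std::collections::{HashMap, HashSet};

/// Timelines reachable from the refs, following the ancestor UUID recorded
/// in each timeline's 'history' file. Unreachable timelines can be removed.
fn reachable_timelines(
    ref_targets: &[String],                // timeline UUIDs named by branches/tags
    ancestor_of: &HashMap<String, String>, // timeline UUID -> ancestor timeline UUID
) -> HashSet<String> {
    let mut keep = HashSet::new();
    for start in ref_targets {
        let mut cur = Some(start.clone());
        while let Some(id) = cur {
            if !keep.insert(id.clone()) {
                break; // already visited via another ref
            }
            cur = ancestor_of.get(&id).cloned();
        }
    }
    keep
}
```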
+ +### zenith push/pull + +Compare the tags and branches on both servers, and copy missing ones. +For each branch, compare the timeline it points to in both servers. If +one is behind the other, copy the missing parts. + +FIXME: how do you prevent confusion if you have two clones of the same +repository, launch an instance on the same branch in both clones, and +later try to push/pull between them? Perhaps create a new timeline +every time you start up an instance? Then you would detect that the +timelines have diverged. That would match with the "epoch" concept +that we have in the WAL safekeeper. + +### zenith checkout/commit + +In this format, there is no concept of a "working tree", and hence no +concept of checking out or committing. All modifications are done on +a branch or a timeline. As soon as you launch a server, the changes are +appended to the timeline. + +You can easily fork off a temporary timeline to emulate a "working tree". +You can later remove it and have it garbage collected, or, to "commit", +re-point the branch to the new timeline. + +If we want to have a worktree and "zenith checkout/commit" concept, we can +emulate that with a temporary timeline. Create the temporary timeline at +"zenith checkout", and have "zenith commit" modify the branch to point to +the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md new file mode 100644 index 0000000000..e6355f4a03 --- /dev/null +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -0,0 +1,93 @@ +How it works now +---------------- + +1. Create repository, start page server on it + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create a branch, and start a Postgres instance on it + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 + +$ zenith pg create heikki +Initializing Postgres on timeline 76cf9279915be7797095241638e64644... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 + +$ zenith pg start pg1 +Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' +waiting for server to start.... done +server started +``` + + +3. Connect to it and run queries + +``` +$ psql "dbname=postgres port=55432" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +Proposal: Serverless on your Laptop +----------------------------------- + +We've been talking about doing the "pg create" step automatically at +"pg start", to eliminate that step. What if we go further and go +serverless on your laptop, so that the workflow becomes just: + +1. Create repository, start page server on it (same as before) + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create branch + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 +``` + +3. Connect to it: + +``` +$ psql "dbname=postgres port=5432 branch=heikki" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +The trick behind the scenes is that when you launch the page server, +it starts to listen on port 5432. When you connect to it with psql, it +looks at the 'branch' parameter that you passed in the connection +string.
It automatically performs the "pg create" and "pg start" steps +for that branch, and then forwards the connection to the Postgres +instance that it launched. After you disconnect, if there are no more +active connections to the server running on the branch, it can +automatically shut it down again. + +This is how serverless would work in the cloud. We can do it on your +laptop, too. diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md new file mode 100644 index 0000000000..272628e1ce --- /dev/null +++ b/docs/rfcs/008-push-pull.md @@ -0,0 +1,66 @@ +# Push and pull between pageservers + +Here is a proposal for implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3, but that would depend on the exact storage format, so we don't touch it in this proposal. + +## Origin management + +An origin represents connection info for some remote pageserver. Let's use the same commands here as git does, except with an explicit list subcommand (git uses `origin -v` for that). + +``` +zenith origin add +zenith origin list +zenith origin remove +``` + +The connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs, require SSH as a transport, or invent some other kind of transport. + +Behind the scenes, these commands may update a TOML file inside the .zenith directory. + +## Push + +### Pushing a branch + +``` +zenith push mybranch cloudserver # push to eponymous branch in cloudserver +zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +``` + +The exact mechanics would be slightly different in the following situations: + +1) Destination branch does not exist. + + That is the simplest scenario. We can just create an empty branch (or timeline in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines, so I suggest skipping any check that there is a common ancestor and just filling it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether this timeline belongs to this pageserver's repository or not [*]. + + The exact mechanics may be the following: + + * The CLI asks the local pageserver to perform a push and hands over the connection URI: `perform_push `. + * The local pageserver connects to the remote pageserver and runs `branch_push `. The handler for `branch_create` would create the destination timeline and switch the connection to COPY BOTH mode. + * The sending pageserver may start an iterator on that timeline and send all the records as COPY messages. + +2) Destination branch exists and latest_valid_lsn is less than ours. + + In this case, we need to send the missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send the ones that are newer than the remote LSN. Later we should probably add a sparse bitmap that tracks changed pages, to avoid a full scan. + +3) Destination branch exists and latest_valid_lsn is bigger than ours. + + In this case, we can't push to that branch. We can only pull. + +### Pulling a branch + +Here we need to handle the same three cases, but also keep in mind that the local pageserver can be behind NAT, so we can't trivially reuse pushing by asking the remote to 'perform_push' to our address.
So we would need a new set of commands: + +* The CLI calls `perform_pull ` on the local pageserver. +* The local pageserver calls `branch_pull ` on the remote pageserver. +* The remote pageserver sends records in our direction. + +But despite the different set of commands, the code that iterates over records and the receiving code that inserts those records can be the same for both pull and push. + + + +[*] It looks to me like there are two possible approaches to handling unrelated timelines: + +1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not. +2) Transparently create and manage several repositories in one pageserver. + +But that is a topic for a separate RFC/discussion. diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md new file mode 100644 index 0000000000..0acbd68f86 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -0,0 +1,56 @@ +While working on the export/import commands, I realized that they fit really well into the "snapshot-first design". + +We may think of backups as snapshots in a different format (i.e., plain pgdata format, basebackup tar format, WAL-G format (if they want to support it), and so on). They use the same storage API; the only difference is the code that packs/unpacks files. + +Even if Zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from Postgres to Zenith. + +So here is an attempt to design a consistent CLI for different usage scenarios: + +#### 1. Start an empty pageserver. +That is what we have now. +Init an empty pageserver using `initdb` in a temporary directory. + +The `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type; all other parameters are passed via env variables. Inspired by WAL-G-style naming: https://wal-g.readthedocs.io/STORAGES/. + +Save `storage_dest` and other parameters in the config. +Push snapshots to `storage_dest` in the background. + +``` +zenith init --storage_dest=S3_PREFIX +zenith start +``` + +#### 2. Restart the pageserver (manually or crash recovery). +Take `storage_dest` from the pageserver config and start the pageserver from the latest snapshot in `storage_dest`. +Push snapshots to `storage_dest` in the background. + +``` +zenith start +``` + +#### 3. Import. +Start the pageserver from an existing snapshot. +The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...` +Do not save `snapshot_path` and `snapshot_format` in the config, as this is a one-time operation. +Save the `storage_dest` parameters in the config. +Push snapshots to `storage_dest` in the background. +``` +//I.e. we want to start zenith on top of an existing $PGDATA and use S3 as persistent storage. +zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +zenith start +``` +How do we pass the credentials needed for `snapshot_path`? + +#### 4. Export. +Manually push a snapshot to `snapshot_path`, which differs from `storage_dest`. +Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +``` +zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +``` + +#### Notes and questions +- The safekeeper s3_offload should use the same (or similar) syntax for storage. How do we set it in the UI? +- Why do we need `zenith init` as a separate command? Can't we init everything at the first start? +- We can think of better names for all options. +- Export to plain Postgres format will be useless if we are not 100% compatible at the page level.
+I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md new file mode 100644 index 0000000000..29d3614d34 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -0,0 +1,227 @@ +# Preface + +GetPage@LSN can be called with older LSNs, and the page server needs +to be able to reconstruct older page versions. That's needed for +having read-only replicas that lag behind the primary, or that are +"anchored" at an older LSN, and internally in the page server when you +branch at an older point in time. How do you do that? + +For now, I'm not considering incremental snapshots at all. I don't +think that changes things. So whenever you create a snapshot or a +snapshot file, it contains an image of all the pages, there is no need +to look at an older snapshot file. + +Also, I'm imagining that this works on a per-relation basis, so that +each snapshot file contains data for one relation. A "relation" is a +fuzzy concept - it could actually be one 1 GB relation segment. Or it +could include all the different "forks" of a relation, or you could +treat each fork as a separate relation for storage purpose. And once +we have the "non-relational" work is finished, a "relation" could +actually mean some other versioned object kept in the PostgreSQL data +directory. Let's ignore that for now. + +# Eric's RFC: + +Every now and then, you create a "snapshot". It means that you create +a new snapshot file for each relation that was modified after the last +snapshot, and write out the contents the relation as it is/was at the +snapshot LSN. Write-ahead log is stored separately in S3 by the WAL +safekeeping service, in the original PostgreSQL WAL file format. + + SNAPSHOT @100 WAL + . | + . | + . | + . | + SNAPSHOT @200 | + . | + . | + . | + . | + SNAPSHOT @300 | + . | + . V + IN-MEMORY @400 + +If a GetPage@LSN request comes from the primary, you return the latest +page from the in-memory layer. If there is no trace of the page in +memory, it means that it hasn't been modified since the last snapshot, +so you return the page from the latest snapshot, at LSN 300 in the +above example. + +PITR is implemented using the original WAL files: + +If a GetPage@LSN request comes from a read replica with LSN 250, you +read the image of the page from the snapshot at LSN 200, and you also +scan the WAL between 200 and 250, and apply all WAL records for the +requested page, to reconstruct it at LSN 250. + +Scanning the WAL naively for every GetPage@LSN request would be +expensive, so in practice you'd construct an in-memory data structure +of all the WAL between 200 and 250 once that allows quickly looking up +records for a given page. + +## Problems/questions + +I think you'll need to store the list of snapshot LSNs on each +timeline somewhere. + +If the latest snapshot of a relation is at LSN 100, and you request a +page at LSN 1000000, how do you know if there are some modifications +to it between 100 and 1000000 that you need to replay? You can scan +all the WAL between 100 and 1000000, but that would be expensive. + +You can skip that, if you know that a snapshot was taken e.g. at LSN +999900. Then you know that the fact that there is no snapshot file at +999900 means that the relation hasn't been modified between +100-999900. Then you only need to scan the WAL between 999900 and +1000000. 
However, there is no trace of a snapshot happening at LSN +999900 in the snapshot file for this relation, so you need to get +that information from somewhere else. + +Where do you get that information from? Perhaps you can scan all the +other relations, and if you see a snapshot file for *any* relation at +LSN 999900, you know that if there were modifications to this +relation, there would be a newer snapshot file for it, too. In other +words, the list of snapshots that have been taken can be constructed +by scanning all relations and computing the union of all snapshot LSNs +that you see for any relation. But that's expensive, so you should at +least keep that in memory after computing it once. Also, if you rely +on that, it's not possible to have snapshots at different intervals +for different files. That seems limiting. + +Another option is to explicitly store a list of snapshot LSNs in a +separate metadata file. + + +# Current implementation in the 'layered_repo' branch: + +We store snapshot files like in the RFC, but each snapshot file also +contains all the WAL in the range of LSNs, so that you don't need to +fetch the WAL separately from S3. So you have "layers" like this: + + SNAPSHOT+WAL 100-200 + | + | + | + | + SNAPSHOT+WAL 200-300 + | + | + | + | + IN-MEMORY 300- + +Each "snapshot+WAL" is a file that contains a snapshot - i.e. a full +copy of each page in the relation at the *start* LSN. In addition to +that, it contains all the WAL applicable to the relation from the +start LSN to the end LSN. With that, you can reconstruct any page +version in the range that the file covers. + + +## Problems/questions + +I can see one potential performance issue here, compared to the RFC. +Let's focus on a single relation for now. Imagine that you start from +an empty relation, and you receive WAL from 100 to 200, containing +a bunch of inserts and updates to the relation. You now have all that +WAL in memory: + + memory: WAL from 100-200 + +We decide that it's time to materialize that to a snapshot file on +disk. We materialize a full image of the relation as it was at LSN 100 +to the snapshot file, and include all of the WAL. Since the relation +was initially empty, the "image" at the beginning of the range is empty +too. + +So now you have one file on disk: + + SNAPSHOT+WAL 100-200 + +It contains a full image of the relation at LSN 100 and all WAL +between 100-200. (It's actually stored as a serialized BTreeMap of +page versions, with the page images and WAL records all stored +together in the same BTreeMap. But for this story, that's not +important.) + +We now receive more WAL updating the relation, up to LSN 300. We +decide it's time to materialize a new snapshot file, and we now have +two files: + + SNAPSHOT+WAL 100-200 + SNAPSHOT+WAL 200-300 + +Note that the latest "full snapshot" that we store on disk always lags +behind by one snapshot cycle. The first file contains a full image of +the relation at LSN 100, the second at LSN 200. When we have received +WAL up to LSN 300, we write a materialized image at LSN 200. That +seems a bit silly. In the design per your RFC, you would write +snapshots at LSNs 200 and 300 instead. That seems better. + + + +# Third option (not implemented yet) + +Store snapshot files like in the RFC, but also store per-relation +WAL files that contain WAL in a range of LSNs for that relation. + + SNAPSHOT @100 WAL 100-200 + . | + . | + . | + . | + SNAPSHOT @200 WAL 200-300 + . | + . | + . | + . | + SNAPSHOT @300 + . + .
+ IN-MEMORY 300- + + +This could be the best of both worlds. The snapshot files would be +independent of the PostgreSQL WAL format. When it's time to write +snapshot file @300, you write a full image of the relation at LSN 300, +and you write the WAL that you had accumulated between 200 and 300 to +a separate file. That way, you don't "lag behind" for one snapshot +cycle like in the current implementation. But you still have the WAL +for a particular relation readily available alongside the snapshot +files, and you don't need to track what snapshot LSNs exist +separately. + +(If we wanted to minimize the number of files, you could include the +snapshot @300 and the WAL between 200 and 300 in the same file, but I +feel it's probably better to keep them separate) + + + +# Further thoughts + +There's no fundamental reason why the LSNs of the snapshot files and the +ranges of the WAL files would need to line up. So this would be possible +too: + + SNAPSHOT @100 WAL 100-150 + . | + . | + . WAL 150-250 + . | + SNAPSHOT @200 | + . | + . WAL 250-400 + . | + . | + SNAPSHOT @300 | + . | + . | + IN-MEMORY 300- + +I'm not sure what the benefit of this would be. You could materialize +additional snapshot files in the middle of a range covered by a WAL +file, maybe? Might be useful to speed up access when you create a new +branch in the middle of an LSN range or if there's some other reason +to believe that a particular LSN is "interesting" and there will be +a lot of requests using it. diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md new file mode 100644 index 0000000000..75ed490f21 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -0,0 +1,148 @@ +# Snapshot-first storage architecture + +Goals: +- Long-term storage of database pages. +- Easy snapshots; simple snapshot and branch management. +- Allow cloud-based snapshot/branch management. +- Allow cloud-centric branching; decouple branch state from running pageserver. +- Allow customer ownership of data via s3 permissions. +- Provide same or better performance for typical workloads, vs plain postgres. + +Non-goals: +- Service database reads from s3 (reads should be serviced from the pageserver cache). +- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot). + +## Principle of operation + +The database “lives in s3”. This means that all of the long term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3. + +In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere. + +The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not. + +It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now. 
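+
+A minimal sketch of the "dirty pages" bookkeeping implied above (the types and
+names here are hypothetical, not the pageserver's real data structures): the
+live database is just the set of pages modified since the last object was
+written back to S3.
+
+```rust
+use std::collections::BTreeMap;
+
+// Illustrative types only.
+type BlockNumber = u32;
+type Lsn = u64;
+type PageImage = [u8; 8192];
+
+/// Pages modified since the last snapshot object was written to S3,
+/// together with the LSN of their latest write.
+#[derive(Default)]
+struct DirtyPages {
+    pages: BTreeMap<BlockNumber, (Lsn, Box<PageImage>)>,
+}
+
+impl DirtyPages {
+    /// Record a page write coming from WAL replay.
+    fn update(&mut self, blkno: BlockNumber, lsn: Lsn, img: Box<PageImage>) {
+        self.pages.insert(blkno, (lsn, img));
+    }
+
+    /// Take everything modified so far; the result becomes the page set of the
+    /// next (incremental) snapshot object. Once that object is durable in S3,
+    /// none of these pages need to be kept in local storage anymore.
+    fn drain_for_snapshot(&mut self) -> BTreeMap<BlockNumber, (Lsn, Box<PageImage>)> {
+        std::mem::take(&mut self.pages)
+    }
+}
+```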
+ +Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling. + +Objects in s3 are immutable snapshots, never to be modified once written (only deleted). + +Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental to keep storage costs low. + +It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between 30 seconds and 15 minutes, depending on cost/performance balance. + +No-longer needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots. + +A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica). + +WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.) + +## Pageserver operation + +To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed. + +To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down. + +It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch. + +The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot. + +The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 pages is written (regardless of whether the LSN 200 snapshot has completed.) + +If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches. 
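+
+To make the retention rule above concrete, here is a minimal sketch (function
+and parameter names are made up, not part of any actual API) of deciding which
+stored versions of a single page must be kept, given the LSNs of snapshots that
+are still being written:
+
+```rust
+use std::collections::BTreeSet;
+
+/// `versions` are the LSNs at which this page was written (ascending);
+/// `snapshot_lsns` are the LSNs of snapshots still in flight.
+/// Kept: the newest version (needed by the primary), plus, for every
+/// in-flight snapshot, the newest version at or below that snapshot's LSN.
+fn versions_to_keep(versions: &[u64], snapshot_lsns: &[u64]) -> BTreeSet<u64> {
+    let mut keep = BTreeSet::new();
+    if let Some(&latest) = versions.last() {
+        keep.insert(latest);
+    }
+    for &snap in snapshot_lsns {
+        if let Some(&v) = versions.iter().rev().find(|&&v| v <= snap) {
+            keep.insert(v);
+        }
+    }
+    keep
+}
+
+fn main() {
+    // The example from the text: the page was written at LSN 100, 200, 300, 400
+    // and a snapshot is in flight at LSN 299 -> keep 200 (for the snapshot) and
+    // 400 (the latest); 100 and 300 can be discarded.
+    let keep = versions_to_keep(&[100, 200, 300, 400], &[299]);
+    assert_eq!(keep, BTreeSet::from([200, 400]));
+    println!("{:?}", keep);
+}
+```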
+ +The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions. + +The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow. + +The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. redundant pageserver or an EBS page journal). + +A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot. + +## Cloud snapshot manager operation + +Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent): +Create/delete/clone/rename a database +Create a new branch (possibly from a historical snapshot) +Start/stop the pageserver/safekeeper on a branch +List databases/branches/snapshots that are visible to this user account + +Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries. + +This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries. + +## Snapshot names, deletion and concurrency + +There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone. + +For this reason, any attempted read should attempt a fallback procedure (list objects; search list for an equivalent object) if an attempted read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded. + +## Branching + +A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen: +- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch. 
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object. + - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages. + - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data. +- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice. + +Some of these steps could be combined into the pageserver, but that process would not be possible under all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same. + +## Long-term file format + +Snapshot files (and any other object stored in s3) must be readable by future software versions. + +It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management. + +Files should contain the following metadata, in addition to the set of pages: +- The version of the file format. +- A unique identifier for this branch (should be worldwide-unique and unchanging). +- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging). +- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges). +- The location of the predecessor branch snapshot, if different from this branch’s location. +- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0. +- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle). +- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity. + +A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database. + +Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only. + +## S3 semantics, and other kinds of storage + +For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket. + +Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited but it’s not a priority, either. + +Alternate implementations of s3 should be supported, including Google Cloud Storage. + +Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose. 
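+
+One way to keep these backends swappable is to code against a small trait that
+captures only the object-store operations enumerated in the next paragraph.
+This is a sketch under that assumption; the trait and method names are
+invented, not an existing interface:
+
+```rust
+use std::io::{self, Read, Write};
+
+/// A hypothetical abstraction over S3, GCS, Azure Blob Storage, a local
+/// directory, or an in-memory mock used in unit tests.
+trait SnapshotStore {
+    /// List the object names in the bucket (or directory).
+    fn list_objects(&self) -> io::Result<Vec<String>>;
+    /// Streaming read of an entire object.
+    fn read_object(&self, name: &str) -> io::Result<Box<dyn Read>>;
+    /// Read a byte range from an object (e.g. just the file header).
+    fn read_range(&self, name: &str, offset: u64, len: u64) -> io::Result<Vec<u8>>;
+    /// Streaming write of a new, immutable object. Never overwrites.
+    fn write_new_object(&self, name: &str) -> io::Result<Box<dyn Write>>;
+    /// Delete an object; must not disrupt reads already in progress.
+    fn delete_object(&self, name: &str) -> io::Result<()>;
+}
+```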
+ +The properties of s3 that we depend on are: +list objects +streaming read of entire object +read byte range from object +streaming write new object (may use multipart upload for better reliability) +delete object (that should not disrupt an already-started read). + +Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. + +## Notes + +Possible simplifications, for a first draft implementation: +- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later. +- Don’t worry about the details of the squashing process yet. +- Don’t implement cloud metadata service; try to make everything work using basic s3 list-objects and reads. +- Don’t implement rename, delete at first. +- Don’t implement public/private, just use s3 permissions. +- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data. +- Don’t worry about history that spans multiple buckets. +- Don’t worry about s3 regions. +- Don’t support user-writeable s3 buckets; users get only read-only access at most. + +Open questions: +- How important is point-in-time recovery? When should we add this? How should it work? +- Should snapshot files use compression? +- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created. +- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy? +- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver? +- How can pageserver software upgrade be done with minimal downtime? diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md new file mode 100644 index 0000000000..bc79924e7b --- /dev/null +++ b/docs/rfcs/010-storage_details.md @@ -0,0 +1,144 @@ +# Storage details + +Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details. + +## Overview + +![storage](images/storage.jpeg) + +### MemStore + +MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of PageIndex that holds references to WAL records or pages, PageStore that stores recently materialized pages, and WalStore that stores recently received WAL. + +### PageIndex + +PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset): + +* PageStoreRef -- page offset in the PageStore +* LocalStoreRef -- snapshot_id and page offset inside of that snapshot +* WalStoreRef -- offset (and size optionally) of WalRecord in WalStore + +PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots we actually can avoid storing references to the full snapshot and calculate page offsets based on relation sizes metadata in the full snapshot (assuming that full snapshot stores pages sorted by page number). 
However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized. + +We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from snapshot metadata and WAL records from WalStore and/or the Safekeeper. + +### WalStore + +WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add a fixed-size cache that keeps some number of segments in memory. + +For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would also be responsible for the recent WAL pushdown to S3 (and the Safekeeper may just delete WAL that was confirmed as S3-durable by the page server). + +### PageStore + +PageStore is storage for recently materialized pages (in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it. + +There are a few possible options for PageStore: + +a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation. + +b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During page overwrite, we would also need to change the PageStoreRef back to a WalStoreRef in PageIndex. + +I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion), and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on the single-page-version rule) and cut off that whole set when snapshot creation is complete. + +With option b) we can also treat PageStore as an incomplete incremental snapshot. + +### LocalStore + +LocalStore keeps the latest full snapshot and a set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold. + +## Granularity + +By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account: + +* can we shard big databases between page servers? +* how much time will we spend applying WAL to access certain pages with older LSNs? +* how many files do we create for a single database? + +I can think of the following options here: + +1. The whole database goes into one full snapshot. + * +: we never create a lot of files for one database + * +: the approach is quite straightforward, moving data around is simple + * -: cannot be sharded + * -: long recovery -- we always need to recover the whole database +2. A table segment is the unit of snapshotting. + * +: straightforward for sharding + * +: an individual segment can be quickly recovered with sliced WAL + * -: a full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend eternity in directory scans, and the amount of metadata for sharding can also be quite big. +3. Range-partitioned snapshots -- a snapshot includes all pages between [BuffTagLo, BuffTagHi], mixing different relations, databases, and potentially clusters (albeit from one tenant only).
When a full snapshot outgrows a certain limit (which could also be a few gigabytes), we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots. + * +: addresses all mentioned issues + * -: harder to implement + +I think it is okay to start with table-segment granularity, check how we perform in the case of lots of small tables, and check whether there is any way besides option 3 to deal with it. + +Both PageStore and WalStore should be "sharded" by this granularity level. + +## Security + +We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their own S3 bucket credentials. + +Also, S3 backups are usually encrypted with per-tenant private keys. I'm not sure in what threat model such encryption would improve anything (taking into account per-tenant IAM keys), but it seems that everybody is doing it (both AMZN and YNDX). Most likely that comes from a requirement about "cold backups" in some certification procedure. + +## Dynamics + +### WAL stream handling + +When a new WAL record is received, we need to parse the BufferTags in that record and insert them into PageIndex with a WalStoreRef as the value. + +### getPage queries + +Look up the page in PageIndex. If the value is a page reference, then just respond with that page. If the referenced value is a WAL record, then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page. + +### Starting page server without local data + +* build the set of the latest full snapshots and the incremental snapshots on top of them +* load all their metadata into PageIndex +* the Safekeeper should connect soon, and we can ask for a WAL stream starting from the latest incremental snapshot +* for databases that are connected to us through the Safekeeper we can start loading the set of the latest snapshots, or we can do that lazily based on getPage requests (I'd rather avoid doing that lazily for now, without access stats from the previous run, and just transfer all data for the active database from S3 to LocalStore). + +### Starting page server with local data (aka restart or reboot) + +* check that local snapshot files are consistent with S3 + +### Snapshot creation + +Track the size of future snapshots based on info in MemStore, and when it exceeds some threshold (taking into account our granularity level), create a new incremental snapshot. Always emit incremental snapshots from MemStore. + +To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream, to avoid parsing WAL during snapshot creation. + +Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots. + +### S3 pushdown + +When we have several full snapshots, GC can push the old one with its increments to S3. + +### Branch creation + +Create a new timeline and replay sliced WAL up to a requested point. When a page is not in PageIndex, ask the parent timeline for it. Relation sizes are tricky. + +## File formats + +As far as I understand, Bookfile/Aversion addresses the versioning and serialization parts.
+ +As for the exact data that should go into snapshots, I think it is the following for each snapshot: + +* format version number +* a set of key/values needed to interpret the content (e.g. is page compression enabled, is this a full or incremental snapshot, the previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to the reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number. +* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to the ToC in Bookfile +* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records +* pages, one by one +* WAL records, one by one + +It is also important to be able to load metadata quickly, since it would be one of the main factors impacting the time of page server start. E.g. if we store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). + +1) Since our ToC/array of entries can be sorted by ObjectTag, we can store the whole BufferTag only when relation_id changes, and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small). +2) It makes sense to keep the ToC at the beginning of the file to avoid extra seeks to locate it. That doesn't matter too much with local files, but it matters on S3 -- if we are accessing a lot of ~1Gb files with metadata of ~1Mb each, then the time to transfer this metadata would be comparable to the access latency itself (which is about half a second). So by slurping the metadata with one read of the file header instead of N reads we can improve the speed of page server start by a factor of N. + +I think both of these optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. + +Also, there were some discussions about how to embed WAL in incremental snapshots. So far the following ideas have been mentioned: +1. snapshot lsn=200, includes WAL in range 200-300 +2. snapshot lsn=200, includes WAL in range 100-200 +3. data snapshots are separated from WAL snapshots + +Both options 2 and 3 look good. I'm inclined towards option 3, as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshots until the next full snapshot, but push WAL snapshots to S3 as soon as they appear if there are no replicas). diff --git a/docs/rfcs/011-retention-policy.md b/docs/rfcs/011-retention-policy.md new file mode 100644 index 0000000000..fde36c8108 --- /dev/null +++ b/docs/rfcs/011-retention-policy.md @@ -0,0 +1,91 @@ +# User-visible timeline history + +The user can specify a retention policy. The retention policy is +presented to the user as a PITR period and snapshots. The PITR period +is the amount of recent history that needs to be retained, in minutes, +hours, or days. Within that period, you can create a branch or +snapshot at any point in time, open a compute node, and start running +queries. Internally, a PITR period is represented as a range of LSNs. + +The user can also create snapshots. A snapshot is a point in time, +internally represented by an LSN. The user gives the snapshot a name.
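+
+A small sketch of how this user-visible policy might be represented internally
+(illustrative types only; these are not the real pageserver structures): the
+PITR period resolves to an LSN range, and each named snapshot pins one LSN.
+
+```rust
+type Lsn = u64;
+
+/// A user-named snapshot: a point in time, internally just an LSN.
+struct Snapshot {
+    name: String,
+    lsn: Lsn,
+}
+
+/// The retention policy for one branch: a PITR period (given in
+/// minutes/hours/days, but resolved to a range of LSNs that must stay
+/// reconstructible) plus any named snapshots.
+struct RetentionPolicy {
+    pitr_range: std::ops::Range<Lsn>,
+    snapshots: Vec<Snapshot>,
+}
+
+impl RetentionPolicy {
+    /// A page version at `lsn` must stay reconstructible if it falls inside
+    /// the PITR range or is pinned by a named snapshot.
+    fn must_retain(&self, lsn: Lsn) -> bool {
+        self.pitr_range.contains(&lsn) || self.snapshots.iter().any(|s| s.lsn == lsn)
+    }
+}
+```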
+ +The user can also specify an interval at which the system creates +snapshots automatically. For example, create a snapshot every night at +2 AM. After some user-specified time, old automatically created +snapshots are removed. + + Snapshot Snapshot + PITR "Monday" "Tuesday" PITR + ----######----------+-------------+-------------######> + +If there are multiple branches, you can specify different policies for +different branches. + +The PITR period and user-visible snapshots together define the +retention policy. + +NOTE: As presented here, this is probably overly flexible. In reality, +we want to keep the user interface simple. Only allow a PITR period at +the tip of a branch, for example. But that doesn't make much +difference to the internals. + + +# Retention policy behind the scenes + +The retention policy consists of points (for snapshots) and ranges +(for PITR periods). + +The system must be able to reconstruct any page within the retention +policy. Other page versions can be garbage collected away. We have a +lot of flexibility on when to perform the garbage collection and how +aggressive it is. + + +# Base images and WAL slices + +The page versions are stored in two kinds of files: base images and +WAL slices. A base image contains a dump of all the pages of one +relation at a specific LSN. A WAL slice contains all the WAL in an LSN +range. + + + | + | + | + | --Base img @100 + + | | + | | WAL slice + | | 100-200 + | | + | --Base img @200 + + | | + | | WAL slice + | | 200-300 + | | + | + + | + V + + +To recover a page e.g. at LSN 150, you need the base image at LSN 100, +and the WAL slice 100-200. + +All of this works on a per-relation or per-relation-segment basis. If +a relation is updated very frequently, we create base images and WAL +slices for it more quickly. For a relation that's updated +infrequently, we hold the recent WAL for that relation longer, and +only write it out when we need to release the disk space occupied by +the original WAL. (We need a backstop like that, because until all the +WAL/base images have been durably copied to S3, we must keep the +original WAL for that period somewhere, in the WAL service or in S3.) + + +# Branching + +Internally, branch points are also "retention points", in addition to +the user-visible snapshots. If a branch has been forked off at LSN +100, we need to be able to reconstruct any page on the parent branch +at that LSN, because it is needed by the child branch. If a page is +modified in the child, we don't need to keep that in the parent +anymore, though. diff --git a/docs/rfcs/012-background-tasks.md b/docs/rfcs/012-background-tasks.md new file mode 100644 index 0000000000..8692b187e6 --- /dev/null +++ b/docs/rfcs/012-background-tasks.md @@ -0,0 +1,38 @@ +# Eviction + + Write out an in-memory layer to disk, into a delta layer. + +- To release memory +- To make it possible to advance disk_consistent_lsn and allow the WAL + service to release some WAL. + +- Triggered if we are short on memory +- Or if the oldest in-memory layer is so old that it's holding back + the WAL service from removing old WAL + +# Materialization + +Create a new image layer of a segment, by performing WAL redo + +- To reduce the amount of WAL that needs to be replayed on a GetPage request. +- To allow garbage collection of old layers + +- Triggered by distance to last full image of a page + +# Coalescing + +Replace N consecutive layers of a segment with one larger layer.
+ +- To reduce the number of small files that need to be uploaded to S3 + + +# Bundling + +Zip together multiple small files belonging to different segments. + +- To reduce the number of small files that need to be uploaded to S3 + + +# Garbage collection + +Remove a layer that's older than the GC horizon, and isn't needed anymore. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md new file mode 100644 index 0000000000..7e815abf73 --- /dev/null +++ b/docs/rfcs/013-term-history.md @@ -0,0 +1,147 @@ +# What + +Currently, apart from the WAL, a safekeeper persistently stores only two logical clock +counter (aka term) values, sourced from the same sequence. The first is bumped +whenever a safekeeper gives its vote to a proposer (or acknowledges an already elected one) +and, e.g., prevents electing two proposers with the same term -- it is actually +called `term` in the code. The second, called `epoch`, reflects the progress of log +reception and might lag behind `term`; a safekeeper switches to epoch `n` when +it has received all committed log records from all `< n` terms. This roughly +corresponds to what is proposed in + +https://github.com/zenithdb/rfcs/pull/3/files + + +This is our biggest difference from Raft. In Raft, every log record is +stamped with the term in which it was generated; while we essentially store in +`epoch` only the term of the highest record on this safekeeper -- when we know +it -- because during recovery generally we don't, and `epoch` is bumped directly +to the term of the proposer who performs the recovery when it is finished. It is +not immediately obvious that this simplification is safe. I thought and I still +think it is; model checking confirmed that. However, some details now make me +believe it is better to keep the full term switching history (which is equivalent to +knowing the term of each record). + +# Why + +Without knowing the full history of terms (a list of (term, LSN) pairs) it is hard to +determine the exact divergence point, and if we don't perform truncation at that +point safety becomes questionable. Consider the following history, with +safekeepers A, B, C, D, E. n.m means a record created by the proposer in term n with +LSN m; (t=x, e=y) means the safekeeper currently has term x and epoch y. + +1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only +on A. + +```
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+```
+
+2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
+
+```
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+```
+
+3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
+
+```
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+```
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively from the horizon (1.1), and truncate the
+obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is
+reached, i.e. 2.3 is transferred -- that's what https://github.com/neondatabase/neon/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+```
+A(t=1, e=1) 1.1 2.2 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+```
+ +Now A's log is basically corrupted. Moreover, since ABE are all in epoch 1 and +A's log is the longest one, they can elect P4 who will commit such a log. + +Note that this particular history couldn't happen if we forbid *creating* new +records in term n until a majority of safekeepers switch to it. It would force CDE +to switch to 2 before 2.2 is created, and A could never become the donor while its +log is corrupted. Generally, with this additional barrier I believe the algorithm +becomes safe, but + - I don't like this kind of artificial barrier; + - I also feel somewhat uncomfortable about even temporarily having intentionally + corrupted WAL; + - I'd still model check the idea. + +## Find divergence point and truncate at it + +Then step 4 would delete 1.3 and 1.4 on A, and we are ok. The question is, how do we +do that? Without the term switching history we have to resort to sending again from +the horizon and memcmp'ing records, which is inefficient and ugly. Or we can +maintain the full history and determine the truncation point by comparing the 'wrong' and +'right' histories -- much like pg_rewind does -- and perform truncation + start +streaming right there. + +# Proposal + +- Add the term history as an array of (term, LSN) pairs to the safekeeper controlfile. +- Return it to the proposer with VoteResponse so that 1) the proposer can tell it to other + nodes and 2) determine its personal streaming starting point. However, since we + don't append WAL and update the controlfile atomically, let's always update the + controlfile first but send only the history of what we really have (up to the highest + term in the history where begin_lsn >= end of WAL; this highest term replaces the + current `epoch`). We also send the end of WAL as we do now to determine the donor. +- Create a ProposerAnnouncement message which the proposer sends before starting + streaming. It announces the proposer as elected and + 1) Truncates the wrong part of the WAL on the safekeeper + (the divergence point is already calculated at the proposer, but can be + cross-verified here). + 2) Communicates the 'right' history of its term (taken from the donor). It seems + better to immediately put the history in the controlfile, + though the safekeeper might not have the full WAL for previous terms in it -- + this way is simpler, and we can't update the WAL and controlfile atomically anyway. + + This also constitutes an analogue of the current epoch bump for those safekeepers + which don't need recovery, which is important for sync-safekeepers (bump the + epoch without waiting for records from the new term). +- After the ProposerAnnouncement the proposer streams WAL from the calculated starting + point -- only what is missing. + + +pros/cons: ++ (more) clear safety of WAL truncation -- we get very close to Raft ++ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters + only for 5+ nodes) ++ adds some observability at safekeepers + +- complexity, but not that much + + +# Misc + +- During model checking I did truncation on the first locally non-existent or + different record -- an analogue of the 'memcmp' variant described above. diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md new file mode 100644 index 0000000000..3d6cc04b94 --- /dev/null +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -0,0 +1,69 @@ +# Safekeeper gossip + +Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) + +## Motivation + +In some situations, a safekeeper (SK) needs coordination with the other SKs that serve the same tenant: + +1. WAL deletion. An SK needs to know what WAL has already been safely replicated in order to delete it. Currently we keep WAL indefinitely. +2.
Deciding on who sends WAL to the pageserver. Currently, a crash of the sending SK may lead to a livelock where nobody sends WAL to the pageserver. +3. To enable direct SK-to-SK recovery without involving the compute + +## Summary + +The compute node has a connection string for each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass all those connection strings down to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with an LSN payload. + +## Components + +safekeeper, compute, compute<->safekeeper protocol, possibly the console (grouping SK addresses) + +## Proposed implementation + +Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If a ping has not been received for, let's say, four ping periods, we may consider the sending safekeeper dead. That would mean one of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)` + +Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants. + +Right now the console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SKs. Instead, we can assign safekeeper triples to the compute node. But if we want to "break"/"change" a group by an ad-hoc action, we can do it. + +### Corner cases + +- The current safekeeper may be alive but may not have connectivity to the pageserver + + To address that, we need to gossip visibility info. Based on that info, we may define an SK as alive only when it can connect to the pageserver. + +- The current safekeeper may be alive but may not have connectivity with the compute node. + + We may broadcast last_received_lsn and the presence of a compute connection and decide who is alive based on that. + +- It is tricky to decide when to shut down gossip connections, because we need to be sure that the pageserver got all the committed records (in the distributed sense, so local SK info is not enough), and that it will never lose them. It is not a strict requirement, since the `--sync-safekeepers` run that happens before the compute starts will allow the pageserver to consume the missing WAL, but it is better to do that in the background. So the condition may look like this: `majority_max(flush_lsn) == pageserver_s3_lsn`. Here we rely on two facts: + - that `--sync-safekeepers` happened after the compute shutdown, and it advanced the local commit_lsns, allowing the pageserver to consume that WAL. + + - we wait for the `pageserver_s3_lsn` advancement to avoid the pageserver's last_received_lsn/disk_consistent_lsn going backward due to a disk/hardware failure and subsequent S3 recovery + + If those conditions are not met, we will have some gossip activity (but that may be okay). + +## Pros/cons + +Pros: + +- distributed, does not introduce new services (like etcd), does not add the console as a storage dependency +- lays the foundation for gossip-based recovery + +Cons: + +- Only the compute knows the set of safekeepers, but they should communicate even without a compute node. In case of a safekeeper restart, we will lose that info and can't gossip anymore.
Hence we can't trim some WAL tail until the compute node start. Also, it is ugly. + +- If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing. + +## Alternative implementation + +We can have a selected node (e.g., console) with everybody reporting to it. + +## Security implications + +We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users. + +## Scalability implications + +The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign a compute node to the random SK triple, the number of connections would be constant. diff --git a/docs/rfcs/014-storage-lsm.md b/docs/rfcs/014-storage-lsm.md new file mode 100644 index 0000000000..fed9ec38cb --- /dev/null +++ b/docs/rfcs/014-storage-lsm.md @@ -0,0 +1,145 @@ +# Why LSM trees? + +In general, an LSM tree has the nice property that random updates are +fast, but the disk writes are sequential. When a new file is created, +it is immutable. New files are created and old ones are deleted, but +existing files are never modified. That fits well with storing the +files on S3. + +Currently, we create a lot of small files. That is mostly a problem +with S3, because each GET/PUT operation is expensive, and LIST +operation only returns 1000 objects at a time, and isn't free +either. Currently, the files are "archived" together into larger +checkpoint files before they're uploaded to S3 to alleviate that +problem, but garbage collecting data from the archive files would be +difficult and we have not implemented it. This proposal addresses that +problem. + + +# Overview + + +``` +^ LSN +| +| Memtable: +-----------------------------+ +| | | +| +-----------------------------+ +| +| +| L0: +-----------------------------+ +| | | +| +-----------------------------+ +| +| +-----------------------------+ +| | | +| +-----------------------------+ +| +| +-----------------------------+ +| | | +| +-----------------------------+ +| +| +-----------------------------+ +| | | +| +-----------------------------+ +| +| +| L1: +-------+ +-----+ +--+ +-+ +| | | | | | | | | +| | | | | | | | | +| +-------+ +-----+ +--+ +-+ +| +| +----+ +-----+ +--+ +----+ +| | | | | | | | | +| | | | | | | | | +| +----+ +-----+ +--+ +----+ +| ++--------------------------------------------------------------> Page ID + + ++---+ +| | Layer file ++---+ +``` + + +# Memtable + +When new WAL arrives, it is first put into the Memtable. Despite the +name, the Memtable is not a purely in-memory data structure. It can +spill to a temporary file on disk if the system is low on memory, and +is accessed through a buffer cache. + +If the page server crashes, the Memtable is lost. It is rebuilt by +processing again the WAL that's newer than the latest layer in L0. + +The size of the Memtable is configured by the "checkpoint distance" +setting. Because anything that hasn't been flushed to disk and +uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint +distance" also determines the amount of WAL that needs to kept in the +safekeeper. + +# L0 + +When the Memtable fills up, it is written out to a new file in L0. The +files are immutable; when a file is created, it is never +modified. 
Each file in L0 is roughly 1 GB in size (*). Like the +Memtable, each file in L0 covers the whole key range. + +When enough files have been accumulated in L0, compaction +starts. Compaction processes all the files in L0 and reshuffles the +data to create a new set of files in L1. + + +(*) except in corner cases like if we want to shut down the page +server and want to flush out the memtable to disk even though it's not +full yet. + + +# L1 + +L1 consists of ~ 1 GB files like L0. But each file covers only part of +the overall key space, and a larger range of LSNs. This speeds up +searches. When you're looking for a given page, you need to check all +the files in L0, to see if they contain a page version for the requested +page. But in L1, you only need to check the files whose key range covers +the requested page. This is particularly important at cold start, when +checking a file means downloading it from S3. + +Partitioning by key range also helps with garbage collection. If only a +part of the database is updated, we will accumulate more files for +the hot part in L1, and old files can be removed without affecting the +cold part. + + +# Image layers + +So far, we've only talked about delta layers. In addition to the delta +layers, we create image layers, when "enough" WAL has been accumulated +for some part of the database. Each image layer covers a 1 GB range of +key space. It contains images of the pages at a single LSN, a snapshot +if you will. + +The exact heuristic for what "enough" means is not clear yet. Maybe +create a new image layer when 10 GB of WAL has been accumulated for a +1 GB segment. + +The image layers limit the number of layers that a search needs to +check. That put a cap on read latency, and it also allows garbage +collecting layers that are older than the GC horizon. + + +# Partitioning scheme + +When compaction happens and creates a new set of files in L1, how do +we partition the data into the files? + +- Goal is that each file is ~ 1 GB in size +- Try to match partition boundaries at relation boundaries. (See [1] + for how PebblesDB does this, and for why that's important) +- Greedy algorithm + +# Additional Reading + +[1] Paper on PebblesDB and how it does partitioning. +https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md new file mode 100644 index 0000000000..a415b90459 --- /dev/null +++ b/docs/rfcs/015-storage-messaging.md @@ -0,0 +1,295 @@ +# Storage messaging + +Created on 19.01.22 + +Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. + +That it is an alternative to (014-safekeeper-gossip)[] + +## Motivation + +As in 014-safekeeper-gossip we need to solve the following problems: + +* Trim WAL on safekeepers +* Decide on which SK should push WAL to the S3 +* Decide on which SK should forward WAL to the pageserver +* Decide on when to shut down SK<->pageserver connection + +This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip. + +Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper. 
+
+## Summary
+
+Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state. Each storage node should have a `--broker-url=1.2.3.4` CLI param.
+
+Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a Grpc client to our codebase, either directly or as an etcd dependency.
+
+## Non-goals
+
+This RFC does *not* suggest moving the compute-to-pageserver and compute-to-safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistence of that info. So I'm implying that each pageserver and safekeeper knows exactly which timelines it serves, as is currently the case. We need some mechanism for a new pageserver to discover the mapping info, but that is out of the scope of this RFC.
+
+## Impacted components
+
+pageserver, safekeeper;
+adds either etcd or the console as a storage dependency
+
+## Possible implementation: custom message broker in the console
+
+We've decided to go with the etcd approach instead of the message broker.
+
+**Original suggestion** (superseded by the etcd approach below):
+We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline. + +Message format could be `{sender, destination, payload}`. + +The destination is either: +1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or +2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline + +Sender is either: +1. `sk_#{sk_id}`, or +2. `pserver_#{pserver_id}` + +I can think of the following behavior to address our original problems: + +* WAL trimming + Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers + +* Decide on which SK should push WAL to the S3 + + Each safekeeper periodically broadcasts `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain the vector of alive peers (loose one, with false negatives). Alive safekeeper with the minimal id pushes data to S3. + +* Decide on which SK should forward WAL to the pageserver + + Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. + + Pageserver connection to the safekeeper triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. + + Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection). + +* Decide on when to shutdown sk<->pageserver connection + + Again, pageserver would have all the info to understand when to shut down the safekeeper connection. + +### Scalability + +One node is enough (c) No, seriously, it is enough. + +### High Availability + +Broker lives in the console, so we can rely on k8s maintaining the console app alive. + +If the console is down, we won't trim WAL and reconnect the pageserver to another safekeeper. But, at the same, if the console is down, we already can't accept new compute connections and start stopped computes, so we are making things a bit worse, but not dramatically. + +### Interactions + +``` + .________________. +sk_1 <-> | | <-> pserver_1 +... | Console broker | ... +sk_n <-> |________________| <-> pserver_m +``` +
+ + +## Implementation: etcd state store + +Alternatively, we can set up `etcd` and maintain the following data structure in it: + +```ruby +"compute_#{tenant}_#{timeline}" => { + safekeepers => { + "sk_#{sk_id}" => { + write_lsn: "0/AEDF130", + commit_lsn: "0/AEDF100", + compute_connected: true, + last_updated: 1642621138, + }, + } +} +``` + +As etcd doesn't support field updates in the nested objects that translates to the following set of keys: + +```ruby +"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn", +"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn", +... +``` + +Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and prevent runtime storage dependency on a console. + +### Safekeeper address discovery + +During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful. + +### Safekeeper behavior + +For each timeline safekeeper periodically broadcasts `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way safekeeper will have an information about peering safekeepers. +That amount of information is enough to properly trim WAL. To decide on who is pushing the data to S3 safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive. + +### Pageserver behavior + +Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. With that info, pageserver can maintain a view of the safekeepers state, connect to a random one, and detect the moments (e.g., one the safekeepers is not making progress or down) when it needs to reconnect to another safekeeper. Pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` to `4.5.6.7:6400`. + +Pageserver connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore. + +As an alternative to compute_connected, we can track timestamp of the latest message arrived to safekeeper from compute. Usually compute broadcasts KeepAlive to all safekeepers every second, so it'll be updated every second when connection is ok. Then the connection can be considered down when this timestamp isn't updated for a several seconds. + +This will help to faster detect issues with safekeeper (and switch to another) in the following cases: + + when compute failed but TCP connection stays alive until timeout (usually about a minute) + when safekeeper failed and didn't set compute_connected to false + +Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id don't send anything for some time. This way is fully compliant to this RFC. + +Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other addresses until the next compute connection). + +### Interactions + +``` + .________________. 
+sk_1 <-> | | <-> pserver_1 +... | etcd | ... +sk_n <-> |________________| <-> pserver_m +``` + +### Sequence diagrams for different workflows + +#### Cluster startup + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + PS1->>M: subscribe to updates to state of timeline N + C->>+SK1: WAL push + loop constantly update current lsns + SK1->>-M: I'm at lsn A + end + C->>+SK2: WAL push + loop constantly update current lsns + SK2->>-M: I'm at lsn B + end + C->>+SK3: WAL push + loop constantly update current lsns + SK3->>-M: I'm at lsn C + end + loop request pages + C->>+PS1: get_page@lsn + PS1->>-C: page image + end + M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C + note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100
so connect to SK1 because it is the most up to date one + PS1->>SK1: start replication +``` + +#### Behaviour of services during typical operations + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + note over C,M: Scenario 1: Pageserver checkpoint + note over PS1: Upload data to S3 + PS1->>M: Update remote consistent lsn + M->>SK1: propagate remote consistent lsn update + note over SK1: truncate WAL up to remote consistent lsn + M->>SK2: propagate remote consistent lsn update + note over SK2: truncate WAL up to remote consistent lsn + M->>SK3: propagate remote consistent lsn update + note over SK3: truncate WAL up to remote consistent lsn + note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK2), 200 (SK2)) - 100 (SK1) > THRESHOLD + SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2) + note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there is no messages from it in 30 seconds. + note over PS1: e.g. SK2 is at 150, SK3 is at 100, chose SK2 as a new replication source + PS1->>SK2: start replication +``` + +#### Behaviour during timeline relocation + +```mermaid +sequenceDiagram + autonumber + participant C as Compute + participant SK1 + participant SK2 + participant SK3 + participant PS1 + participant PS2 + participant O as Orchestrator + participant M as Metadata Service + + note over C,M: Timeline is being relocated from PS1 to PS2 + O->>+PS2: Attach timeline + PS2->>-O: 202 Accepted if timeline exists in S3 + note over PS2: Download timeline from S3 + note over O: Poll for timeline download (or subscribe to metadata service) + loop wait for attach to complete + O->>PS2: timeline detail should answer that timeline is ready + end + PS2->>M: Register downloaded timeline + PS2->>M: Get safekeepers for timeline, subscribe to changes + PS2->>SK1: Start replication to catch up + note over O: PS2 caught up, time to switch compute + O->>C: Restart compute with new pageserver url in config + note over C: Wal push is restarted + loop request pages + C->>+PS2: get_page@lsn + PS2->>-C: page image + end + O->>PS1: detach timeline + note over C,M: Scenario 1: Attach call failed + O--xPS2: Attach timeline + note over O: The operation can be safely retried,
if we hit some threshold we can try another pageserver + note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication + loop wait for attach to complete + O--xPS2: timeline detail should answer that timeline is ready + end + note over O: Can wait for a timeout, and then try another pageserver
there should be a limit on number of different pageservers to try + note over C,M: Scenario 3: Detach fails + O--xPS1: Detach timeline + note over O: can be retried, if continues to fail might lead to data duplication in s3 +``` + +# Pros/cons + +## Console broker/etcd vs gossip: + +Gossip pros: +* gossip allows running storage without the console or etcd + +Console broker/etcd pros: +* simpler +* solves "call me maybe" as well +* avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples + +## Console broker vs. etcd: + +Initially, I wanted to avoid etcd as a dependency mostly because I've seen how painful for Clickhouse was their ZooKeeper dependency: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was that bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/. + +But with an etcd we are in a bit different situation: + +1. We don't need persistency and strong consistency guarantees for the data we store in the etcd +2. etcd uses Grpc as a protocol, and messages are pretty simple + +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). diff --git a/docs/rfcs/016-connection-routing.md b/docs/rfcs/016-connection-routing.md new file mode 100644 index 0000000000..603a0725d6 --- /dev/null +++ b/docs/rfcs/016-connection-routing.md @@ -0,0 +1,151 @@ +# Dispatching a connection + +For each client connection, Neon service needs to authenticate the +connection, and route it to the right PostgreSQL instance. + +## Authentication + +There are three different ways to authenticate: + +- anonymous; no authentication needed +- PostgreSQL authentication +- github single sign-on using browser + +In anonymous access, the user doesn't need to perform any +authentication at all. This can be used e.g. in interactive PostgreSQL +documentation, allowing you to run the examples very quickly. Similar +to sqlfiddle.com. + +PostgreSQL authentication works the same as always. All the different +PostgreSQL authentication options like SCRAM, kerberos, etc. are +available. [1] + +The third option is to authenticate with github single sign-on. When +you open the connection in psql, you get a link that you open with +your browser. Opening the link redirects you to github authentication, +and lets the connection to proceed. This is also known as "Link auth" [2]. + + +## Routing the connection + +When a client starts a connection, it needs to be routed to the +correct PostgreSQL instance. Routing can be done by the proxy, acting +as a man-in-the-middle, or the connection can be routed at the network +level based on the hostname or IP address. + +Either way, Neon needs to identify which PostgreSQL instance the +connection should be routed to. If the instance is not already +running, it needs to be started. Some connections always require a new +PostgreSQL instance to be created, e.g. if you want to run a one-off +query against a particular point-in-time. + +The PostgreSQL instance is identified by: +- Neon account (possibly anonymous) +- cluster (known as tenant in the storage?) 
+- branch or snapshot name +- timestamp (PITR) +- primary or read-replica +- one-off read replica +- one-off writeable branch + +When you are using regular PostgreSQL authentication or anonymous +access, the connection URL needs to contain all the information needed +for the routing. With github single sign-on, the browser is involved +and some details - the Neon account in particular - can be deduced +from the authentication exchange. + +There are three methods for identifying the PostgreSQL instance: + +- Browser interaction (link auth) +- Options in the connection URL and the domain name +- A pre-defined endpoint, identified by domain name or IP address + +### Link Auth + + postgres://@start.neon.tech/ + +This gives you a link that you open in browser. Clicking the link +performs github authentication, and the Neon account name is +provided to the proxy behind the scenes. The proxy routes the +connection to the primary PostgreSQL instance in cluster called +"main", branch "main". + +Further ideas: +- You could pre-define a different target for link auth + connections in the UI. +- You could have a drop-down in the browser, allowing you to connect + to any cluster you want. Link Auth can be like Teleport. + +### Connection URL + +The connection URL looks like this: + + postgres://@.db.neon.tech/ + +By default, this connects you to the primary PostgreSQL instance +running on the "main" branch in the named cluster [3]. However, you can +change that by specifying options in the connection URL. The following +options are supported: + +| option name | Description | Examples | +| --- | --- | --- | +| cluster | Cluster name | cluster:myproject | +| branch | Branch name | branch:main | +| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z | +| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 | +| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new | + +For example, to read branch 'testing' as it was on Mar 31, 2022, you could +specify a timestamp in the connection URL [4]: + + postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31 + +Connecting with cluster name and options can be disabled in the UI. If +disabled, you can only connect using a pre-defined endpoint. + +### Pre-defined Endpoint + +Instead of providing the cluster name, branch, and all those options +in the connection URL, you can define a named endpoint with the same +options. + +In the UI, click "create endpoint". Fill in the details: + +- Cluster name +- Branch +- timestamp or LSN +- is this for the primary or for a read replica +- etc. + +When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect: + + postgres://@.endpoint.neon.tech/ + + +An endpoint can be assigned a static or dynamic IP address, so that +you can connect to it with clients that don't support TLS SNI. Maybe +bypass the proxy altogether, but that ought to be invisible to the +user. + +You can limit the range of source IP addresses that are allowed to +connect to an endpoint. An endpoint can also be exposed in an Amazon +VPC, allowing direct connections from applications. + + +# Footnotes + +[1] I'm not sure how feasible it is to set up configure like Kerberos +or LDAP in a cloud environment. But in principle I think we should +allow customers to have the full power of PostgreSQL, including all +authentication options. 
However, it's up to the customer to configure +it correctly. + +[2] Link is a way to both authenticate and to route the connection + +[3] This assumes that cluster-ids are globally unique, across all +Neon accounts. + +[4] The syntax accepted in the connection URL is limited by libpq. The +only way to pass arbitrary options to the server (or our proxy) is +with the "options" keyword, and the options must be percent-encoded. I +think the above would work but i haven't tested it diff --git a/docs/rfcs/017-timeline-data-management.md b/docs/rfcs/017-timeline-data-management.md new file mode 100644 index 0000000000..a8ca3c7ca9 --- /dev/null +++ b/docs/rfcs/017-timeline-data-management.md @@ -0,0 +1,413 @@ +# Name + +Tenant and timeline data management in pageserver + +## Summary + +This RFC attempts to describe timeline-related data management as it's done now in pageserver, highlight current complexities caused by this and propose a set of changes to mitigate them. + +The main goal is to prepare for future [on-demand layer downloads](https://github.com/neondatabase/neon/issues/2029), yet timeline data is one of the core primitive of pageserver, so a number of other RFCs are affected either. +Due to that, this document won't have a single implementation, rather requiring a set of code changes to achieve the final state. + +RFC considers the repository at the `main` branch, commit [`28243d68e60ffc7e69f158522f589f7d2e09186d`](https://github.com/neondatabase/neon/tree/28243d68e60ffc7e69f158522f589f7d2e09186d) on the time of writing. + +## Motivation + +In recent discussions, it became more clear that timeline-related code becomes harder to change: it consists of multiple disjoint modules, each requiring a synchronization to access. +The lower the code is, the complex the sync gets since many concurrent processes are involved and require orchestration to keep the data consistent. +As the number of modules and isolated data grows per timeline, more questions and corner cases arise: + +- https://github.com/neondatabase/neon/issues/1559 + right now it's not straightened out what to do when the synchronization task fails for too many times: every separate module's data has to be treated differently. + +- https://github.com/neondatabase/neon/issues/1751 + GC and compaction file activities are not well known outside their tasks code, causing race bugs + +- https://github.com/neondatabase/neon/issues/2003 + Even the tenant management gets affected: we have to alter its state based on timeline state, yet the data for making the decision is separated and the synchronisation logic has bugs + +- more issues were brought in discussions, but apparently they were too specific to the code to mention them in the issues. + For instance, `tenant_mgr` itself is a static object that we can not mock anyhow, which reduces our capabilities to test the data synchronization logic. + In fact, we have zero Rust tests that cover the case of synchronizing more than one module's data. + +On demand layer downloads would require us to dynamically manage the layer files, which we almost not doing at all on the module level, resulting in the most of their APIs dealing with timelines, rather than the layer files. +The disjoint data that would require data synchronization with possibly a chain of lock acquisitions, some async and some sync, and it would be hard to unit test it with the current code state. 
+ +Neither this helps to easy start the on-demand download epic, nor it's easy to add more timeline-related code on top, whatever the task is. +We have to develop a vision on a number of topics before progressing safely: + +- timeline and tenant data structure and how should we access it +- sync and async worlds and in what way that should evolve +- unit tests for the complex logic + +This RFC aims to provide a general overview of the existing situation and propose ways to improve it. +The changes proposed are quite big and no single PR is expected to do the adjustments, they should gradually be done during the on-demand download work later. + +## What is a timeline and its data + +First, we need to define what data we want to manage per timeline. +Currently, the data every timeline operates is: + +- a set of layer files, on the FS + + Never updated files, created after pageserver's checkpoints and compaction runs, can be removed from the local FS due to compaction, gc or timeline deletion. + +- a set of layer files, on the remote storage + + Identically named and placed in tenant subdirectories files on the remote storage (S3), copied by a special background sync thread + +- a `metadata` file, on the FS + + Updated after every checkpoint with the never `disk_consistent_lsn` and `latest_gc_cutoff_lsn` values. Used to quickly restore timeline's basic metadata on pageserver restart. + Also contains data about the ancestor, if the timeline was branched off another timeline. + +- an `index_part.json` file, on the remote storage + + Contains `metadata` file contents and a list of layer files, available in the current S3 "directory" for the timeline. + Used to avoid potentially slow and expensive `S3 list` command, updated by the remotes storage sync thread after every operation with the remote layer files. + +- LayerMap and PageCache, in memory + + Dynamic, used to store and retrieve the page data to users. + +- timeline info, in memory + + LSNs, walreceiver data, `RemoteTimelineIndex` and other data to share via HTTP API and internal processes. + +- metrics data, in memory + + Data to push or provide to Prometheus, Opentelemetry, etc. + +Besides the data, every timeline currently needs an etcd connection to receive WAL events and connect to safekeepers. + +Timeline could be an ancestor to another one, forming a dependency tree, which is implicit right now: every time relations are looked up in place, based on the corresponding `TimelineMetadata` struct contents. +Yet, there's knowledge on a tenant as a group of timelines, belonging to a single user which is used in GC and compaction tasks, run on every tenant. +`tenant_mgr` manages tenant creation and its task startup, along with the remote storage sync for timeline layers. + +Last file being managed per-tenant is the tenant config file, created and updated on the local FS to hold tenant-specific configuration between restarts. +It's not yet anyhow synchronized with the remote storage, so only exists on the local FS. 
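+
+To make the enumeration above a bit more concrete, here is a rough, purely
+illustrative sketch of how this per-timeline data could be grouped; the names
+are hypothetical and do not correspond to actual pageserver types:
+
+```rust
+// Purely illustrative: a rough grouping of the per-timeline data listed above.
+// None of these names match real pageserver structs.
+use std::collections::BTreeMap;
+use std::path::PathBuf;
+
+type Lsn = u64; // stand-in for the real Lsn type
+
+struct TimelineMetadataSketch {
+    disk_consistent_lsn: Lsn,
+    latest_gc_cutoff_lsn: Lsn,
+    ancestor_timeline: Option<String>, // set if branched off another timeline
+}
+
+struct TimelineDataSketch {
+    // Immutable layer files on the local FS (written by checkpoint/compaction,
+    // removed by GC, compaction or timeline deletion).
+    local_layer_files: Vec<PathBuf>,
+    // The same files mirrored to the remote storage (S3) by the sync task.
+    remote_layer_files: Vec<String>,
+    // Contents of the local `metadata` file.
+    metadata: TimelineMetadataSketch,
+    // Contents of the remote `index_part.json`: metadata plus the remote layer list.
+    remote_index: (TimelineMetadataSketch, Vec<String>),
+    // Greatly simplified stand-in for the in-memory LayerMap / PageCache state.
+    layer_map: BTreeMap<Lsn, PathBuf>,
+    // Volatile info shared via HTTP and internal tasks (LSNs, walreceiver state, ...).
+    last_record_lsn: Lsn,
+}
+```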
+ +### How the data is stored + +We have multiple places where timeline data is stored: + +- `tenant_mgr` [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L43) a static `static ref TENANTS: RwLock>` with the `Tenant` having the `local_timelines: HashMap>` inside + +- same `Tenant` above has actually two references to timelines: another via its `repo: Arc` with `pub type RepositoryImpl = LayeredRepository;` that [holds](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L178) `Mutex>` + +- `RemoteTimelineIndex` [contains](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync/index.rs#L84) the metadata about timelines on the remote storage (S3) for sync reasons and possible HTTP API queries + +- `walreceiver` [stores](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver.rs#L60) the metadata for possible HTTP API queries and its [internal state](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/walreceiver/connection_manager.rs#L245) with a reference to the timeline, its current connections and etcd subscription (if any) + +- `PageCache` contains timeline-related data, and is created globally for the whole pageserver + +- implicitly, we also have files on local FS, that contain timeline state. We operate on those files and for some operations (GC, compaction) yet we don't anyhow synchronize the access to the files per se: there are more high-level locks, ensuring only one of a group of operations is running at a time. + + On practice though, `LayerMap` and layer files are tightly coupled together: current low-level code requires a timeline to be loaded into the memory to work with it, and the code removes the layer files after removing the entry from the `LayerMap` first. + +Based on this, a high-level pageserver's module diagram with data and entities could be: + +![timeline tenant state diagram](./images/017-timeline-data-management/timeline_tenant_state.svg) + +A few comments on the diagram: + +- the diagram does not show all the data and replaces a few newtypes and type aliases (for example, completely ignores "unloaded" timelines due to reasons described below) + + It aims to show main data and means of synchronizing it. + +- modules tend to isolate their data inside and provide access to it via API + +Due to multitenancy, that results in a common pattern for storing both tenant and timeline data: `RwLock` or `Mutex` around the `HashMap`, gc and compaction tasks also use the same lock pattern to ensure no concurrent runs are happening. + +- part of the modules is asynchronous, while the other is not, that complicates the data access + +Currently, anything that's not related to tasks (walreceiver, storage sync, GC, compaction) is blocking. + +Async tasks that try to access the data in the sync world, have to call `std::sync::Mutex::lock` method, which blocks the thread the callee async task runs on, also blocking other async tasks running in the same thread. Methods of `std::sync::RwLock` have the same issues, forcing async tasks either to block or spawn another, "blocking" task on a separate thread. + +Sync tasks that try to access the data in the async world, cannot use `.await` hence have to have some `Runtime` doing those calls for them. 
[`tokio::sync::Mutex`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.Mutex.html#method.blocking_lock) and [`tokio::sync::RwLock`](https://docs.rs/tokio/1.19.2/tokio/sync/struct.RwLock.html#method.blocking_read) provide an API to simplify such calls. Similarly, both `std::sync` and `tokio::sync` have channels that are able to communicate into one direction without blocking and requiring `.await` calls, hence can be used to connect both worlds without locking. + +Some modules are in transition, started as async "blocking" tasks and being fully synchronous in their entire code below the start. Current idea is to transfer them to the async further, but it's not yet done. + +- locks are used in two different ways: + + - `RwLock>` ones to hold the shared data and ensure its atomic updates + - `Mutex<()>` for synchronizing the tasks, used to implicitly order the data access + + The "shared data" locks of the first kind are mainly accessed briefly to either look up or alter the data, yet there are a few notable exceptions, such as + `latest_gc_cutoff_lsn: RwLock` that is explicitly held in a few places to prevent GC thread from progressing. Those are covered later in the data access diagrams. + +- some synchronizations are not yet implemented + +E.g. asynchronous storage sync module does not synchronize with almost synchronous GC and compaction tasks when the layer files are uploaded to the remote storage. +That occasionally results in the files being deleted before the storage upload task is run for this layer, but due to the incremental nature of the layer files, we can handle such situations without issues. + +- `LayeredRepository` covers lots of responsibilities: GC and compaction task synchronisation, timeline access (`local_timelines` in `Tenant` is not used directly before the timeline from the repository is accessed), layer flushing to FS, layer sync to remote storage scheduling, etc. + +### How is this data accessed? + +There are multiple ways the data is accessed, from different sources: + +1. [HTTP requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/http/routes.rs) + +High-level CRUD API for managing tenants, timelines and getting data about them. 
+Current API list (modified for readability): + +```rust +.get("/v1/status", status_handler) // pageserver status +.get("/v1/tenant", tenant_list_handler) +.post("/v1/tenant", tenant_create_handler) // can create "empty" timelines or branch off the existing ones +.get("/v1/tenant/:tenant_id", tenant_status) // the only tenant public metadata +.put("/v1/tenant/config", tenant_config_handler) // tenant config data and local file manager +.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) +.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) +.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) // download entire tenant from the remote storage and load its timelines memory +.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) // delete all tenant timelines from memory, remote corresponding storage and local FS files +.get("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler) +.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler) +.get("/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver", wal_receiver_get_handler) // get walreceiver stats metadata +``` + +Overall, neither HTTP operation goes below `LayeredRepository` level and does not interact with layers: instead, they manage tenant and timeline entities, their configuration and metadata. + +`GET` data is small (relative to layer files contents), updated via brief `.write()/.lock()` calls and read via copying/cloning the data to release the lock soon. +It does not mean that the operations themselves are short, e.g. `tenant_attach_handler` downloads multiple files from the remote storage which might take time, yet the final data is inserted in memory via one brief write under the lock. + +Non-`GET` operations mostly follow the same rule, with two differences: + +- `tenant_detach_handler` has to wait for its background tasks to stop before shutting down, which requires more work with locks +- `timeline_create_handler` currently requires GC to be paused before branching the timeline, which requires orchestrating too. + This is the only HTTP operation, able to load the timeline into memory: rest of the operations are reading the metadata or, as in `tenant_attach_handler`, schedule a deferred task to download timeline and load it into memory. + +"Timeline data synchronization" section below describes both complex cases in more details. + +2. [libpq requests](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/page_service.rs) + +Is the main interface of pageserver, intended to handle libpq (and similar) requests. +Operates on `LayeredTimeline` and, lower, `LayerMap` modules; all timelines accessed during the operation are loaded into memory immediately (if not loaded already), operations bail on timeline load errors. + +- `pagestream` + + Page requests: `get_rel_exists`, `get_rel_size`, `get_page_at_lsn`, `get_db_size` + + Main API points, intended to be used by `compute` to show the data to the user. All require requests to be made at certain Lsn, if this Lsn is not available in the memory, request processing is paused until that happens or bails after a timeout. + +- `basebackup` and `fullbackup` + + Options to generate postgres-compatible backup archives. + +- `import basebackup` + +- `import wal` + + Import the `pg_wal` section of the basebackup archive. + +- `get_last_record_rlsn`, `get_lsn_by_timestamp` + +"Metadata" retrieval methods, that still requires internal knowledge about layers. 
+ +- `set`, `fallpoints`, `show` + +Utility methods to support various edge cases or help with debugging/testing. + +- `do_gc`, `compact`, `checkpoint` + +Manual triggers for corresponding tenant tasks (GC, compaction) and inmemory layer flushing on disk (checkpointing), with upload task scheduling as a follow-up. + +Apart from loading into memory, every timeline layer has to be accessed using specific set of locking primitives, especially if a write operations happens: otherwise, GC or compaction might spoil the data. User API is implicitly affected by this synchronization during branching, when a GC has to be orchestrated properly before the new timeline could be branched off the existing one. +See "Timeline data synchronization" section for the united synchronization diagram on the topic. + +3. internal access + +Entities within pageserver that update files on local FS and remote storage, metadata in memory; has to use internal data for those operations. +Places that access internal, lower data are also required to have the corresponding timeline successfully loaded into memory and accessed with corresponding synchronization. + +If ancestors' data is accessed via its child branch, it means more than one timeline has to be loaded into memory entirely and more locking primitives usage involved. +Right now, all ancestors are resolved in-place: every place that has to check timeline's ancestor has to lock the timelines map, check if one is loaded into the memory, load it there or bail if it's not present, and get the information required and so on. + +- periodic GC and compaction tasks + +Alter metadata (GC info), in-memory data (layer relations, page caches, etc.) and layer files on disk. +Same as its libpq counterparts, needs full synchronization with the low level layer management code. + +- storage sync task + +Alters metadata (`RemoteTimelineIndex`), layer files on remote storage (upload, delete) and local FS (download) and in-memory data (registers downloaded timelines in the repository). +Currently, does not know anything about layer files contents, rather focusing on the file structure and metadata file updates: due to the fact that the layer files cannot be updated (only created or deleted), storage sync is able to back up the files to the remote storage without further low-level synchronizations: only when the timeline is downloaded, a load operation is needed to run, possibly pausing GC and compaction tasks. + +- walreceiver and walingest task + +Per timeline, subscribes for etcd events from safekeeper and eventually spawns a walreceiver connection task to receive WAL from a safekeeper node. +Fills memory with data, eventually triggering a checkpoint task that creates a new layer file in the local FS and schedules a remote storage sync upload task. +During WAL receiving, also updates a separate in-memory data structure with the walreceiver stats, used later via HTTP API. + +Layer updates require low-level set of sync primitives used to preserve the data consistency. + +- checkpoint (layer freeze) task + +Periodic, short-lived tasks to generate a new layer file in the FS. Requires low level synchronization in the end, when the layer is being registered after creating and has additional mode to ensure only one concurrent compaction happens at a time. + +### Timeline data synchronization + +Here's a high-level timeline data access diagram, considering the synchronization locks, based on the state diagram above. 
+ +For brevity, diagrams do not show `RwLock>` data accesses, considering them almost instant to happen. +`RwLock` is close to be an exception to the previous rule, since it's taken in a multiple places to ensure all layers are inserted correctly. +Yet the only long operation in the current code is a `.write()` lock on the map during its creation, while all other lock usages tend to be short in the current code. +Note though, that due to current "working with loaded timeline only", prevailing amount of the locks taken on the struct are `.write()` locks, not the `.read()` ones. +To simplify the diagrams, these accesses are now considered "fast" data access, not the synchronization attempts. + +`write_lock` synchronization diagram: + +![timeline data access synchronization(1)](./images/017-timeline-data-management/timeline_data_access_sync_1.svg) + +Comments: + +- `write_lock: Mutex<()>` ensures that all timeline data being written into **in-memory layers** is done without races, one concurrent write at a time +- `layer_flush_lock: Mutex<()>` and layer flushing seems to be slightly bloated with various ways to create a layer on disk and write it in memory + The lock itself seem to repeat `write_lock` purpose when it touches in-memory layers, and also to limit the on-disk layer creations. + Yet the latter is not really done consistently, since remote storage sync manages to download and register the new layers without touching the locks +- `freeze_inmem_layer(true)` that touches both `write_lock` and `layer_flush_lock` seems not very aligned with the rest of the locks to those primitives; it also now restricts the layer creation concurrency even more, yet there are various `freeze_inmem_layer(false)` that are ignoring those restrictions at the same time + +![timeline data access synchronization(2)](./images/017-timeline-data-management/timeline_data_access_sync_2.svg) + +Comments: + +- `partitioning: Mutex<(KeyPartitioning, Lsn)>` lock is a data sync lock that's not used to synchronize the tasks (all other such kinds were considered "almost instant" and omitted on the diagram), yet is very similar to what `write_lock` and `layer_flush_lock` do: it ensures the timeline in-memory data is up-to-date with the layer files state on disk, which is what `LayerMap` is for. + +- there are multiple locks that do similar task management operations: + - `gc_cs: Mutex<()>` and `latest_gc_cutoff_lsn: RwLock` ensures that branching and gc are not run concurrently + - `layer_removal_cs: Mutex<()>` lock ensure gc, compaction and timeline deletion via HTTP API do not run concurrently + - `file_lock: RwLock<()>` is used as a semaphore, to ensure "all" gc and compaction tasks are shut down and do not start + Yet that lock does take only gc and compaction from internal loops: libpq call is not cancelled and waited upon. + +Those operations do not seem to belong to a timeline. Moreover, some of those could be eliminated entirely due to duplication of their tasks. + +## Proposed implementation + +### How to structure timeline data access better + +- adjust tenant state handling + +Current [`TenantState`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L108) [changes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/tenant_mgr.rs#L317) mainly indicates whether GC and compaction tasks are running or not; another state, `Broken` shows only in case any timeline does not load during startup. 
+ +We could start both GC and compaction tasks at the time the tenant is created and adjust the tasks to throttle/sleep on timeline absence and wake up when the first one is added. +The latter becomes more important on download on demand, since we won't have the entire timeline in reach to verify its correctness. Moreover, if any network connection happens, the timeline could fail temporarily and entire tenant should be marked as broken due to that. + +Since nothing verifies the `TenantState` via HTTP API currently, it makes sense to remove the whole state entirely and don't write the code to synchronize its changes. +Instead, we could indicate internal issues for every timeline and have a better API to "stop" timeline processing without deleting its data, making our API less restrictive. + +- remove the "unloaded" status for the timeline + +Current approach to timeline management [assumes](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/layered_repository.rs#L486-L493) + +```rust +#[derive(Clone)] +enum LayeredTimelineEntry { + Loaded(Arc), + Unloaded { + id: ZTimelineId, + metadata: TimelineMetadata, + }, +} +``` + +supposes that timelines have to be in `Unloaded` state. + +The difference between both variants is whether its layer map was loaded from disk and kept in memory (Loaded) or not (Unloaded). +The idea behind such separation was to lazy load timelines in memory with all their layers only after its first access and potentially unload them later. + +Yet now there's no public API methods, that deal with unloaded timelines' layers: all of them either bail when such timeline is worked on, or load it into memory and continue working. +Moreover, every timeline in the local FS is loaded on pageserver startup now, so only two places where `Unloaded` variant is used are branching and timeline attach, with both loading the timeline into memory before the end of the operation. +Even if that loading into memory bails for some reason, next GC or compaction task periodic run would load such timeline into memory. +There are a few timeline methods that return timeline metadata without loading its layers, but such metadata also comes from the `metadata` FS file, not the layer files (so no page info could be retrieved without loading the entire layer map first). + +With the layer on-demand download, it's not feasible anymore to wait for the entire layer map to be loaded into the memory, since it might not even be available on the local FS when requested: `LayerMap` needs to be changed to contain metadata to retrieve the missing layers and handle partially present on the local FS timeline state. + +To accommodate to that and move away from the redundant status, a timeline should always be "loaded" with its metadata read from the disk and its layer map prepared to be downloaded when requested, per layer. + +Layers in the layer map, on the other hand, could be in various state: loaded, unloaded, downloading, downloading failed, etc. and their state has to be handled instead, if we want to support on-demand download in the future. + +This way, tenants and timelines could always try to serve requests and do their internal tasks periodically, trying to recover. + +- scale down the remote storage sync to per layer file, not per timeline as now + +Due to the reasons from the previous bullet, current remote storage model needs its timeline download approach to be changed. 
+Right now, a timeline is marked as "ready" only after all its layers on the remote storage are downloaded on the local storage. +With the on-demand download approach, only remote storage timeline metadata should be downloaded from S3, leaving the rest of the layers ready for download if/when it's requested. + +Note: while the remote storage sync should operate per layer, it should stay global for all tenants, to better manage S3 limits and sync queue priorities. +Yet the only place using remote storage should be the layer map. + +- encapsulate `tenant_mgr` logic into a regular Rust struct, unite with part of the `Repository` and anything else needed to manage the timeline data in a single place and to test it independently + +[`Repository`](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/repository.rs#L187) trait gets closer to `tenant_mgr` in terms of functionality: there are two background task-related functions, that are run on all timelines of a tenant: `gc_iteration` (it does allow running on a single timeline, but GC task runs it on all timelines) and `compaction_iteration` that are related to service tasks, not the data storage; and the metadata management functions, also not really related to the timeline contents. + +`tenant_mgr` proxies some of the `Repository` calls, yet both service tasks use `tenant_mgr` to access the data they need, creating a circular dependency between their APIs. +To avoid excessive synchronization between components, taking multiple locks for that and static state, we can organize the data access and updates in one place. +One potential benefit Rust gets from this is the ability to track and manage timeline resources, if all the related data is located in one place. + +- move `RemoteStorage` usage from `LayeredRepository` into `LayerMap`, as the rest of the layer-based entities (layer files, etc.) + +Layer == file in our model, since pageserver always either tries to load the LayerMap from disk for the timeline not in memory, or assumes the file contents matches its memory. +`LayeredRepository` is one of the most loaded objects currently and not everything from it deserves unification with the `tenant_mgr`. +In particular, layer files need to be better prepared for future download on demand functionality, where every layer could be dynamically loaded and unloaded from memory and local FS. +Current amount of locks and sync-async separation would make it hard to implement truly dynamic (un)loading; moreover, we would need retries with backoffs, since the unloaded layer files are most probably not available on the local FS either and network is not always reliable. + +One of the solutions to the issue is already being developed for the remote storage sync: [SyncQueue](https://github.com/neondatabase/neon/blob/28243d68e60ffc7e69f158522f589f7d2e09186d/pageserver/src/storage_sync.rs#L463) +The queue is able to batch CRUD layer operations (both for local and remote FS contexts) and reorder them to increase the sync speed. +Similar approach could be generalized for all layer modifications, including in-memory ones such as GC or compaction: this way, we could manage all layer modifications and reads in one place with lesser locks and tests that are closer to unit tests. + +- change the approach to locking synchronization + +A number of locks in the timeline seem to be used to coordinate gc, compaction tasks and related processes. +It should be done in a task manager or other place, external to the timeline. 
+ +Timeline contents still needs to be synchronized, considering the task work, so fields like `latest_gc_cutoff_lsn: RwLock` are expected to stay for that purpose, but general amount of locks should be reduced. + +### Putting it all together + +If the proposal bullets applied to the diagrams above, the state could be represented as: + +![timeline timeline tenant state](./images/017-timeline-data-management/proposed_timeline_tenant_state.svg) + +The reorders aim to put all tasks into separated modules, with strictly defined interfaces and as less knowledge about other components, as possible. +This way, all timeline data is now in the `data_storage`, including the GC, walreceiver, `RemoteTimelineIndex`, `LayerMap`, etc. with some API to get the data in the way, +more convenient for the data sync system inside. +So far, it seems that a few maps with `Arc>` with actual data operations added inside each `SeparateData` struct, if needed. + +`page_cache` is proposed to placed into the same `data_storage` since it contains tenant timelines' data: this way, all metadata and data is in the same struct, simplifying things with Rust's borrow checker and allowing us to share internals between data modules and later might simplify timeline in-memory size tracking. + +`task_manager` is related to data storage and manages all tenant and timeline tasks, manages shared resources (runtimes, thread pools, etcd connection, etc.) and synchronizes tasks. +All locks such as `gc_cs` belong to this module tree, as primitives inherently related to the task synchronization. +Tasks have to access timelines and their metadata, but should do that through `data_storage` API and similar. + +`task_manager` should (re)start, stop and track all tasks that are run in it, selecting an appropriate runtime depending on a task kind (we have async/sync task separation, CPU and IO bound tasks separation, ...) +Some locks such as `layer_removal_cs` one are not needed, if the only component that starts the tasks ensures they don't run concurrently. + +`LayeredTimeline` is still split into two parts, more high-level with whatever primitives needed to sync its state, and the actual state storage with `LayerMap` and other low level entities. +Only `LayerMap` knows what storage it's layer files are taken from (inmem, local FS, etc.), and it's responsible for synchronizing the layers when needed, as also reacting to sync events, successful or not. + +Last but not least, `tenant config file` has to be backed into a remote storage, as tenant-specific information for all timelines. +Tenant and timelines have volatile information that's now partially mixed with constant information (e.g. fields in `metadata` file), that model should be better split and handled, in case we want to properly support its backups and synchronization. + +![proposed timeline data access synchronization(1)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg) + +There's still a need to keep inmemory layer buffer synchronized during layer freezing, yet that could happen on a layer level, not on a timeline level, as `write_lock` used to be, so we could lower the sync primitives one layer deeper, preparing us for download on demand feature, where multiple layers could be concurrently streamed and written from various data sources. 
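+
+As a rough, hypothetical illustration of what "per-layer" synchronization could
+look like (the names are made up and this is not the proposed implementation):
+
+```rust
+// Hypothetical sketch: state and a lock tracked per layer rather than per
+// timeline, so several layers can be downloaded or written concurrently.
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::{Arc, RwLock};
+
+enum LayerState {
+    InMemory,                        // open or frozen in-memory layer
+    OnDisk(PathBuf),                 // flushed to the local FS
+    RemoteOnly(String),              // present only in S3, downloadable on demand
+    Downloading,                     // download in progress
+    DownloadFailed { retries: u32 }, // retried later with backoff
+}
+
+struct LayerEntry {
+    // Each layer carries its own lock; timeline-wide locks are needed only for
+    // structural changes of the map itself, not for layer I/O.
+    state: RwLock<LayerState>,
+}
+
+struct LayerMapSketch {
+    // Keyed by layer file name; this map-level lock guards only insert/remove.
+    layers: RwLock<HashMap<String, Arc<LayerEntry>>>,
+}
+```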
+ +Flushing the frozen layer requires creating a new layer on disk and further remote storage upload, so `LayerMap` has to get those flushed bytes and queue them later: no need to block in the timeline itself for anything again, rather locking on the layer level, if needed. + +![proposed timeline data access synchronization(2)](./images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg) + +Lock diagrams legend: + +![lock diagrams legend](./images/017-timeline-data-management/lock_legend.svg) + +After the frozen layers are flushed, something has to ensure that the layer structure is intact, so a repartitioning lock is needed still, and could also guard the layer map structure changes, since both are needed either way. +This locking belongs to the `LowLevelLayeredTimeline` from the proposed data structure diagram, as the place with all such data being held. + +Similarly, branching is still required to be done after certain Lsn in our current model, but this needs only one lock to synchronize and that could be the `gc_cs: Mutex<()>` lock. +It raises the question of where this lock has to be placed, it's the only place that requires pausing a GC task during external, HTTP request handling. +The right place for the lock seems to be the `task_manager` that could manage GC in more fine-grained way to accommodate the incoming branching request. + +There's no explicit lock sync between GC, compaction or other mutually exclusive tasks: it is a job of the `task_manager` to ensure those are not run concurrently. diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md new file mode 100644 index 0000000000..364f62dd2e --- /dev/null +++ b/docs/rfcs/018-storage-messaging-2.md @@ -0,0 +1,163 @@ +# Storage messaging + +Safekeepers need to communicate to each other to +* Trim WAL on safekeepers; +* Decide on which SK should push WAL to the S3; +* Decide on when to shut down SK<->pageserver connection; +* Understand state of each other to perform peer recovery; + +Pageservers need to communicate to safekeepers to decide which SK should provide +WAL to the pageserver. + +This is an iteration on [015-storage-messaging](https://github.com/neondatabase/neon/blob/main/docs/rfcs/015-storage-messaging.md) describing current situation, +potential performance issue and ways to address it. + +## Background + +What we have currently is very close to etcd variant described in +015-storage-messaging. Basically, we have single `SkTimelineInfo` message +periodically sent by all safekeepers to etcd for each timeline. +* Safekeepers subscribe to it to learn status of peers (currently they subscribe to + 'everything', but they can and should fetch data only for timelines they hold). +* Pageserver subscribes to it (separate watch per timeline) to learn safekeepers + positions; based on that, it decides from which safekeepers to pull WAL. + +Also, safekeepers use etcd elections API to make sure only single safekeeper +offloads WAL. + +It works, and callmemaybe is gone. However, this has a performance +hazard. Currently deployed etcd can do about 6k puts per second (using its own +`benchmark` tool); on my 6 core laptop, while running on tmpfs, this gets to +35k. Making benchmark closer to our usage [etcd watch bench](https://github.com/arssher/etcd-client/blob/watch-bench/examples/watch_bench.rs), +I get ~10k received messages per second with various number of publisher-subscribers +(laptop, tmpfs). 
Dividing this by 12 (3 safekeepers generate the message, 1 pageserver + 3 safekeepers consume it), we
+get about 800 active timelines if a message is sent every second. Not extremely
+low, but quite reachable.
+
+A lot of idle watches seem to be ok though -- which is good, as the pageserver
+subscribes to all its timelines regardless of their activity.
+
+Also, running etcd with fsyncs disabled is messy -- the data dir must be wiped on
+each restart, or there is a risk of corruption errors.
+
+The reason is that etcd does much more than we need; it is a fault-tolerant
+store with strong consistency, but I claim all we need here is just the simplest pub-sub
+with best-effort delivery, because
+* We already have a centralized source of truth for long-running data, like which
+  timelines are on which nodes -- the console.
+* Momentary data (safekeeper/pageserver progress) doesn't make sense to persist.
+  Instead of putting each change into the broker and expecting it to reliably deliver it,
+  it is better to just have a constant flow of data for active timelines: 1) the messages
+  serve as natural heartbeats -- if a node can't send, we shouldn't pull WAL from
+  it; 2) it is simpler -- no need to track delivery to/from the broker.
+  Moreover, latency is important here: the faster we obtain fresh data, the
+  faster we can switch to a proper safekeeper after a failure.
+* As for the WAL offloading leader election, it is trivial to achieve through these
+  heartbeats -- just pick a suitable node through a deterministic rule (min node
+  id). Once the network is stable, this is a converging process (well, except for
+  complicated failure topologies, but even then making it converge is not
+  hard). Such elections bear some risk of several offloaders running
+  concurrently for a short period of time, but that's harmless.
+
+  Generally, if one needs strong consistency, electing a leader per se is not
+  enough; it must be accompanied by a number (a logical clock timestamp), checked at
+  every action to track causality. S3 doesn't provide CAS, so it can't
+  differentiate an old leader from a new one; this must be solved differently.
+
+  We could use etcd CAS (its most powerful/useful primitive, actually) to issue
+  these leader numbers (and e.g. prefix files in s3), but currently I don't see
+  a need for that.
+
+
+Obviously, a best-effort pub-sub is much simpler and more performant; the one proposed is a gRPC broker, described below.
+
+## gRPC broker
+
+I took tonic and [prototyped](https://github.com/neondatabase/neon/blob/asher/neon-broker/broker/src/broker.rs) a replacement for the functionality we currently use,
+built on gRPC streams and tokio mpsc channels. The implementation description is in the file header.
+
+It is just 500 lines of code, and the core functionality is complete. 1-1 pub-sub
+gives about 120k received messages per second; having multiple subscribers in
+different connections quickly scales to 1 million received messages per second.
+I had concerns about many concurrent streams in a single connection, but 2^20
+subscribers still work (though they eat memory: with 10 publishers, 20GB are consumed;
+in this implementation each publisher holds a full copy of all subscribers). There
+is a `bench.rs` nearby which I used for testing.
+
+`SkTimelineInfo` is wired in here, but other messages can be added (e.g. if
+pageservers want to communicate with each other) via templating.
+
+### Fault tolerance
+
+Since such a broker is stateless, we can run it under k8s, or add proxying to
+other members -- with best-effort delivery this is simple.
+
+### Security implications
+
+Communication happens in a private network that is not exposed to users;
+additionally, we can add auth to the broker.
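To make the broker shape more concrete, below is a small, self-contained sketch of the core per-timeline fan-out over tokio mpsc channels with best-effort delivery. The gRPC/tonic wiring of the actual prototype is omitted, and the `Broker`, `publish` and `subscribe` names are illustrative rather than the prototype's real API:

```rust
use std::collections::HashMap;
use std::sync::Mutex;
use tokio::sync::mpsc;

type TimelineId = String;
type SkTimelineInfo = String; // stand-in for the real message type

#[derive(Default)]
struct Broker {
    // Per-timeline list of subscriber channels.
    subscribers: Mutex<HashMap<TimelineId, Vec<mpsc::Sender<SkTimelineInfo>>>>,
}

impl Broker {
    fn subscribe(&self, timeline: &TimelineId) -> mpsc::Receiver<SkTimelineInfo> {
        let (tx, rx) = mpsc::channel(128);
        self.subscribers
            .lock()
            .unwrap()
            .entry(timeline.clone())
            .or_default()
            .push(tx);
        rx
    }

    /// Best-effort delivery: if a subscriber is slow (full channel) or gone,
    /// its message is simply dropped -- the next periodic update will follow.
    fn publish(&self, timeline: &TimelineId, msg: SkTimelineInfo) {
        let mut subs = self.subscribers.lock().unwrap();
        if let Some(list) = subs.get_mut(timeline) {
            list.retain(|tx| !tx.is_closed());
            for tx in list.iter() {
                let _ = tx.try_send(msg.clone());
            }
        }
    }
}
```

In the prototype the same fan-out sits behind tonic gRPC streams, so the subscribers are remote safekeepers and pageservers rather than in-process channels.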
+
+## Alternative: get existing pub-sub
+
+We could take some existing pub-sub solution, e.g. RabbitMQ or Redis. But in this
+case, IMV, the simplicity of our own outweighs the external dependency costs (RabbitMQ is
+much more complicated and needs a VM; Redis Rust client maintenance is not
+ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC
+as well.
+
+## Alternative: direct communication
+
+Apart from being a transport, the broker solves one more task: discovery, i.e. letting
+safekeepers and pageservers find each other. We can let safekeepers know, for
+each timeline, both the other safekeepers for this timeline and the pageservers serving
+it. In this case direct communication is possible:
+ - each safekeeper pushes to each other safekeeper the status of timelines residing
+   on both of them, letting them remove WAL, decide who offloads, and decide on peer
+   recovery;
+ - each safekeeper pushes to each pageserver the status of timelines residing on
+   both of them, letting the pageserver choose which safekeeper to pull WAL from;
+
+It was mostly described in [014-safekeeper-gossip](https://github.com/neondatabase/neon/blob/main/docs/rfcs/014-safekeepers-gossip.md), but I want to recap on that.
+
+The main pro is one less dependency: fewer moving parts, easier to run Neon
+locally/manually, fewer places to monitor. The broker fault tolerance question disappears --
+no Kubernetes or similar needed. To me this is a big thing.
+
+Also (though not a big thing), idle watches for inactive timelines disappear:
+safekeepers naturally learn about the compute connection first and start pushing
+status to the pageserver(s), notifying them that they should pull.
+
+Importantly, I think that eventually knowing and persisting peers and
+pageservers on safekeepers is inevitable:
+- Knowing the peer safekeepers for the timeline is required for correct
+  automatic membership change -- the new member set must be hardened on the old
+  majority before proceeding. It is required to get rid of sync-safekeepers
+  as well (peer recovery up to flush_lsn).
+- Knowing the pageservers where the timeline is attached is needed to
+  1. Understand when to shut down activity on the timeline, i.e. pushing data to
+     the broker. We can have a lot of timelines sleeping quietly which
+     shouldn't occupy resources.
+  2. Preserve WAL for them (currently we offload to s3 and take it from there,
+     but serving locally is better, and we get one less condition on which WAL
+     can be removed from s3).
+
+I suppose this membership data should be passed to safekeepers directly from the
+console because
+1. The console is the original source of this data, so conceptually this is the
+   simplest way (rather than passing it through compute or something).
+2. We already have similar code for deleting a timeline on safekeepers
+   (and attaching/detaching a timeline on the pageserver); this is a typical
+   action -- queue an operation against a storage node and execute it until it
+   completes (or the timeline is dropped).
+
+Cons of direct communication are
+- It is more complicated: each safekeeper should maintain the set of peers it talks
+  to, and the set of timelines for each such peer -- they ought to be multiplexed
+  into a single connection.
+- In total, we have O(n^2) connections instead of O(n) with the broker scheme
+  (still O(n) on each node). However, these are relatively stable, async and
+  thus not very expensive; I don't think this is a big problem. Up to 10k
+  storage nodes, I doubt the connection overhead would be noticeable.
+
+I'd use gRPC for direct communication, and in this sense a gRPC-based broker is a
+step towards it.
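As a small illustration of the deterministic offloader election mentioned above (which applies both with a broker and with direct communication), here is a sketch of the "lowest live node id wins" rule over received heartbeats; the `Heartbeats` type and the TTL handling are assumptions of this sketch, not existing safekeeper code:

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

type NodeId = u64;

/// Last heartbeat seen from each peer (including ourselves), e.g. fed by
/// the per-timeline status messages exchanged via the broker or directly.
struct Heartbeats {
    last_seen: HashMap<NodeId, Instant>,
}

impl Heartbeats {
    /// Deterministic rule: the live node with the smallest id offloads WAL.
    /// A brief window with two offloaders after a failure is harmless here.
    fn am_i_offloader(&self, my_id: NodeId, ttl: Duration) -> bool {
        let now = Instant::now();
        self.last_seen
            .iter()
            .filter(|(_, seen)| now.duration_since(**seen) < ttl)
            .map(|(id, _)| *id)
            .min()
            .map_or(true, |leader| leader == my_id)
    }
}
```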
diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md new file mode 100644 index 0000000000..2734bf17b9 --- /dev/null +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -0,0 +1,91 @@ +# Managing Tenant and Timeline lifecycles + +## Summary + +The pageserver has a Tenant object in memory for each tenant it manages, and a +Timeline for each timeline. There are a lot of tasks that operate on the tenants +and timelines with references to those objects. We have some mechanisms to track +which tasks are operating on each Tenant and Timeline, and to request them to +shutdown when a tenant or timeline is deleted, but it does not cover all uses, +and as a result we have many race conditions around tenant/timeline shutdown. + +## Motivation + +We have a bunch of race conditions that can produce weird errors and can be hard +to track down. + +## Non Goals + +This RFC only covers the problem of ensuring that a task/thread isn't operating +on a Tenant or Timeline. It does not cover what states, aside from Active and +non-Active, each Tenant and Timeline should have, or when exactly the transitions +should happen. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Pageserver. Although I wonder if the safekeeper should have a similar mechanism. + +## Current situation + +Most pageserver tasks of are managed by task_mgr.rs: + +- LibpqEndpointListener +- HttpEndPointListener +- WalReceiverManager and -Connection +- GarbageCollector and Compaction +- InitialLogicalSizeCalculation + +In addition to those tasks, the walreceiver performs some direct tokio::spawn +calls to spawn tasks that are not registered with 'task_mgr'. And all of these +tasks can spawn extra operations with tokio spawn_blocking. + +Whenever a tenant or timeline is removed from the system, by pageserver +shutdown, delete_timeline or tenant-detach operation, we rely on the task +registry in 'task_mgr.rs' to wait until there are no tasks operating on the +tenant or timeline, before its Tenant/Timeline object is removed. That relies on +each task to register itself with the tenant/timeline ID in +'task_mgr.rs'. However, there are many gaps in that. For example, +GarbageCollection and Compaction tasks are registered with the tenant, but when +they proceed to operate on a particular timeline of the tenant, they don't +register with timeline ID. Because of that, the timeline can be deleted while GC +or compaction is running on it, causing failures in the GC or compaction (see +https://github.com/neondatabase/neon/issues/2442). + +Another problem is that the task registry only works for tokio Tasks. There is +no way to register a piece of code that runs inside spawn_blocking(), for +example. + +## Proposed implementation + +This "voluntary" registration of tasks is fragile. Let's use Rust language features +to enforce that a tenant/timeline cannot be removed from the system when there is +still some code operating on it. + +Let's introduce new Guard objects for Tenant and Timeline, and do all actions through +the Guard object. Something like: + +TenantActiveGuard: Guard object over Arc. When you acquire the guard, +the code checks that the tenant is in Active state. If it's not, you get an +error. You can change the state of the tenant to Stopping while there are +ActiveTenantGuard objects still on it, to prevent new ActiveTenantGuards from +being acquired, but the Tenant cannot be removed until all the guards are gone. 
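A rough sketch of what such a guard could look like; the `Tenant` fields, the counter-based tracking and the error handling here are illustrative only, not a proposed final implementation:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};

#[derive(PartialEq)]
enum TenantState {
    Active,
    Stopping,
}

struct Tenant {
    state: RwLock<TenantState>,
    active_guards: AtomicUsize,
}

/// Holding this guard means some code is operating on the tenant;
/// the tenant can be moved to Stopping, but not removed, while guards exist.
struct TenantActiveGuard {
    tenant: Arc<Tenant>,
}

impl TenantActiveGuard {
    fn acquire(tenant: Arc<Tenant>) -> Result<Self, &'static str> {
        // Hold the state lock so a concurrent switch to Stopping cannot race
        // with the guard-count increment.
        let state = tenant.state.read().unwrap();
        if *state != TenantState::Active {
            return Err("tenant is not active");
        }
        tenant.active_guards.fetch_add(1, Ordering::SeqCst);
        drop(state);
        Ok(TenantActiveGuard { tenant })
    }
}

impl Drop for TenantActiveGuard {
    fn drop(&mut self) {
        // When the count reaches zero and the state is Stopping,
        // the removal path may proceed (signaling is omitted in this sketch).
        self.tenant.active_guards.fetch_sub(1, Ordering::SeqCst);
    }
}
```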
+
+TenantMaintenanceGuard: Like TenantActiveGuard, but can be held even when the
+tenant is not in Active state. Used for operations like attach/detach. Perhaps
+allow only one such guard on a Tenant at a time.
+
+Similarly for Timelines. We don't currently have a "state" on Timeline, but I think
+we need at least two states: Active and Stopping. The Stopping state is used at
+deletion, to prevent new TimelineActiveGuards from appearing, while you wait for
+existing TimelineActiveGuards to die out.
+
+The shutdown signaling, using shutdown_watcher() and is_shutdown_requested(),
+probably also needs changes to deal with the new Guards. The rule is that if you
+have a TenantActiveGuard, and the tenant's state changes from Active to
+Stopping, the is_shutdown_requested() function should return true, and the
+shutdown_watcher() future should return.
+
+This signaling doesn't necessarily need to cover all cases. For example, if you
+have a block of code in spawn_blocking(), it might be acceptable if
+is_shutdown_requested() doesn't return true even though the tenant is in the
+Stopping state, as long as the code finishes reasonably fast. diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md
new file mode 100644
index 0000000000..5e2912ba99
--- /dev/null
+++ b/docs/rfcs/020-pageserver-s3-coordination.md
@@ -0,0 +1,246 @@
+# Coordinating access of multiple pageservers to the same s3 data
+
+## Motivation
+
+There are some blind spots around coordinating access of multiple pageservers
+to the same s3 data. Currently this is applicable only to the tenant relocation
+case, but in the future we'll need to solve similar problems for
+replica/standby pageservers.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Pageserver
+
+## The problem
+
+### Relocation
+
+During relocation both pageservers can write to s3. This should be ok for all
+data except the `index_part.json`. For the index part this causes problems during
+compaction/gc, because those operations remove files from the index/s3.
+
+Imagine this case:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2
Index contains L1 L2 + PS2->>S3: Attach called, sees L1, L2 + PS1->>S3: Compaction comes
Removes L1, adds L3 + note over S3: Index now L2, L3 + PS2->>S3: Uploads new layer L4
(added to previous view of the index) + note over S3: Index now L1, L2, L4 +``` + +At this point it is not possible to restore from index, it contains L2 which +is no longer available in s3 and doesnt contain L3 added by compaction by the +first pageserver. So if any of the pageservers restart initial sync will fail +(or in on-demand world it will fail a bit later during page request from +missing layer) + +### Standby pageserver + +Another related case is standby pageserver. In this case second pageserver can +be used as a replica to scale reads and serve as a failover target in case +first one fails. + +In this mode second pageserver needs to have the same picture of s3 files to +be able to load layers on-demand. To accomplish that second pageserver +cannot run gc/compaction jobs. Instead it needs to receive updates for index +contents. (There is no need to run walreceiver on the second pageserver then). + +## Observations + +- If both pageservers ingest wal then their layer set diverges, because layer + file generation is not deterministic +- If one of the pageservers does not ingest wal (and just picks up layer + updates) then it lags behind and cannot really answer queries in the same + pace as the primary one +- Can compaction help make layers deterministic? E g we do not upload level + zero layers and construction of higher levels should be deterministic. + This way we can guarantee that layer creation by timeout wont mess things up. + This way one pageserver uploads data and second one can just ingest it. + But we still need some form of election + +## Solutions + +### Manual orchestration + +One possible solution for relocation case is to orchestrate background jobs +from outside. The oracle who runs migration can turn off background jobs on +PS1 before migration and then run migration -> enable them on PS2. The problem +comes if migration fails. In this case in order to resume background jobs +oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +respond then PS1 is stuck unable to run compaction/gc. This cannot be solved +without human ensuring that no upload from PS2 can happen. In order to be able +to resolve this automatically CAS is required on S3 side so pageserver can +avoid overwriting index part if it is no longer the leading one + +Note that flag that disables background jobs needs to be persistent, because +otherwise pageserver restart will clean it + +### Avoid index_part.json + +Index part consists of two parts, list of layers and metadata. List of layers +can be easily obtained by `ListObjects` S3 API method. But what to do with +metadata? Create metadata instance for each checkpoint and add some counter +to the file name? + +Back to potentially long s3 ls. + +### Coordination based approach + +Do it like safekeepers chose leader for WAL upload. Ping each other and decide +based on some heuristics e g smallest node id. During relocation PS1 sends +"resign" ping message so others can start election without waiting for a timeout. + +This still leaves metadata question open and non deterministic layers are a +problem as well + +### Avoid metadata file + +One way to eliminate metadata file is to store it in layer files under some +special key. This may resonate with intention to keep all relation sizes in +some special segment to avoid initial download during size calculation. +Maybe with that we can even store pre calculated value. + +As a downside each checkpoint gets 512 bytes larger. 
+
+If we entirely avoid the metadata file, this opens up many approaches.
+
+* * *
+
+During discussion it seems that we converged on the approach consisting of:
+
+- index files stored per pageserver in the same timeline directory. With that,
+  the index file name starts to look like `<pageserver_id>_index_part.json`.
+  In such a setup there are no concurrent overwrites of the index file by different
+  pageservers.
+- For replica pageservers the solution would be for the primary to broadcast index
+  changes to any followers, with an ability to check index files in s3 and
+  restore the full state. To properly merge changes with index files we can use
+  a counter that is persisted in an index file, is incremented on every change
+  to it, and is passed along with the broadcasted change. This way we can determine
+  whether we need to apply a change to the index state or not.
+- Responsibility for running background jobs is assigned externally. The pageserver
+  keeps a locally persistent flag for each tenant that indicates whether this
+  pageserver is considered the primary one or not. TODO: what happens if we
+  crash and cannot start for some extended period of time? The control plane can
+  assign ownership to some other pageserver. The pageserver needs some way to check
+  if it's still the blessed one, maybe by an explicit request to the control plane on
+  start.
+
+The requirement for deterministic layer generation was considered overly strict
+for two reasons:
+
+- It can limit possible optimizations, e.g. when a pageserver wants to reshuffle
+  some data locally and doesn't want to coordinate this.
+- The deterministic algorithm itself can change, so during deployments for some
+  time there will be two different versions running at the same time, which can
+  cause non-determinism.
+
+### External elections
+
+The above case of lost state, in the schema with externally managed
+leadership, is represented like this:
+
+Note that here we keep the objects list in the index file.
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant CP as Control Plane
+    participant S3
+    participant PS2
+
+    note over PS1,PS2: PS1 starts up and is still the leader
+    PS1->>CP: Am I still the leader for Tenant X?
+    activate CP
+    CP->>PS1: Yes
+    deactivate CP
+    PS1->>S3: Fetch PS1 index.
+    note over PS1: Continue operations, start background jobs
+    note over PS1,PS2: PS1 starts up and is not a leader anymore
+    PS1->>CP: Am I still the leader for Tenant X?
+    CP->>PS1: No
+    PS1->>PS2: Subscribe to index changes
+    PS1->>S3: Fetch PS1 and PS2 indexes
+    note over PS1: Combine index file to include layers
from both indexes to be able
to see newer files from leader (PS2) + note over PS1: Continue operations, do not start background jobs +``` + +### Internal elections + +To manage leadership internally we can use broker to exchange pings so nodes +can decide on the leader roles. In case multiple pageservers are active leader +is the one with lowest node id. + +Operations with internally managed elections: + +```mermaid +sequenceDiagram + autonumber + participant PS1 + participant S3 + + note over PS1: Starts up + note over PS1: Subscribes to changes, waits for two ping
timeouts to see if there is a leader + PS1->>S3: Fetch indexes from s3 + alt there is a leader + note over PS1: do not start background jobs,
continue applying index updates + else there is no leader + note over PS1: start background jobs,
broadcast index changes + end + + note over PS1,S3: Then the picture is similar to external elections
the difference is that follower can become a leader
if there are no pings after some timeout new leader gets elected +``` + +### Eviction + +When two pageservers operate on a tenant for extended period of time follower +doesnt perform write operations in s3. When layer is evicted follower relies +on updates from primary to get info about layers it needs to cover range for +evicted layer. + +Note that it wont match evicted layer exactly, so layers will overlap and +lookup code needs to correctly handle that. + +### Relocation flow + +Actions become: + +- Attach tenant to new pageserver +- New pageserver becomes follower since previous one is still leading +- New pageserver starts replicating from safekeepers but does not upload layers +- Detach is called on the old one +- New pageserver becomes leader after it realizes that old one disappeared + +### Index File + +Using `s3 ls` on startup simplifies things, but we still need metadata, so we +need to fetch index files anyway. If they contain list of files we can combine +them and avoid costly `s3 ls` + +### Remaining issues + +- More than one remote consistent lsn for safekeepers to know + +Anything else? + +### Proposed solution + +To recap. On meeting we converged on approach with external elections but I +think it will be overall harder to manage and will introduce a dependency on +control plane for pageserver. Using separate index files for each pageserver +consisting of log of operations and a metadata snapshot should be enough. + +### What we need to get there? + +- Change index file structure to contain log of changes instead of just the + file list +- Implement pinging/elections for pageservers diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md new file mode 100644 index 0000000000..f7b0b3a587 --- /dev/null +++ b/docs/rfcs/README.md @@ -0,0 +1,95 @@ +This directory contains Request for Comments documents, or RFCs, for +features or concepts that have been proposed. Alternative names: +technical design doc, ERD, one-pager + +To make a new proposal, create a new text file in this directory and +open a Pull Request with it. That gives others a chance and a forum +to comment and discuss the design. + +When a feature is implemented and the code changes are committed, also +include the corresponding RFC in this directory. + +Some of the RFCs in this directory have been implemented in some form +or another, while others are on the roadmap, while still others are +just obsolete and forgotten about. So read them with a grain of salt, +but hopefully even the ones that don't reflect reality give useful +context information. + +## What + +We use Tech Design RFC’s to summarize what we are planning to +implement in our system. These RFCs should be created for large or not +obvious technical tasks, e.g. changes of the architecture or bigger +tasks that could take over a week, changes that touch multiple +components or their interaction. RFCs should fit into a couple of +pages, but could be longer on occasion. + +## Why + +We’re using RFCs to enable early review and collaboration, reduce +uncertainties, risk and save time during the implementation phase that +follows the Tech Design RFC. + +Tech Design RFCs also aim to avoid bus factor and are an additional +measure to keep more peers up to date & familiar with our design and +architecture. + +This is a crucial part for ensuring collaboration across timezones and +setting up for success a distributed team that works on complex +topics. 
+ +## Prior art + +- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md) +- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md) +- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE) +- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process) + +## How + +RFC lifecycle: + +- Should be submitted in a pull request with and full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body. +- RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC. +- Add labels to the PR in the same manner as you do Issues. Example TBD +- Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code. +- The Tech Design RFC should evolve based on the feedback received and further during the development phase if problems are discovered with the taken approach +- RFCs stop evolving once the consensus is found or the proposal is implemented and merged. +- RFCs are not intended as a documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when merged functionality evolves later on. In such situation a new RFC may be appropriate. + +### RFC template + +Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration. + +``` +# Name +Created on .. +Implemented on .. + +## Summary + +## Motivation + +## Non Goals (if relevant) + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) + +``` diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md new file mode 100644 index 0000000000..4ef006d9a6 --- /dev/null +++ b/docs/rfcs/cluster-size-limits.md @@ -0,0 +1,79 @@ +Cluster size limits +================== + +## Summary + +One of the resource consumption limits for free-tier users is a cluster size limit. + +To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations. +If the limit is reached, the query must fail with some meaningful error/warning. +We may want to exempt some operations from the quota to allow users free space to fit back into the limit. + +The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components. + +## Motivation + +Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future). 
+First of all, this is needed to control our free tier production costs. +Another reason to limit resources is risk management — we haven't (fully) tested and optimized neon for big clusters, +so we don't want to give users access to the functionality that we don't think is ready. + +## Components + +* pageserver - calculate the size consumed by a timeline and add it to the feedback message. +* safekeeper - pass feedback message from pageserver to compute. +* compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`. +* console - set and update `neon.max_cluster_size` setting + +## Proposed implementation + +First of all, it's necessary to define timeline size. + +The current approach is to count all data, including SLRUs. (not including WAL) +Here we think of it as a physical disk underneath the Postgres cluster. +This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver. + +Alternatively, we could count only relation data. As in pg_database_size(). +This approach is somewhat more user-friendly because it is the data that is really affected by the user. +On the other hand, it puts us in a weaker position than other services, i.e., RDS. +We will need to refactor the timeline_size counter or add another counter to implement it. + +Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment. +Then this size should be reported to compute node. + +`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` + +(PR about protocol changes https://github.com/neondatabase/neon/pull/1037). + +This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`. + +Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. + +And then every neon_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) + +TODO: +We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`. +It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level. +See issues https://github.com/neondatabase/neon/issues/1245 +https://github.com/neondatabase/neon/issues/1445 + +TODO: +We should warn users if the limit is soon to be reached. + +### **Reliability, failure modes and corner cases** + +1. `current_timeline_size` is valid at the last received and digested by pageserver lsn. + + If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time. + + So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this? + + Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue. + + +### **Security implications** + +We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM. 
+Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check. +To cover this case, we also monitor the compute node size in the console. diff --git a/docs/rfcs/images/017-timeline-data-management/lock_legend.svg b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg new file mode 100644 index 0000000000..d6d2bc00ae --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/lock_legend.svg @@ -0,0 +1,4 @@ + + + +
Lock interaction legend:

Lock interaction legend:...
LOCK NAME
LOCK NAME
LOCK NAME
LOCK NAME
Event flow
Event flow
or
or
lock acquisition, 
every lock is shown with a single lines
Different lines of the same shape denote different locks
lock acquisition,...
Continuous lock acquisition,
lock release is explicitly shown later
Continuous lock acquisition,...
Lock release
Lock release
Instant lock acquisition and release
Instant lock acquisition and rele...
Lock details (RwLock/Mutex)
are shown on the corresponding arrows
and lock names
Lock details (RwLock/Mutex)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..d1c97d1738 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
layer_write_lock.lock()
layer_write_lock.lock()
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
held through entire freezing
held through entire freezing
flush_frozen_layers
schedules the operation in to LayerMap
flush_frozen_layers...

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
checkpoint(Flush)
checkpoint(Flush)
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..81918fcd98 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
gc
gc
compact
compact
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
checkpoint(Forced)
checkpoint(Forced)
takes the lock when ready to do gc
holds during entire operation
takes the lock when ready to do gc...
gc_cs.lock()
gc_cs.lock()
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds lock during
entire operation
holds lock during...
holds lock during
entire branching
holds lock during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg new file mode 100644 index 0000000000..207017fb1b --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/proposed_timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +sLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
Tenanta number of maps with Arc<RwLock<Data>> patternfor tenants, timelines, gc, walreceiver, remove storage, etc. metadataLayeredTimelinewrite_lock: Mutex<()>latest_gc_cutoff_lsn: RwLock<Lsn>process: Mutex<Option<PostgresRedoPorcess>> inside               PostgresRedoManagercompactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLowLevelLayeredTimelinepartitioning: Mutex<(KeyPartitioning, Lsn)>layers: RwLock<LayerMap>
tenant contains timeline layer data
tenant con...
remote storage syncstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Tasks interact with layers, via LayerMap
Tasks interact with layers, via LayerMap
task_managerruntime, threadpools, shared connections (etcd), etc.logic to manage tenant/timeline taskstenant config file in any form
layer map schedules sync tasks
and calls logic on their completion
layer map schedules sync tasks...
page cachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>tenant storageHashMap<TenantId, Tenant>Tenant state information, its sync and task manager interaction
layer map manages local and remote files
in a queue-based manner
layer map manages local and remote files...
tasks update or read metadata via the storage
tasks update or read metadata via the storage
Legend:
Legend:
interaction between components,
arrows show which component does the data access
interaction between components,...
data relation,
arrows show where current data is contained in
data relation,...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg new file mode 100644 index 0000000000..b968fedd8c --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_1.svg @@ -0,0 +1,4 @@ + + + +
walreceiver loop
walreceiver loop
DatadirModification::flush after every file
DatadirModification::flush aft...
HTTP API call
to create an empty timeline
HTTP API call...
libpq call
to import basebackup archive
libpq call...
libpq call
to import wal
libpq call...
zenith.signal
file processed
zenith.signal...
process timeline wal
(walingest)
process timeline wal...
DatadirModification::commit
DatadirModification::commit
process timeline wal
(walingest)
process timeline wal...
process timeline wal
(walingest)
process timeline wal...
process timeline files
process timeline files
DatadirModification::commit
DatadirModification::commit
write_lock.lock()
w...
timeline::writer call
timeline::writer call
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
DatadirModification::commit
after all files processed
after all files processed
and
and
timeline::writer call
timeline::writer call
checkpoint(Flush)
checkpoint(Flush)
checkpoint(Forced)
checkpoint(Forced)
checkpoint(Flush)
checkpoint(Flush)
check_checkpoint_distance
check_checkpoint_distance
checkpoint(Forced)
checkpoint(Forced)
libpq call
to checkpoint
libpq call...
checkpoint(Forced)
checkpoint(Forced)
libpq call
to do_gc
libpq call...
checkpoint(Flush)
checkpoint(Flush)
shutdown() system call
shutdown() system call
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex inside the repo
additionally: every time the timeline is accessed, it's done via .lock() on the timeline mutex i...
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
held through entire freezing
h...
 layer_flush_lock.lock() 
...
skips both flushes if the lock is taken
s...
skips the flush if the lock is taken 
s...
always waits for the lock
and runs
frozen layers flush 
holding the lock
always waits f...
flush_frozen_layers(false)
flush_frozen_layers(false)

freeze_inmem_layer(true)

freeze_inmem_layer(true)...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg new file mode 100644 index 0000000000..382d834517 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_data_access_sync_2.svg @@ -0,0 +1,4 @@ + + + +
libpq pagerequest calls
basebackup
libpq pagerequest calls...
libpq do_gc call
libpq do_gc call
periodic GC
periodic GC
checkpoint(Forced)
checkpoint(Forced)
periodic compaction
periodic compaction
tenant idle/detach
shutdown
tenant idle/detach...
gc
gc
compact
compact
lock is held for
almost entire operations
lock is held for...
RwLock(file_lock)
RwLock(file_lock)
read
read
read
read
write
write
HTTP API call
delete timeline
HTTP API call...
layer_removal_cs.lock()
layer_removal_cs.lock()
lock is held for
the entire operation
lock is held for...
partitioning.lock()
partitioning.lock()
gc
gc
compact
compact
HTTP API call
to branch a timeline
HTTP API call...
gc_cs.lock()
gc_cs.lock()
held during entire
branching
held during entire...
checkpoint(Forced)
checkpoint(Forced)
write updated value,
release the lock
write updated value,...
RwLock(latest_gc_cutoff_lsn)
RwLock(latest_gc_cutoff_lsn)
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
other checkpoint sources
other checkpoint sources
freeze_inmem_layer(false)
flush_frozen_layers(true)
freeze_inmem_layer(false)...
holds read during
enire operation
holds read during...
holds read during
enire branching
holds read during...
wait_or_get_last_lsn
@
page request Lsn
wait_or_get_last_lsn...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg new file mode 100644 index 0000000000..c4bc36f309 --- /dev/null +++ b/docs/rfcs/images/017-timeline-data-management/timeline_tenant_state.svg @@ -0,0 +1,4 @@ + + + +                                             Tasks                                                                                                                                                                   StateLayer 1Layer 2
contained in
contained in
metadataLayer 1
...
...
...
...
index_part.json
Files in the remote storage
Files in the remote storage
Files in the local FS
Files in the local FS
LayeredRepositorytimelines: Mutex<HashMap<TimelineId, LayeredTimeline>>gc_cs: Mutex<()>file_lock: RwLock<()>tenant_conf: Arc<RwLock<TenantConfOpt>>remote_index: Arc<RwLock<HashMap<                        TenantTimelineId, RemoteTimelineMetadata>>tenant_mgrstatic ref TENANTS: RwLock<HashMap<TenantId, Tenant>>Tenantstate: TenantStaterepo: Arc<LayeredRepository>local_timelines: HashMap<TimelineId, Arc<DatadirTimelineImpl>>PageCachematerialized_page_map: RwLock<HashMap<...>>ephemeral_page_map: RwLock<HashMap<...>>immutable_page_map: RwLock<HashMap<...>>DatadirTimelineImplpartitioning: Mutex<(KeyPartitioning, Lsn)>tline: Arc<LayeredTimeline>compactionPeriodically runs on all tenant timelines, each processed separately. Merges (removes and adds) layer fileswalreceiver tasksetcd subscriptions, periodic timeline writes and checkpointstenant config fileLayeredTimelinewrite_lock: Mutex<()>layer_flush_lock: Mutex<()>layer_removal_cs: Mutex<()>latest_gc_cutoff_lsn: RwLock<Lsn>tenant_conf: Arc<RwLock<TenantConfOpt>>gc_info: RwLock<GcInfo>process: Mutex<Option<PostgresRedoPorcess>> inside               PostgresRedoManagerlayers: RwLock<LayerMap>layer flush taskPer timeline, moves in-memory data to disk when scheduled (adds layers)remote storage sync taskstorage sync queue and S3 connectionsperiodically writes into the remote indexgcPeriodically runs on all tenant timelines, with shared context.Removes layer files
Backed by repository:
Backed by repository:
get page requests lookup and update
get page requests lookup and update
flushes new files on disk, loads existing into memory
flushes new files on disk, loads existing into memory
Tasks interact with files on disk, full CRUD
Remote storage sync task is the only one to interact with other storage
Tasks interact with files on disk, full CRUD...
schedules layer sync
schedules layer sync
Text is not SVG - cannot display
\ No newline at end of file diff --git a/docs/rfcs/images/storage.jpeg b/docs/rfcs/images/storage.jpeg new file mode 100644 index 0000000000..1d72a018dc Binary files /dev/null and b/docs/rfcs/images/storage.jpeg differ diff --git a/walkeeper/README_PROTO.md b/docs/safekeeper-protocol.md similarity index 97% rename from walkeeper/README_PROTO.md rename to docs/safekeeper-protocol.md index 5d79f8c2d3..a2d4fa455d 100644 --- a/walkeeper/README_PROTO.md +++ b/docs/safekeeper-protocol.md @@ -143,7 +143,7 @@ Restart of PostgreSQL initiates new round of voting and switching new epoch. ## Limitations Right now message queue is maintained in main memory and is not spilled to the disk. It can cause memory overflow in case of presence of lagging safekeepers. -It is assumed that in case of loosing local data by some safekeepers, it should be recovered using some external mechanism. +It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism. ## Glossary @@ -151,9 +151,9 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) -* `Pager`: Zenith component restoring pages from WAL stream -* `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `Pager`: Neon component restoring pages from WAL stream +* `Replica`: read-only computation node +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. ## Algorithm diff --git a/docs/separation-compute-storage.md b/docs/separation-compute-storage.md new file mode 100644 index 0000000000..f07fa8b6dc --- /dev/null +++ b/docs/separation-compute-storage.md @@ -0,0 +1,8 @@ +# Separation of Compute and Storage + +TODO: + +- Read path +- Write path +- Durability model +- API auth diff --git a/docs/settings.md b/docs/settings.md index 571cfba8df..878681fce1 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override the values in the config file, if any are specified for the same key and get into the final config during init phase. - ### Config example ```toml @@ -16,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes -checkpoint_period = '1 s' +checkpoint_timeout = '10m' gc_period = '100 s' gc_horizon = '67108864' @@ -24,29 +23,44 @@ gc_horizon = '67108864' max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zenith_admin' +initial_superuser_name = 'cloud_admin' + +broker_etcd_prefix = 'neon' +broker_endpoints = ['some://etcd'] # [remote_storage] ``` -The config above shows default values for all basic pageserver settings. +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. 
Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'cloud_admin'` -* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` +- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` ### Config values All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form. -Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"` +Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"` Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. +#### broker_endpoints + +A list of endpoints (etcd currently) to connect and pull the information from. +Mandatory, does not have a default, since requires etcd to be started as a separate process, +and its connection url should be specified separately. + +#### broker_etcd_prefix + +A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster. +Default is `neon`. + #### checkpoint_distance `checkpoint_distance` is the amount of incoming WAL that is held in @@ -57,7 +71,7 @@ but it will trigger a checkpoint operation to get it back below the limit. `checkpoint_distance` also determines how much WAL needs to be kept -durable in the safekeeper. The safekeeper must have capacity to hold +durable in the safekeeper. The safekeeper must have capacity to hold this much WAL, with some headroom, otherwise you can get stuck in a situation where the safekeeper is full and stops accepting new WAL, but the pageserver is not flushing out and releasing the space in the @@ -68,11 +82,23 @@ S3. The unit is # of bytes. -#### checkpoint_period +#### checkpoint_timeout -The pageserver checks whether `checkpoint_distance` has been reached -every `checkpoint_period` seconds. Default is 1 s, which should be -fine. +Apart from `checkpoint_distance`, open layer flushing is also triggered +`checkpoint_timeout` after the last flush. This makes WAL eventually uploaded to +s3 when activity is stopped. + +The default is 10m. + +#### compaction_period + +Every `compaction_period` seconds, the page server checks if +maintenance operations, like compaction, are needed on the layer +files. Default is 1 s, which should be fine. + +#### compaction_target_size + +File sizes for L0 delta and L1 image layers. Default is 128MB. #### gc_horizon @@ -85,11 +111,33 @@ away. Interval at which garbage collection is triggered. Default is 100 s. +#### image_creation_threshold + +L0 delta layer threshold for L1 image layer creation. Default is 3. + +#### pitr_interval + +WAL retention duration for PITR branching. Default is 30 days. + +#### walreceiver_connect_timeout + +Time to wait to establish the wal receiver connection before failing + +#### lagging_wal_timeout + +Time the pageserver did not get any WAL updates from safekeeper (if any). 
+Avoids lagging pageserver preemptively by forcing to switch it from stalled connections. + +#### max_lsn_wal_lag + +Difference between Lsn values of the latest available WAL on safekeepers: if currently connected safekeeper starts to lag too long and too much, +it gets swapped to the different one. + #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant is initialized. It doesn't affect anything after initialization. The -default is Note: The default is 'zenith_admin', and the console +default is Note: The default is 'cloud_admin', and the console depends on that, so if you change it, bad things will happen. #### page_cache_size @@ -107,14 +155,16 @@ for other files and for sockets for incoming connections. #### pg_distrib_dir A directory with Postgres installation to use during pageserver activities. +Since pageserver supports several postgres versions, `pg_distrib_dir` contains +a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. -The default distrib dir is `./tmp_install/`. +The default distrib dir is `./pg_install/`. #### workdir (-D) A directory in the file system, where pageserver will store its files. -The default is `./.zenith/`. +The default is `./.neon/`. This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way. @@ -151,30 +201,28 @@ bucket_region = 'eu-north-1' # Optional, pageserver uses entire bucket if the prefix is not specified. prefix_in_bucket = '/some/prefix/' -# Access key to connect to the bucket ("login" part of the credentials) -access_key_id = 'SOMEKEYAAAAASADSAH*#' - -# Secret access key to connect to the bucket ("password" part of the credentials) -secret_access_key = 'SOMEsEcReTsd292v' +# S3 API query limit to avoid getting errors/throttling from AWS. +concurrency_limit = 100 ``` +If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials. + ###### General remote storage configuration -Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. +Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. No default values are used for the remote storage configuration parameters. Besides, there are parameters common for all types of remote storage that can be configured, those have defaults: ```toml [remote_storage] -# Max number of concurrent connections to open for uploading to or downloading from the remote storage. -max_concurrent_sync = 100 +# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time. +max_concurrent_syncs = 50 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore. max_sync_errors = 10 ``` - ## safekeeper TODO diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 8d35d35f2f..4b4efcecd7 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentaion of the Zenith features and concepts. +Documentation of the Neon features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -19,7 +19,7 @@ TODO `/pageserver`: -Zenith storage service. 
+Neon storage service. The pageserver has a few different duties: - Store and manage the data. @@ -28,12 +28,7 @@ The pageserver has a few different duties: - Receive WAL from the WAL service and decode it. - Replay WAL that's applicable to the chunks that the Page Server maintains -For more detailed info, see `/pageserver/README` - -`/postgres_ffi`: - -Utility functions for interacting with PostgreSQL file formats. -Misc constants, copied from PostgreSQL headers. +For more detailed info, see [pageserver-services.md](./pageserver-services.md) `/proxy`: @@ -45,41 +40,49 @@ and create new databases and accounts (control plane API in our case). Integration tests, written in Python using the `pytest` framework. -`/vendor/postgres`: +`/vendor/postgres-v14`: -PostgreSQL source tree, with the modifications needed for Zenith. +PostgreSQL source tree, with the modifications needed for Neon. -`/vendor/postgres/contrib/zenith`: +`/pgxn/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. -`/vendor/postgres/contrib/zenith_test_utils`: +`/pgxn/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. -`/walkeeper`: +`/pgxn/neon_walredo`: -The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. +Library to run Postgres as a "WAL redo process" in the pageserver. + +`/safekeeper`: + +The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver. It acts as a holding area and redistribution center for recently generated WAL. -For more detailed info, see `/walkeeper/README` +For more detailed info, see [walservice.md](./walservice.md) `/workspace_hack`: The workspace_hack crate exists only to pin down some dependencies. -`/zenith` +We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation. -Main entry point for the 'zenith' CLI utility. -TODO: Doesn't it belong to control_plane? +`/libs`: +Unites granular neon helper crates under the hood. -`/zenith_metrics`: +`/libs/postgres_ffi`: +Utility functions for interacting with PostgreSQL file formats. +Misc constants, copied from PostgreSQL headers. + +`/libs/utils`: +Generic helpers that are shared between other crates in this repository. +A subject for future modularization. + +`/libs/metrics`: Helpers for exposing Prometheus metrics from the server. -`/zenith_utils`: - -Helpers that are shared between other crates in this repository. - ## Using Python Note that Debian/Ubuntu Python packages are stale, as it commonly happens, so manual installation of dependencies is not recommended. @@ -87,28 +90,34 @@ so manual installation of dependencies is not recommended. A single virtual environment with all dependencies is described in the single `Pipfile`. ### Prerequisites -- Install Python 3.7 (the minimal supported version) or greater. - - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. - - If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.: +- Install Python 3.9 (the minimal supported version) or greater. + - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. 
+ - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu sudo add-apt-repository ppa:deadsnakes/ppa sudo apt update - sudo apt install python3.7 + sudo apt install python3.9 ``` - Install `poetry` - - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`. -- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI. + - Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation). +- Install dependencies via `./scripts/pysync`. + - Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile)) + so if you have different version some linting tools can yield different result locally vs in the CI. + - You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`. + This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning. Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `yapf` and type hints via `mypy`. -Run the following commands in the repository's root (next to `setup.cfg`): +We force code formatting via `black`, `isort` and type hints via `mypy`. +Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run yapf -ri . # All code is reformatted +poetry run isort . # Imports are reformatted +poetry run black . # All code is reformatted +poetry run flake8 . # Python linter poetry run mypy . # Ensure there are no typing errors ``` @@ -117,10 +126,59 @@ Otherwise it will not find its configuration. Also consider: -* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any. +* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any. * Adding more type hints to your code to avoid `Any`. ### Changing dependencies To add new package or change an existing one you can use `poetry add` or `poetry update` or edit `pyproject.toml` manually. Do not forget to run `poetry lock` in the latter case. More details are available in poetry's [documentation](https://python-poetry.org/docs/). + +## Configuring IDEs +Neon consists of three projects in different languages which use different project models. + +* A bunch of Rust crates, all available from the root `Cargo.toml`. +* Integration tests in Python in the `test_runner` directory. Some stand-alone Python scripts exist as well. +* Postgres and our Postgres extensions in C built with Makefiles under `vendor/postgres` and `pgxn`. + +### CLion +You can use CLion with the [Rust plugin](https://plugins.jetbrains.com/plugin/8182-rust) to develop Neon. It should pick up Rust and Python projects whenever you open Neon's repository as a project. We have not tried setting up a debugger, though. + +C code requires some extra care, as it's built via Make, not CMake. 
Some of our developers have successfully used [compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_generate) for CLion. It is a JSON file which lists all C source files and corresponding compilation keys. CLion can use it instead of `CMakeLists.txt`. To set up a project with a compilation database: + +1. Clone the Neon repository and install all dependencies, including Python. Do not open it with CLion just yet. +2. Run the following commands in the repository's root: + ```bash + # Install a `compiledb` tool which can parse make's output and generate the compilation database. + poetry add -D compiledb + # Clean the build tree so we can rebuild from scratch. + # Unfortunately, our and Postgres Makefiles do not work well with either --dry-run or --assume-new, + # so we don't know a way to generate the compilation database without recompiling everything, + # see https://github.com/neondatabase/neon/issues/2378#issuecomment-1241421325 + make distclean + # Rebuild the Postgres parts from scratch and save the compilation commands to the compilation database. + # You can alter the -j parameter to your liking. + # Note that we only build for a specific version of Postgres. The extension code is shared, but headers are + # different, so we set up CLion to only use a specific version of the headers. + make -j$(nproc) --print-directory postgres-v15 neon-pg-ext-v15 | poetry run compiledb --verbose --no-build + # Uninstall the tool + poetry remove -D compiledb + # Make sure the compile_commands.json file is not committed. + echo /compile_commands.json >>.git/info/exclude + ``` +3. Open CLion, click "Open File or Project" and choose the generated `compile_commands.json` file to be opened "as a project". You cannot add a compilation database into an existing CLion project, you have to create a new one. _Do not_ open the directory as a project, open the file. +4. The newly created project should start indexing Postgres source code in C, as well as the C standard library. You may have to [configure the C compiler for the compilation database](https://www.jetbrains.com/help/clion/compilation-database.html#compdb_toolchain). +5. Open the `Cargo.toml` file in an editor in the same project. CLion should pick up the hint and start indexing Rust code. +6. Now you have a CLion project which knows about C files, Rust files. It should pick up Python files automatically as well. +7. Set up correct code indentation in CLion's settings: Editor > Code Style > C/C++, choose the "Project" scheme on the top, and tick the "Use tab character" on the "Tabs and Indents" tab. Ensure that "Tab size" is 4. + +You can also enable Cargo Clippy diagnostics and enable Rustfmt instead of built-in code formatter. + +Whenever you change layout of C files, you may need to regenerate the compilation database. No need to re-create the CLion project, changes should be picked up automatically. + +Known issues (fixes and suggestions are welcome): + +* Test results may be hard to read in CLion, both for unit tests in Rust and integration tests in Python. Use command line to run them instead. +* CLion does not support non-local Python interpreters, unlike PyCharm. E.g. if you use WSL, CLion does not see `poetry` and installed dependencies. Python support is limited. +* Cargo Clippy diagnostics in CLion may take a lot of resources. +* `poetry add -D` updates some packages and changes `poetry.lock` drastically even when followed by `poetry remove -D`. 
Feel free to `git checkout poetry.lock` and `./scripts/pysync` to revert these changes. diff --git a/walkeeper/README b/docs/walservice.md similarity index 95% rename from walkeeper/README rename to docs/walservice.md index 4407837463..4e6db0c5a4 100644 --- a/walkeeper/README +++ b/docs/walservice.md @@ -1,12 +1,13 @@ # WAL service -The zenith WAL service acts as a holding area and redistribution +The neon WAL service acts as a holding area and redistribution center for recently generated WAL. The primary Postgres server streams the WAL to the WAL safekeeper, and treats it like a (synchronous) replica. A replication slot is used in the primary to prevent the primary from discarding WAL that hasn't been streamed to the WAL service yet. +``` +--------------+ +------------------+ | | WAL | | | Compute node | ----------> | WAL Service | @@ -23,7 +24,7 @@ service yet. | Pageservers | | | +--------------+ - +``` The WAL service consists of multiple WAL safekeepers that all store a @@ -31,6 +32,7 @@ copy of the WAL. A WAL record is considered durable when the majority of safekeepers have received and stored the WAL to local disk. A consensus algorithm based on Paxos is used to manage the quorum. +``` +-------------------------------------------+ | WAL Service | | | @@ -48,7 +50,7 @@ consensus algorithm based on Paxos is used to manage the quorum. | +------------+ | | | +-------------------------------------------+ - +``` The primary connects to the WAL safekeepers, so it works in a "push" fashion. That's different from how streaming replication usually @@ -73,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README_PROTO.md for a more detailed desription of the consensus -protocol. spec/ contains TLA+ specification of it. +See [this section](safekeeper-protocol.md) for a more detailed description of +the consensus protocol. spec/ contains TLA+ specification of it. # Q&A @@ -92,7 +94,7 @@ Q: What if the compute node evicts a page, needs it back, but the page is yet A: If the compute node has evicted a page, changes to it have been WAL-logged (that's why it is called Write Ahead logging; there are some exceptions like index builds, but these are exceptions). These WAL records will eventually - reach the Page Server. The Page Server notes that the compute note requests + reach the Page Server. The Page Server notes that the compute node requests pages with a very recent LSN and will not respond to the compute node until a corresponding WAL is received from WAL safekeepers. diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml new file mode 100644 index 0000000000..b18dcbe5a3 --- /dev/null +++ b/libs/etcd_broker/Cargo.toml @@ -0,0 +1,18 @@ +[package] + name = "etcd_broker" + version = "0.1.0" + edition = "2021" + + [dependencies] + etcd-client = "0.9.0" + regex = "1.4.5" + serde = { version = "1.0", features = ["derive"] } + serde_json = "1" + serde_with = "2.0" + once_cell = "1.13.0" + + utils = { path = "../utils" } + workspace_hack = { version = "0.1", path = "../../workspace_hack" } + tokio = "1" + tracing = "0.1" + thiserror = "1" diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs new file mode 100644 index 0000000000..8f698977a9 --- /dev/null +++ b/libs/etcd_broker/src/lib.rs @@ -0,0 +1,209 @@ +//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). +//! 
Intended to connect services to each other, not to store their data. + +/// All broker keys, that are used when dealing with etcd. +pub mod subscription_key; +/// All broker values, possible to use when dealing with etcd. +pub mod subscription_value; + +use std::str::FromStr; + +use serde::de::DeserializeOwned; + +use subscription_key::SubscriptionKey; +use tokio::{sync::mpsc, task::JoinHandle}; +use tracing::*; + +use crate::subscription_key::SubscriptionFullKey; + +pub use etcd_client::*; + +/// Default value to use for prefixing to all etcd keys with. +/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. +pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; + +/// A way to control the data retrieval from a certain subscription. +pub struct BrokerSubscription { + /// An unbounded channel to fetch the relevant etcd updates from. + pub value_updates: mpsc::UnboundedReceiver>, + key: SubscriptionKey, + /// A subscription task handle, to allow waiting on it for the task to complete. + /// Both the updates channel and the handle require `&mut`, so it's better to keep + /// both `pub` to allow using both in the same structures without borrow checker complaining. + pub watcher_handle: JoinHandle>, + watcher: Watcher, +} + +impl BrokerSubscription { + /// Cancels the subscription, stopping the data poller and waiting for it to shut down. + pub async fn cancel(mut self) -> Result<(), BrokerError> { + self.watcher.cancel().await.map_err(|e| { + BrokerError::EtcdClient( + e, + format!("Failed to cancel broker subscription, kind: {:?}", self.key), + ) + })?; + match (&mut self.watcher_handle).await { + Ok(res) => res, + Err(e) => { + if e.is_cancelled() { + // don't error on the tasks that are cancelled already + Ok(()) + } else { + Err(BrokerError::InternalError(format!( + "Panicked during broker subscription task, kind: {:?}, error: {e}", + self.key + ))) + } + } + } + } +} + +impl Drop for BrokerSubscription { + fn drop(&mut self) { + // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped, + // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task. + self.watcher_handle.abort(); + } +} + +/// An update from the etcd broker. +pub struct BrokerUpdate { + /// Etcd generation version, the bigger the more actual the data is. + pub etcd_version: i64, + /// Etcd key for the corresponding value, parsed from the broker KV. + pub key: SubscriptionFullKey, + /// Current etcd value, parsed from the broker KV. + pub value: V, +} + +#[derive(Debug, thiserror::Error)] +pub enum BrokerError { + #[error("Etcd client error: {0}. Context: {1}")] + EtcdClient(etcd_client::Error, String), + #[error("Error during parsing etcd key: {0}")] + KeyNotParsed(String), + #[error("Internal error: {0}")] + InternalError(String), +} + +/// Creates a background task to poll etcd for timeline updates from safekeepers. +/// Stops and returns `Err` on any error during etcd communication. +/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, +/// exiting normally in such cases. +/// Etcd values are parsed as json fukes into a type, specified in the generic patameter. 
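+///
+/// A rough usage sketch (editor's illustration; assumes a reachable etcd endpoint and uses the
+/// `SkTimelineInfo` value type from `subscription_value`), run inside an `async` context that
+/// returns `anyhow::Result<()>`:
+///
+/// ```ignore
+/// let mut client = Client::connect(["http://127.0.0.1:2379"], None).await?;
+/// let key = SubscriptionKey::sk_timeline_info(
+///     DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
+///     ttid, // a `TenantTimelineId` of interest
+/// );
+/// let mut subscription = subscribe_for_json_values::<SkTimelineInfo>(&mut client, key).await?;
+/// while let Some(update) = subscription.value_updates.recv().await {
+///     // `update.key` is the parsed `SubscriptionFullKey`, `update.value` the decoded payload.
+/// }
+/// subscription.cancel().await?;
+/// ```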
+pub async fn subscribe_for_json_values( + client: &mut Client, + key: SubscriptionKey, +) -> Result, BrokerError> +where + V: DeserializeOwned + Send + 'static, +{ + subscribe_for_values(client, key, |_, value_str| { + match serde_json::from_str::(value_str) { + Ok(value) => Some(value), + Err(e) => { + error!("Failed to parse value str '{value_str}': {e}"); + None + } + } + }) + .await +} + +/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string. +pub async fn subscribe_for_values( + client: &mut Client, + key: SubscriptionKey, + value_parser: P, +) -> Result, BrokerError> +where + V: Send + 'static, + P: Fn(SubscriptionFullKey, &str) -> Option + Send + 'static, +{ + info!("Subscribing to broker value updates, key: {key:?}"); + let subscription_key = key.clone(); + + let (watcher, mut stream) = client + .watch(key.watch_key(), Some(WatchOptions::new().with_prefix())) + .await + .map_err(|e| { + BrokerError::EtcdClient( + e, + format!("Failed to init the watch for subscription {key:?}"), + ) + })?; + + let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel(); + let watcher_handle = tokio::spawn(async move { + while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( + "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind + )))? { + if resp.canceled() { + info!("Watch for timeline updates subscription was canceled, exiting"); + break; + } + + let events = resp.events(); + debug!("Processing {} events", events.len()); + + for event in events { + if EventType::Put == event.event_type() { + if let Some(new_etcd_kv) = event.kv() { + match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { + Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate { + etcd_version: new_etcd_kv.version(), + key, + value, + }) { + info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); + break; + }, + Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), + Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), + Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), + }; + } + } + } + } + + Ok(()) + }.instrument(info_span!("etcd_broker"))); + + Ok(BrokerSubscription { + key: subscription_key, + value_updates: value_updates_receiver, + watcher_handle, + watcher, + }) +} + +fn parse_etcd_kv( + kv: &KeyValue, + value_parser: &P, + cluster_prefix: &str, +) -> Result, BrokerError> +where + P: Fn(SubscriptionFullKey, &str) -> Option, +{ + let key_str = kv.key_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) + })?; + let value_str = kv.value_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) + })?; + + if !key_str.starts_with(cluster_prefix) { + return Err(BrokerError::KeyNotParsed(format!( + "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}" + ))); + } + + let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| { + BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}")) + })?; + + Ok(value_parser(key, value_str).map(|value| (key, value))) +} diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs new file mode 100644 index 0000000000..a11d2ab106 --- /dev/null +++ 
b/libs/etcd_broker/src/subscription_key.rs @@ -0,0 +1,310 @@ +//! Etcd broker keys, used in the project and shared between instances. +//! The keys are split into two categories: +//! +//! * [`SubscriptionFullKey`] full key format: `/////` +//! Always returned from etcd in this form, always start with the user key provided. +//! +//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available. +//! Full key always starts with the user input one, due to etcd subscription properties. + +use std::{fmt::Display, str::FromStr}; + +use once_cell::sync::Lazy; +use regex::{Captures, Regex}; +use utils::id::{NodeId, TenantId, TenantTimelineId}; + +/// The subscription kind to the timeline updates from safekeeper. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SubscriptionKey { + /// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups. + pub cluster_prefix: String, + /// The subscription kind. + pub kind: SubscriptionKind, +} + +/// All currently possible key kinds of a etcd broker subscription. +/// Etcd works so, that every key that starts with the subbscription key given is considered matching and +/// returned as part of the subscrption. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SubscriptionKind { + /// Get every update in etcd. + All, + /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. + TenantTimelines(TenantId), + /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. + Timeline(TenantTimelineId), + /// Get etcd timeline updates, specific to a certain node kind. + Node(TenantTimelineId, NodeKind), + /// Get etcd timeline updates for a certain operation on specific nodes. + Operation(TenantTimelineId, NodeKind, OperationKind), +} + +/// All kinds of nodes, able to write into etcd. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NodeKind { + Safekeeper, + Pageserver, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum OperationKind { + Safekeeper(SkOperationKind), +} + +/// Current operations, running inside the safekeeper node. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SkOperationKind { + TimelineInfo, + WalBackup, +} + +static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { + Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$") + .expect("wrong subscription full etcd key regex") +}); + +/// Full key, received from etcd during any of the component's work. +/// No other etcd keys are considered during system's work. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SubscriptionFullKey { + pub id: TenantTimelineId, + pub node_kind: NodeKind, + pub operation: OperationKind, + pub node_id: NodeId, +} + +impl SubscriptionKey { + /// Subscribes for all etcd updates. + pub fn all(cluster_prefix: String) -> Self { + SubscriptionKey { + cluster_prefix, + kind: SubscriptionKind::All, + } + } + + /// Subscribes to a given timeline info updates from safekeepers. + pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation( + timeline, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::TimelineInfo), + ), + } + } + + /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. 
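+    ///
+    /// For example (a sketch with hypothetical bindings: `prefix` is a cluster prefix `String`,
+    /// `ttid` a `TenantTimelineId`), subscribing to safekeeper WAL backup updates for one timeline:
+    /// `SubscriptionKey::operation(prefix, ttid, NodeKind::Safekeeper,
+    /// OperationKind::Safekeeper(SkOperationKind::WalBackup))`.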
+ pub fn operation( + cluster_prefix: String, + timeline: TenantTimelineId, + node_kind: NodeKind, + operation: OperationKind, + ) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation(timeline, node_kind, operation), + } + } + + /// Etcd key to use for watching a certain timeline updates from safekeepers. + pub fn watch_key(&self) -> String { + let cluster_prefix = &self.cluster_prefix; + match self.kind { + SubscriptionKind::All => cluster_prefix.to_string(), + SubscriptionKind::TenantTimelines(tenant_id) => { + format!("{cluster_prefix}/{tenant_id}") + } + SubscriptionKind::Timeline(id) => { + format!("{cluster_prefix}/{id}") + } + SubscriptionKind::Node(id, node_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}") + } + SubscriptionKind::Operation(id, node_kind, operation_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}") + } + } + } +} + +impl Display for OperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + OperationKind::Safekeeper(o) => o.fmt(f), + } + } +} + +impl FromStr for OperationKind { + type Err = String; + + fn from_str(operation_kind_str: &str) -> Result { + match operation_kind_str { + "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)), + "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)), + _ => Err(format!("Unknown operation kind: {operation_kind_str}")), + } + } +} + +impl Display for SubscriptionFullKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + id, + node_kind, + operation, + node_id, + } = self; + write!(f, "{id}/{node_kind}/{operation}/{node_id}") + } +} + +impl FromStr for SubscriptionFullKey { + type Err = String; + + fn from_str(subscription_kind_str: &str) -> Result { + let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) { + Some(captures) => captures, + None => { + return Err(format!( + "Subscription kind str does not match a subscription full key regex {}", + SUBSCRIPTION_FULL_KEY_REGEX.as_str() + )); + } + }; + + Ok(Self { + id: TenantTimelineId::new( + parse_capture(&key_captures, 1)?, + parse_capture(&key_captures, 2)?, + ), + node_kind: parse_capture(&key_captures, 3)?, + operation: parse_capture(&key_captures, 4)?, + node_id: NodeId(parse_capture(&key_captures, 5)?), + }) + } +} + +fn parse_capture(caps: &Captures, index: usize) -> Result +where + T: FromStr, + ::Err: Display, +{ + let capture_match = caps + .get(index) + .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
+ .as_str(); + capture_match.parse().map_err(|e| { + format!( + "Failed to parse {} from {capture_match}: {e}", + std::any::type_name::() + ) + }) +} + +impl Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Safekeeper => write!(f, "safekeeper"), + Self::Pageserver => write!(f, "pageserver"), + } + } +} + +impl FromStr for NodeKind { + type Err = String; + + fn from_str(node_kind_str: &str) -> Result { + match node_kind_str { + "safekeeper" => Ok(Self::Safekeeper), + "pageserver" => Ok(Self::Pageserver), + _ => Err(format!("Invalid node kind: {node_kind_str}")), + } + } +} + +impl Display for SkOperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::TimelineInfo => write!(f, "timeline_info"), + Self::WalBackup => write!(f, "wal_backup"), + } + } +} + +impl FromStr for SkOperationKind { + type Err = String; + + fn from_str(operation_str: &str) -> Result { + match operation_str { + "timeline_info" => Ok(Self::TimelineInfo), + "wal_backup" => Ok(Self::WalBackup), + _ => Err(format!("Invalid operation: {operation_str}")), + } + } +} + +#[cfg(test)] +mod tests { + use utils::id::TimelineId; + + use super::*; + + #[test] + fn full_cluster_key_parsing() { + let prefix = "neon"; + let node_kind = NodeKind::Safekeeper; + let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let id = TenantTimelineId::new(tenant_id, timeline_id); + let node_id = NodeId(1); + + let timeline_subscription_keys = [ + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::All, + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::TenantTimelines(tenant_id), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Timeline(id), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Node(id, node_kind), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Operation(id, node_kind, operation_kind), + }, + ]; + + let full_key_string = format!( + "{}/{node_id}", + timeline_subscription_keys.last().unwrap().watch_key() + ); + + for key in timeline_subscription_keys { + assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match"); + } + + let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| { + panic!("Failed to parse {full_key_string} as a subscription full key: {e}") + }); + + assert_eq!( + full_key, + SubscriptionFullKey { + id, + node_kind, + operation: operation_kind, + node_id + } + ) + } +} diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs new file mode 100644 index 0000000000..60a5411926 --- /dev/null +++ b/libs/etcd_broker/src/subscription_value.rs @@ -0,0 +1,38 @@ +//! Module for the values to put into etcd. + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::lsn::Lsn; + +/// Data about safekeeper's timeline. Fields made optional for easy migrations. +#[serde_as] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. 
+ #[serde_as(as = "Option")] + #[serde(default)] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + #[serde(default)] + pub commit_lsn: Option, + /// LSN up to which safekeeper has backed WAL. + #[serde_as(as = "Option")] + #[serde(default)] + pub backup_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. + #[serde_as(as = "Option")] + #[serde(default)] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub peer_horizon_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub local_start_lsn: Option, + /// A connection string to use for WAL receiving. + #[serde(default)] + pub safekeeper_connstr: Option, +} diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml new file mode 100644 index 0000000000..d0cd46d2a9 --- /dev/null +++ b/libs/metrics/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "metrics" +version = "0.1.0" +edition = "2021" + +[dependencies] +prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency +libc = "0.2" +once_cell = "1.13.0" +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/zenith_metrics/src/lib.rs b/libs/metrics/src/lib.rs similarity index 57% rename from zenith_metrics/src/lib.rs rename to libs/metrics/src/lib.rs index 8756a078c3..880ab0e83c 100644 --- a/zenith_metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -2,8 +2,11 @@ //! make sure that we use the same dep version everywhere. //! Otherwise, we might not see all metrics registered via //! a default registry. -use lazy_static::lazy_static; -use once_cell::race::OnceBox; +use once_cell::sync::Lazy; +use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec}; +pub use prometheus::opts; +pub use prometheus::register; +pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; @@ -14,70 +17,76 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; +use prometheus::{Registry, Result}; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; +pub type UIntGauge = GenericGauge; +pub type UIntGaugeVec = GenericGaugeVec; + +#[macro_export] +macro_rules! register_uint_gauge_vec { + ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap(); + $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec) + }}; +} + +/// Special internal registry, to collect metrics independently from the default registry. +/// Was introduced to fix deadlock with lazy registration of metrics in the default registry. +static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); + +/// Register a collector in the internal registry. MUST be called before the first call to `gather()`. +/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector +/// while holding the lock. +pub fn register_internal(c: Box) -> Result<()> { + INTERNAL_REGISTRY.register(c) +} + /// Gathers all Prometheus metrics and records the I/O stats just before that. 
/// /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. pub fn gather() -> Vec { update_rusage_metrics(); - prometheus::gather() + let mut mfs = prometheus::gather(); + let mut internal_mfs = INTERNAL_REGISTRY.gather(); + mfs.append(&mut internal_mfs); + mfs } -static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new(); - -/// Sets a prefix which will be used for all common metrics, typically a service -/// name like 'pageserver'. Should be executed exactly once in the beginning of -/// any executable which uses common metrics. -pub fn set_common_metrics_prefix(prefix: &'static str) { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - COMMON_METRICS_PREFIX - .set(prefix.into()) - .unwrap_or_else(|_| { - eprintln!( - "set_common_metrics_prefix() was called second time with '{}', exiting", - prefix - ); - std::process::exit(1); - }); -} - -/// Prepends a prefix to a common metric name so they are distinguished between -/// different services, see -/// A call to set_common_metrics_prefix() is necessary prior to calling this. -pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String { - // Not unwrap() because metrics may be initialized after multiple threads have been started. - format!( - "{}_{}", - COMMON_METRICS_PREFIX.get().unwrap_or_else(|| { - eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting"); - std::process::exit(1); - }), - unprefixed_metric_name - ) -} - -lazy_static! { - static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( - new_common_metric_name("disk_io_bytes"), +static DISK_IO_BYTES: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "libmetrics_disk_io_bytes_total", "Bytes written and read from disk, grouped by the operation (read|write)", &["io_operation"] ) - .expect("Failed to register disk i/o bytes int gauge vec"); - static ref MAXRSS_KB: IntGauge = register_int_gauge!( - new_common_metric_name("maxrss_kb"), + .expect("Failed to register disk i/o bytes int gauge vec") +}); + +static MAXRSS_KB: Lazy = Lazy::new(|| { + register_int_gauge!( + "libmetrics_maxrss_kb", "Memory usage (Maximum Resident Set Size)" ) - .expect("Failed to register maxrss_kb int gauge"); -} + .expect("Failed to register maxrss_kb int gauge") +}); pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub fn set_build_info_metric(revision: &str) { + let metric = register_int_gauge_vec!( + "libmetrics_build_info", + "Build/version information", + &["revision"] + ) + .expect("Failed to register build info metric"); + metric.with_label_values(&[revision]).set(1); +} + // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. // An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned. 
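The switch from `lazy_static` to `once_cell::sync::Lazy` and the new internal registry change how callers define and expose metrics. Below is a minimal sketch of the intended usage (an editor's illustration, not part of this diff: the metric name and label are hypothetical, and the calling crate is assumed to depend on `once_cell` as well as this `metrics` crate):

```rust
use metrics::{register_int_gauge_vec, Encoder, IntGaugeVec, TextEncoder};
use once_cell::sync::Lazy;

// Replaces the old `lazy_static!` block: the gauge is registered in the
// default registry the first time the static is touched.
static EXAMPLE_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_component_gauge", // hypothetical metric name
        "Example gauge, for illustration only",
        &["tenant"]
    )
    .expect("failed to register example gauge")
});

fn main() {
    // Any custom `Collector` must go through the internal registry *before*
    // the first `gather()` call, e.g.:
    //     metrics::register_internal(Box::new(my_collector)).unwrap();

    EXAMPLE_GAUGE.with_label_values(&["some-tenant"]).set(1);
    metrics::set_build_info_metric("0123abcd"); // libmetrics_build_info{revision="0123abcd"} = 1

    // `gather()` merges the default and internal registries and records
    // process I/O stats just before collecting.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&metrics::gather(), &mut buf)
        .expect("failed to encode metrics");
    println!("{}", String::from_utf8(buf).unwrap());
}
```

The constraint carried over from the comments above is the ordering: `register_internal` has to run before the first `gather()`, since registering a collector lazily from inside a collection pass is what caused the deadlock the internal registry was introduced to avoid.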
diff --git a/zenith_metrics/src/wrappers.rs b/libs/metrics/src/wrappers.rs similarity index 92% rename from zenith_metrics/src/wrappers.rs rename to libs/metrics/src/wrappers.rs index 48202bc15e..1bf1ea0753 100644 --- a/zenith_metrics/src/wrappers.rs +++ b/libs/metrics/src/wrappers.rs @@ -8,15 +8,15 @@ use std::io::{Read, Result, Write}; /// /// ``` /// # use std::io::{Result, Read}; -/// # use zenith_metrics::{register_int_counter, IntCounter}; -/// # use zenith_metrics::CountedReader; +/// # use metrics::{register_int_counter, IntCounter}; +/// # use metrics::CountedReader; +/// # use once_cell::sync::Lazy; /// # -/// # lazy_static::lazy_static! { -/// # static ref INT_COUNTER: IntCounter = register_int_counter!( +/// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" -/// # ).unwrap(); -/// # } +/// # ).unwrap() +/// # }); /// # /// fn do_some_reads(stream: impl Read, count: usize) -> Result> { /// let mut reader = CountedReader::new(stream, |cnt| { @@ -83,15 +83,15 @@ impl Read for CountedReader<'_, T> { /// /// ``` /// # use std::io::{Result, Write}; -/// # use zenith_metrics::{register_int_counter, IntCounter}; -/// # use zenith_metrics::CountedWriter; +/// # use metrics::{register_int_counter, IntCounter}; +/// # use metrics::CountedWriter; +/// # use once_cell::sync::Lazy; /// # -/// # lazy_static::lazy_static! { -/// # static ref INT_COUNTER: IntCounter = register_int_counter!( +/// # static INT_COUNTER: Lazy = Lazy::new( || { register_int_counter!( /// # "int_counter", /// # "let's count something!" -/// # ).unwrap(); -/// # } +/// # ).unwrap() +/// # }); /// # /// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> { /// let mut writer = CountedWriter::new(stream, |cnt| { diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml new file mode 100644 index 0000000000..2102ae5373 --- /dev/null +++ b/libs/pageserver_api/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "pageserver_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "2.0" +const_format = "0.2.21" +anyhow = { version = "1.0", features = ["backtrace"] } +bytes = "1.0.1" +byteorder = "1.4.3" + +utils = { path = "../utils" } +postgres_ffi = { path = "../postgres_ffi" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs new file mode 100644 index 0000000000..4890d54f36 --- /dev/null +++ b/libs/pageserver_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; +pub mod reltag; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs new file mode 100644 index 0000000000..af9be2d456 --- /dev/null +++ b/libs/pageserver_api/src/models.rs @@ -0,0 +1,488 @@ +use std::num::NonZeroU64; + +use byteorder::{BigEndian, ReadBytesExt}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + lsn::Lsn, +}; + +use crate::reltag::RelTag; +use anyhow::bail; +use bytes::{BufMut, Bytes, BytesMut}; + +/// A state of a tenant in pageserver's memory. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TenantState { + /// Tenant is fully operational, its background jobs might be running or not. + Active { background_jobs_running: bool }, + /// A tenant is recognized by pageserver, but not yet ready to operate: + /// e.g. not present locally and being downloaded or being read into memory from the file system. + Paused, + /// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} + +/// A state of a timeline in pageserver's memory. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum TimelineState { + /// Timeline is fully operational, its background jobs are running. + Active, + /// A timeline is recognized by pageserver, but not yet ready to operate. + /// The status indicates, that the timeline could eventually go back to Active automatically: + /// for example, if the owning tenant goes back to Active again. + Suspended, + /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to + /// automatically become Active after certain events: only a management call can change this status. + Paused, + /// A timeline is recognized by the pageserver, but no longer used for any operations, as failed to get activated. + Broken, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct TimelineCreateRequest { + #[serde(default)] + #[serde_as(as = "Option")] + pub new_timeline_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde(default)] + #[serde_as(as = "Option")] + pub ancestor_start_lsn: Option, + pub pg_version: Option, +} + +#[serde_as] +#[derive(Serialize, Deserialize, Default)] +pub struct TenantCreateRequest { + #[serde(default)] + #[serde_as(as = "Option")] + pub new_tenant_id: Option, + pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, + pub compaction_target_size: Option, + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, + pub gc_period: Option, + pub image_creation_threshold: Option, + pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId); + +#[derive(Serialize)] +pub struct StatusResponse { + pub id: NodeId, +} + +impl TenantCreateRequest { + pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + TenantCreateRequest { + new_tenant_id, + ..Default::default() + } + } +} + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct TenantConfigRequest { + pub tenant_id: TenantId, + #[serde(default)] + #[serde_as(as = "Option")] + pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, + pub compaction_target_size: Option, + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, + pub gc_period: Option, + pub image_creation_threshold: Option, + pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, +} + +impl TenantConfigRequest { + pub fn new(tenant_id: TenantId) -> TenantConfigRequest { + TenantConfigRequest { + tenant_id, + checkpoint_distance: None, + checkpoint_timeout: None, + 
compaction_target_size: None, + compaction_period: None, + compaction_threshold: None, + gc_horizon: None, + gc_period: None, + image_creation_threshold: None, + pitr_interval: None, + walreceiver_connect_timeout: None, + lagging_wal_timeout: None, + max_lsn_wal_lag: None, + trace_read_requests: None, + } + } +} + +#[serde_as] +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantInfo { + #[serde_as(as = "DisplayFromStr")] + pub id: TenantId, + pub state: TenantState, + pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint + pub has_in_progress_downloads: Option, +} + +/// This represents the output of the "timeline_detail" and "timeline_list" API calls. +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TimelineInfo { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + pub ancestor_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + pub last_record_lsn: Lsn, + #[serde_as(as = "Option")] + pub prev_record_lsn: Option, + #[serde_as(as = "DisplayFromStr")] + pub latest_gc_cutoff_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + pub disk_consistent_lsn: Lsn, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded + pub current_logical_size_non_incremental: Option, + pub current_physical_size_non_incremental: Option, + + pub wal_source_connstr: Option, + #[serde_as(as = "Option")] + pub last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + pub last_received_msg_ts: Option, + pub pg_version: u32, + + #[serde_as(as = "Option")] + pub remote_consistent_lsn: Option, + pub awaits_download: bool, + + pub state: TimelineState, + + // Some of the above fields are duplicated in 'local' and 'remote', for backwards- + // compatility with older clients. + pub local: LocalTimelineInfo, + pub remote: RemoteTimelineInfo, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct LocalTimelineInfo { + #[serde_as(as = "Option")] + pub ancestor_timeline_id: Option, + #[serde_as(as = "Option")] + pub ancestor_lsn: Option, + pub current_logical_size: Option, // is None when timeline is Unloaded + pub current_physical_size: Option, // is None when timeline is Unloaded +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct RemoteTimelineInfo { + #[serde_as(as = "Option")] + pub remote_consistent_lsn: Option, +} + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. 
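+    /// For example, `"sleep(2000)"` makes the code sleep for two seconds when it hits the
+    /// fail point; see the `fail` crate documentation for the full action syntax.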
+ pub actions: String, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineGcRequest { + pub gc_horizon: Option, +} + +// Wrapped in libpq CopyData +#[derive(PartialEq, Eq)] +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), +} + +// Wrapped in libpq CopyData +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), +} + +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamExistsRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamNblocksRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamGetPageRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamDbSizeRequest { + pub latest: bool, + pub lsn: Lsn, + pub dbnode: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub db_size: i64, +} + +impl PagestreamFeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(req) => { + bytes.put_u8(0); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::Nblocks(req) => { + bytes.put_u8(1); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + } + + Self::GetPage(req) => { + bytes.put_u8(2); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.rel.spcnode); + bytes.put_u32(req.rel.dbnode); + bytes.put_u32(req.rel.relnode); + bytes.put_u8(req.rel.forknum); + bytes.put_u32(req.blkno); + } + + Self::DbSize(req) => { + bytes.put_u8(3); + bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u64(req.lsn.0); + bytes.put_u32(req.dbnode); + } + } + + bytes.into() + } + + pub fn parse(body: &mut R) -> anyhow::Result { + // TODO these gets can fail + + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. + let msg_tag = body.read_u8()?; + match msg_tag { + 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })), + 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: body.read_u8()? 
!= 0, + lsn: Lsn::from(body.read_u64::()?), + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })), + 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.read_u8()? != 0, + lsn: Lsn::from(body.read_u64::()?), + dbnode: body.read_u32::()?, + })), + _ => bail!("unknown smgr message tag: {:?}", msg_tag), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(resp) => { + bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put(&resp.page[..]); + } + + Self::Error(resp) => { + bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } + } + + bytes.into() + } +} + +#[cfg(test)] +mod tests { + use bytes::Buf; + + use super::*; + + #[test] + fn test_pagestream() { + // Test serialization/deserialization of PagestreamFeMessage + let messages = vec![ + PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: true, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: false, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + }), + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: true, + lsn: Lsn(4), + rel: RelTag { + forknum: 1, + spcnode: 2, + dbnode: 3, + relnode: 4, + }, + blkno: 7, + }), + PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: true, + lsn: Lsn(4), + dbnode: 7, + }), + ]; + for msg in messages { + let bytes = msg.serialize(); + let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + assert!(msg == reconstructed); + } + } +} diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs new file mode 100644 index 0000000000..43d38bd986 --- /dev/null +++ b/libs/pageserver_api/src/reltag.rs @@ -0,0 +1,128 @@ +use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; +use std::fmt; + +use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; +use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::Oid; + +/// +/// Relation data file segment id throughout the Postgres cluster. +/// +/// Every data file in Postgres is uniquely identified by 4 numbers: +/// - relation id / node (`relnode`) +/// - database id (`dbnode`) +/// - tablespace id (`spcnode`), in short this is a unique id of a separate +/// directory to store data files. +/// - forknumber (`forknum`) is used to split different kinds of data of the same relation +/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`). 
+/// +/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value +/// are used for the same purpose. +/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). +/// +// FIXME: should move 'forknum' as last field to keep this consistent with Postgres. +// Then we could replace the custo Ord and PartialOrd implementations below with +// deriving them. +#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] +pub struct RelTag { + pub forknum: u8, + pub spcnode: Oid, + pub dbnode: Oid, + pub relnode: Oid, +} + +impl PartialOrd for RelTag { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for RelTag { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp = self.spcnode.cmp(&other.spcnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.dbnode.cmp(&other.dbnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.relnode.cmp(&other.relnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.forknum.cmp(&other.forknum); + + cmp + } +} + +/// Display RelTag in the same format that's used in most PostgreSQL debug messages: +/// +/// //[_fsm|_vm|_init] +/// +impl fmt::Display for RelTag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(forkname) = forknumber_to_name(self.forknum) { + write!( + f, + "{}/{}/{}_{}", + self.spcnode, self.dbnode, self.relnode, forkname + ) + } else { + write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode) + } + } +} + +impl RelTag { + pub fn to_segfile_name(&self, segno: u32) -> String { + let mut name = if self.spcnode == GLOBALTABLESPACE_OID { + "global/".to_string() + } else { + format!("base/{}/", self.dbnode) + }; + + name += &self.relnode.to_string(); + + if let Some(fork_name) = forknumber_to_name(self.forknum) { + name += "_"; + name += fork_name; + } + + if segno != 0 { + name += "."; + name += &segno.to_string(); + } + + name + } +} + +/// +/// Non-relation transaction status files (clog (a.k.a. pg_xact) and +/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, +/// hence the name. +/// +/// These files are global for a postgres instance. +/// +/// These files are divided into segments, which are divided into +/// pages of the same BLCKSZ as used for relation files. 
+/// +#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +pub enum SlruKind { + Clog, + MultiXactMembers, + MultiXactOffsets, +} + +impl SlruKind { + pub fn to_str(&self) -> &'static str { + match self { + Self::Clog => "pg_xact", + Self::MultiXactMembers => "pg_multixact/members", + Self::MultiXactOffsets => "pg_multixact/offsets", + } + } +} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml new file mode 100644 index 0000000000..01ff6ab60e --- /dev/null +++ b/libs/postgres_ffi/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "postgres_ffi" +version = "0.1.0" +edition = "2021" + +[dependencies] +rand = "0.8.3" +regex = "1.4.5" +bytes = "1.0.1" +byteorder = "1.4.3" +anyhow = "1.0" +crc32c = "0.6.0" +hex = "0.4.3" +once_cell = "1.13.0" +log = "0.4.14" +memoffset = "0.7" +thiserror = "1.0" +serde = { version = "1.0", features = ["derive"] } +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] +env_logger = "0.9" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +wal_craft = { path = "wal_craft" } + +[build-dependencies] +anyhow = "1.0" +bindgen = "0.61" diff --git a/postgres_ffi/README b/libs/postgres_ffi/README.md similarity index 73% rename from postgres_ffi/README rename to libs/postgres_ffi/README.md index 5656314fd7..de046eb3da 100644 --- a/postgres_ffi/README +++ b/libs/postgres_ffi/README.md @@ -9,9 +9,11 @@ should be auto-generated too, but that's a TODO. The PostgreSQL on-disk file format is not portable across different CPU architectures and operating systems. It is also subject to change -in each major PostgreSQL version. Currently, this module is based on -PostgreSQL v14, but in the future we will probably need a separate -copy for each PostgreSQL version. +in each major PostgreSQL version. Currently, this module supports +PostgreSQL v14 and v15: bindings and code that depends on them are version-specific. +This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15` +Version independend code is explicitly exported into shared `postgres_ffi`. + TODO: Currently, there is also some code that deals with WAL records in pageserver/src/waldecoder.rs. That should be moved into this diff --git a/postgres_ffi/pg_control_ffi.h b/libs/postgres_ffi/bindgen_deps.h similarity index 100% rename from postgres_ffi/pg_control_ffi.h rename to libs/postgres_ffi/bindgen_deps.h diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs new file mode 100644 index 0000000000..25ff398bbd --- /dev/null +++ b/libs/postgres_ffi/build.rs @@ -0,0 +1,154 @@ +extern crate bindgen; + +use std::env; +use std::path::PathBuf; +use std::process::Command; + +use anyhow::{anyhow, Context}; +use bindgen::callbacks::ParseCallbacks; + +#[derive(Debug)] +struct PostgresFfiCallbacks; + +impl ParseCallbacks for PostgresFfiCallbacks { + fn include_file(&self, filename: &str) { + // This does the equivalent of passing bindgen::CargoCallbacks + // to the builder .parse_callbacks() method. + let cargo_callbacks = bindgen::CargoCallbacks; + cargo_callbacks.include_file(filename) + } + + // Add any custom #[derive] attributes to the data structures that bindgen + // creates. + fn add_derives(&self, name: &str) -> Vec { + // This is the list of data structures that we want to serialize/deserialize. 
+ let serde_list = [ + "XLogRecord", + "XLogPageHeaderData", + "XLogLongPageHeaderData", + "CheckPoint", + "FullTransactionId", + "ControlFileData", + ]; + + if serde_list.contains(&name) { + vec![ + "Default".into(), // Default allows us to easily fill the padding fields with 0. + "Serialize".into(), + "Deserialize".into(), + ] + } else { + vec![] + } + } +} + +fn main() -> anyhow::Result<()> { + // Tell cargo to invalidate the built crate whenever the wrapper changes + println!("cargo:rerun-if-changed=bindgen_deps.h"); + + // Finding the location of C headers for the Postgres server: + // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/pg_install` + // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `/pg_install/{PG_MAJORVERSION}/include/postgresql/server` + let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { + postgres_install_dir.into() + } else { + PathBuf::from("pg_install") + }; + + for pg_version in &["v14", "v15"] { + let mut pg_install_dir_versioned = pg_install_dir.join(pg_version); + if pg_install_dir_versioned.is_relative() { + let cwd = env::current_dir().context("Failed to get current_dir")?; + pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned); + } + + let pg_config_bin = pg_install_dir_versioned + .join(pg_version) + .join("bin") + .join("pg_config"); + let inc_server_path: String = if pg_config_bin.exists() { + let output = Command::new(pg_config_bin) + .arg("--includedir-server") + .output() + .context("failed to execute `pg_config --includedir-server`")?; + + if !output.status.success() { + panic!("`pg_config --includedir-server` failed") + } + + String::from_utf8(output.stdout) + .context("pg_config output is not UTF-8")? + .trim_end() + .into() + } else { + let server_path = pg_install_dir_versioned + .join("include") + .join("postgresql") + .join("server") + .into_os_string(); + server_path + .into_string() + .map_err(|s| anyhow!("Bad postgres server path {s:?}"))? + }; + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // + // All the needed PostgreSQL headers are included from 'bindgen_deps.h' + // + .header("bindgen_deps.h") + // + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + // + .parse_callbacks(Box::new(PostgresFfiCallbacks)) + // + // These are the types and constants that we want to generate bindings for + // + .allowlist_type("BlockNumber") + .allowlist_type("OffsetNumber") + .allowlist_type("XLogRecPtr") + .allowlist_type("XLogSegNo") + .allowlist_type("TimeLineID") + .allowlist_type("TimestampTz") + .allowlist_type("MultiXactId") + .allowlist_type("MultiXactOffset") + .allowlist_type("MultiXactStatus") + .allowlist_type("ControlFileData") + .allowlist_type("CheckPoint") + .allowlist_type("FullTransactionId") + .allowlist_type("XLogRecord") + .allowlist_type("XLogPageHeaderData") + .allowlist_type("XLogLongPageHeaderData") + .allowlist_var("XLOG_PAGE_MAGIC") + .allowlist_var("PG_CONTROL_FILE_SIZE") + .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") + .allowlist_type("PageHeaderData") + .allowlist_type("DBState") + // Because structs are used for serialization, tell bindgen to emit + // explicit padding fields. + .explicit_padding(true) + // + .clang_arg(format!("-I{inc_server_path}")) + // + // Finish the builder and generate the bindings. 
+ // + .generate() + .context("Unable to generate bindings")?; + + // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file. + let out_path: PathBuf = env::var("OUT_DIR") + .context("Couldn't read OUT_DIR environment variable var")? + .into(); + let filename = format!("bindings_{pg_version}.rs"); + + bindings + .write_to_file(out_path.join(filename)) + .context("Couldn't write bindings")?; + } + + Ok(()) +} diff --git a/postgres_ffi/samples/pg_hba.conf b/libs/postgres_ffi/samples/pg_hba.conf similarity index 100% rename from postgres_ffi/samples/pg_hba.conf rename to libs/postgres_ffi/samples/pg_hba.conf diff --git a/postgres_ffi/src/controlfile_utils.rs b/libs/postgres_ffi/src/controlfile_utils.rs similarity index 96% rename from postgres_ffi/src/controlfile_utils.rs rename to libs/postgres_ffi/src/controlfile_utils.rs index b72c86c71c..0918d15001 100644 --- a/postgres_ffi/src/controlfile_utils.rs +++ b/libs/postgres_ffi/src/controlfile_utils.rs @@ -23,7 +23,7 @@ //! information. You can use PostgreSQL's pg_controldata utility to view its //! contents. //! -use crate::{ControlFileData, PG_CONTROL_FILE_SIZE}; +use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE}; use anyhow::{bail, Result}; use bytes::{Bytes, BytesMut}; @@ -43,7 +43,7 @@ impl ControlFileData { /// Interpret a slice of bytes as a Postgres control file. /// pub fn decode(buf: &[u8]) -> Result { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; // Check that the slice has the expected size. The control file is // padded with zeros up to a 512 byte sector size, so accept a @@ -77,7 +77,7 @@ impl ControlFileData { /// /// The CRC is recomputed to match the contents of the fields. pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; + use utils::bin_ser::LeSer; // Serialize into a new buffer. let b = self.ser().unwrap(); diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs new file mode 100644 index 0000000000..f3dad159be --- /dev/null +++ b/libs/postgres_ffi/src/lib.rs @@ -0,0 +1,237 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +// bindgen creates some unsafe code with no doc comments. +#![allow(clippy::missing_safety_doc)] +// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. +#![allow(clippy::useless_transmute)] +// modules included with the postgres_ffi macro depend on the types of the specific version's +// types, and trigger a too eager lint. +#![allow(clippy::duplicate_mod)] + +use bytes::Bytes; +use utils::bin_ser::SerializeError; +use utils::lsn::Lsn; + +macro_rules! 
postgres_ffi { + ($version:ident) => { + #[path = "."] + pub mod $version { + pub mod bindings { + // bindgen generates bindings for a lot of stuff we don't need + #![allow(dead_code)] + + use serde::{Deserialize, Serialize}; + include!(concat!( + env!("OUT_DIR"), + "/bindings_", + stringify!($version), + ".rs" + )); + + include!(concat!("pg_constants_", stringify!($version), ".rs")); + } + pub mod controlfile_utils; + pub mod nonrelfile_utils; + pub mod waldecoder_handler; + pub mod xlog_utils; + + pub const PG_MAJORVERSION: &str = stringify!($version); + + // Re-export some symbols from bindings + pub use bindings::DBState_DB_SHUTDOWNED; + pub use bindings::{CheckPoint, ControlFileData, XLogRecord}; + } + }; +} + +postgres_ffi!(v14); +postgres_ffi!(v15); + +pub mod pg_constants; +pub mod relfile_utils; + +// Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::{uint32, uint64, Oid}; +pub use v14::bindings::{BlockNumber, OffsetNumber}; +pub use v14::bindings::{MultiXactId, TransactionId}; +pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; + +// Likewise for these, although the assumption that these don't change is a little more iffy. +pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; +pub use v14::bindings::{PageHeaderData, XLogRecord}; +pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; + +pub use v14::bindings::{CheckPoint, ControlFileData}; + +// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and +// --with-segsize=SEGSIZE, but assume the defaults for now. +pub const BLCKSZ: u16 = 8192; +pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); +pub const XLOG_BLCKSZ: usize = 8192; +pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; + +pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; + +// Export some version independent functions that are used outside of this mod +pub use v14::xlog_utils::encode_logical_message; +pub use v14::xlog_utils::get_current_timestamp; +pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::XLogFileName; + +pub use v14::bindings::DBState_DB_SHUTDOWNED; + +pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result { + match version { + 14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0), + 15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0 + || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0), + _ => anyhow::bail!("Unknown version {}", version), + } +} + +pub fn generate_wal_segment( + segno: u64, + system_id: u64, + pg_version: u32, +) -> Result { + match pg_version { + 14 => v14::xlog_utils::generate_wal_segment(segno, system_id), + 15 => v15::xlog_utils::generate_wal_segment(segno, system_id), + _ => Err(SerializeError::BadInput), + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, + pg_version: u32, +) -> anyhow::Result<(Bytes, u64)> { + match pg_version { + 14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + 15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), + _ => anyhow::bail!("Unknown version {}", pg_version), + } +} + +// PG timeline is always 1, changing it doesn't have any useful meaning in Neon. +// +// NOTE: this is not to be confused with Neon timelines; different concept! +// +// It's a shaky assumption, that it's always 1. 
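The `postgres_ffi!` macro above stamps out one module per supported Postgres version, each including its own bindgen output and `pg_constants_vXX.rs`, and functions such as `bkpimage_is_compressed` and `generate_wal_segment` then dispatch on a runtime version number. A reduced sketch of the same pattern; the module contents here are placeholders, not real bindings:

// The macro generates a module per version; a small dispatcher picks between them at runtime.
macro_rules! pg_version_mod {
    ($version:ident, $num:expr) => {
        pub mod $version {
            pub const PG_MAJORVERSION: &str = stringify!($version);
            pub const VERSION_NUM: u32 = $num;
        }
    };
}

pg_version_mod!(v14, 14);
pg_version_mod!(v15, 15);

fn major_version_name(pg_version: u32) -> Option<&'static str> {
    match pg_version {
        v14::VERSION_NUM => Some(v14::PG_MAJORVERSION),
        v15::VERSION_NUM => Some(v15::PG_MAJORVERSION),
        _ => None,
    }
}

fn main() {
    assert_eq!(major_version_name(14), Some("v14"));
    assert_eq!(major_version_name(15), Some("v15"));
    assert_eq!(major_version_name(13), None);
    println!("dispatched between per-version modules");
}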
We might import a +// PostgreSQL data directory that has gone through timeline bumps, +// for example. FIXME later. +pub const PG_TLI: u32 = 1; + +// See TransactionIdIsNormal in transam.h +pub const fn transaction_id_is_normal(id: TransactionId) -> bool { + id > pg_constants::FIRST_NORMAL_TRANSACTION_ID +} + +// See TransactionIdPrecedes in transam.c +pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) -> bool { + /* + * If either ID is a permanent XID then we can just do unsigned + * comparison. If both are normal, do a modulo-2^32 comparison. + */ + + if !(transaction_id_is_normal(id1)) || !transaction_id_is_normal(id2) { + return id1 < id2; + } + + let diff = id1.wrapping_sub(id2) as i32; + diff < 0 +} + +// Check if page is not yet initialized (port of Postgres PageIsInit() macro) +pub fn page_is_new(pg: &[u8]) -> bool { + pg[14] == 0 && pg[15] == 0 // pg_upper == 0 +} + +// ExtractLSN from page header +pub fn page_get_lsn(pg: &[u8]) -> Lsn { + Lsn( + ((u32::from_le_bytes(pg[0..4].try_into().unwrap()) as u64) << 32) + | u32::from_le_bytes(pg[4..8].try_into().unwrap()) as u64, + ) +} + +pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { + pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); + pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); +} + +pub mod waldecoder { + + use crate::{v14, v15}; + use bytes::{Buf, Bytes, BytesMut}; + use std::num::NonZeroU32; + use thiserror::Error; + use utils::lsn::Lsn; + + pub enum State { + WaitingForRecord, + ReassemblingRecord { + recordbuf: BytesMut, + contlen: NonZeroU32, + }, + SkippingEverything { + skip_until_lsn: Lsn, + }, + } + + pub struct WalStreamDecoder { + pub lsn: Lsn, + pub pg_version: u32, + pub inputbuf: BytesMut, + pub state: State, + } + + #[derive(Error, Debug, Clone)] + #[error("{msg} at {lsn}")] + pub struct WalDecodeError { + pub msg: String, + pub lsn: Lsn, + } + + impl WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + WalStreamDecoder { + lsn, + pg_version, + inputbuf: BytesMut::new(), + state: State::WaitingForRecord, + } + } + + // The latest LSN position fed to the decoder. + pub fn available(&self) -> Lsn { + self.lsn + self.inputbuf.remaining() as u64 + } + + pub fn feed_bytes(&mut self, buf: &[u8]) { + self.inputbuf.extend_from_slice(buf); + } + + pub fn poll_decode(&mut self) -> Result, WalDecodeError> { + match self.pg_version { + // This is a trick to support both versions simultaneously. + // See WalStreamDecoderHandler comments. + 14 => { + use self::v14::waldecoder_handler::WalStreamDecoderHandler; + self.poll_decode_internal() + } + 15 => { + use self::v15::waldecoder_handler::WalStreamDecoderHandler; + self.poll_decode_internal() + } + _ => Err(WalDecodeError { + msg: format!("Unknown version {}", self.pg_version), + lsn: self.lsn, + }), + } + } + } +} diff --git a/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs similarity index 96% rename from postgres_ffi/src/nonrelfile_utils.rs rename to libs/postgres_ffi/src/nonrelfile_utils.rs index b92207cd81..01e5554b8a 100644 --- a/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -1,11 +1,12 @@ //! //! Common utilities for dealing with PostgreSQL non-relation files. //! 
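The modulo-2^32 rule used by `transaction_id_precedes` above deserves a worked example: for two normal XIDs, the sign of their wrapping 32-bit difference decides the ordering, so an XID issued just before the 2^32 wraparound still precedes a small XID issued after it. A standalone sketch of only that arithmetic:

// Modulo-2^32 ordering of normal XIDs: compare by the sign of the wrapping difference.
fn precedes_mod_2_32(id1: u32, id2: u32) -> bool {
    (id1.wrapping_sub(id2) as i32) < 0
}

fn main() {
    assert!(precedes_mod_2_32(100, 200));          // ordinary case
    assert!(!precedes_mod_2_32(200, 100));
    assert!(precedes_mod_2_32(u32::MAX - 5, 10));  // wraparound: 0xFFFF_FFFA still precedes 10
    println!("modulo-2^32 XID comparison behaves as expected");
}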
-use crate::{pg_constants, transaction_id_precedes}; +use crate::pg_constants; +use crate::transaction_id_precedes; use bytes::BytesMut; use log::*; -use crate::MultiXactId; +use super::bindings::MultiXactId; pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { trace!( diff --git a/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs similarity index 88% rename from postgres_ffi/src/pg_constants.rs rename to libs/postgres_ffi/src/pg_constants.rs index 76f837cefc..6aaa739a69 100644 --- a/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -1,13 +1,16 @@ //! //! Misc constants, copied from PostgreSQL headers. //! +//! Only place version-independent constants here. +//! //! TODO: These probably should be auto-generated using bindgen, //! rather than copied by hand. Although on the other hand, it's nice //! to have them all here in one place, and have the ability to add //! comments on them. //! -use crate::PageHeaderData; +use crate::BLCKSZ; +use crate::{PageHeaderData, XLogRecord}; // // From pg_tablespace_d.h @@ -15,24 +18,14 @@ use crate::PageHeaderData; pub const DEFAULTTABLESPACE_OID: u32 = 1663; pub const GLOBALTABLESPACE_OID: u32 = 1664; -// -// Fork numbers, from relpath.h -// -pub const MAIN_FORKNUM: u8 = 0; -pub const FSM_FORKNUM: u8 = 1; -pub const VISIBILITYMAP_FORKNUM: u8 = 2; -pub const INIT_FORKNUM: u8 = 3; - // From storage_xlog.h +pub const XLOG_SMGR_CREATE: u8 = 0x10; +pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; + pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; -// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and -// --with-segsize=SEGSIZE, but assume the defaults for now. 
-pub const BLCKSZ: u16 = 8192; -pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32); - // // From bufpage.h // @@ -113,10 +106,8 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; // From pg_control.h and rmgrlist.h pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; -pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub const XLOG_FPI: u8 = 0xB0; -pub const DB_SHUTDOWNED: u32 = 1; // From multixact.h pub const FIRST_MULTIXACT_ID: u32 = 1; @@ -171,14 +162,10 @@ pub const RM_HEAP_ID: u8 = 10; pub const XLR_INFO_MASK: u8 = 0x0F; pub const XLR_RMGR_INFO_MASK: u8 = 0xF0; -// from dbcommands_xlog.h -pub const XLOG_DBASE_CREATE: u8 = 0x00; -pub const XLOG_DBASE_DROP: u8 = 0x10; - pub const XLOG_TBLSPC_CREATE: u8 = 0x00; pub const XLOG_TBLSPC_DROP: u8 = 0x10; -pub const SIZEOF_XLOGRECORD: u32 = 24; +pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::() as u32; // // from xlogrecord.h @@ -199,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ -pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ -pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ /* From transam.h */ pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3; @@ -208,16 +193,10 @@ pub const INVALID_TRANSACTION_ID: u32 = 0; pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000; pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384; -/* FIXME: pageserver should request wal_seg_size from compute node */ -pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024; - -pub const XLOG_BLCKSZ: usize = 8192; pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; -pub const PG_MAJORVERSION: &str = "14"; - // List of subdirectories inside pgdata. 
// Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs new file mode 100644 index 0000000000..810898ee80 --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -0,0 +1,5 @@ +pub const XLOG_DBASE_CREATE: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x10; + +pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */ +pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */ diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs new file mode 100644 index 0000000000..6fa5eb008c --- /dev/null +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -0,0 +1,10 @@ +pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; + +pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; +pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00; +pub const XLOG_DBASE_DROP: u8 = 0x20; + +pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */ +pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */ +pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */ diff --git a/postgres_ffi/src/relfile_utils.rs b/libs/postgres_ffi/src/relfile_utils.rs similarity index 83% rename from postgres_ffi/src/relfile_utils.rs rename to libs/postgres_ffi/src/relfile_utils.rs index 97c8f0afea..1dc9f367ff 100644 --- a/postgres_ffi/src/relfile_utils.rs +++ b/libs/postgres_ffi/src/relfile_utils.rs @@ -1,11 +1,18 @@ //! //! Common utilities for dealing with PostgreSQL relation files. //! -use crate::pg_constants; -use lazy_static::lazy_static; +use once_cell::sync::OnceCell; use regex::Regex; -#[derive(Debug, Clone, thiserror::Error, PartialEq)] +// +// Fork numbers, from relpath.h +// +pub const MAIN_FORKNUM: u8 = 0; +pub const FSM_FORKNUM: u8 = 1; +pub const VISIBILITYMAP_FORKNUM: u8 = 2; +pub const INIT_FORKNUM: u8 = 3; + +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum FilePathError { #[error("invalid relation fork name")] InvalidForkName, @@ -23,10 +30,10 @@ impl From for FilePathError { pub fn forkname_to_number(forkname: Option<&str>) -> Result { match forkname { // "main" is not in filenames, it's implicit if the fork name is not present - None => Ok(pg_constants::MAIN_FORKNUM), - Some("fsm") => Ok(pg_constants::FSM_FORKNUM), - Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM), - Some("init") => Ok(pg_constants::INIT_FORKNUM), + None => Ok(MAIN_FORKNUM), + Some("fsm") => Ok(FSM_FORKNUM), + Some("vm") => Ok(VISIBILITYMAP_FORKNUM), + Some("init") => Ok(INIT_FORKNUM), Some(_) => Err(FilePathError::InvalidForkName), } } @@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result { /// Convert Postgres fork number to the right suffix of the relation data file. pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { match forknum { - pg_constants::MAIN_FORKNUM => None, - pg_constants::FSM_FORKNUM => Some("fsm"), - pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"), - pg_constants::INIT_FORKNUM => Some("init"), + MAIN_FORKNUM => None, + FSM_FORKNUM => Some("fsm"), + VISIBILITYMAP_FORKNUM => Some("vm"), + INIT_FORKNUM => Some("init"), _ => Some("UNKNOWN FORKNUM"), } } @@ -54,11 +61,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> { /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources. 
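`parse_relfilename`, whose body follows, pulls the relation OID, fork number, and segment number out of names like `16384`, `16384_vm`, or `16384_fsm.2`. A dependency-free sketch of the same rule, using the fork numbers defined above (0 = main, 1 = fsm, 2 = vm, 3 = init); error handling is collapsed into `Option` for brevity:

// Simplified, std-only parse of "<relnode>[_<fork>][.<segno>]"; returns (relnode, forknum, segno).
fn parse_relfilename_sketch(fname: &str) -> Option<(u32, u8, u32)> {
    let (rest, segno) = match fname.split_once('.') {
        Some((rest, seg)) => (rest, seg.parse().ok()?),
        None => (fname, 0),
    };
    let (relnode_str, forknum) = match rest.split_once('_') {
        Some((rel, fork)) => {
            let forknum = match fork {
                "fsm" => 1,
                "vm" => 2,
                "init" => 3,
                _ => return None,
            };
            (rel, forknum)
        }
        None => (rest, 0), // the "main" fork is implicit when no suffix is present
    };
    Some((relnode_str.parse().ok()?, forknum, segno))
}

fn main() {
    assert_eq!(parse_relfilename_sketch("16384"), Some((16384, 0, 0)));
    assert_eq!(parse_relfilename_sketch("16384_vm"), Some((16384, 2, 0)));
    assert_eq!(parse_relfilename_sketch("16384_fsm.2"), Some((16384, 1, 2)));
    assert_eq!(parse_relfilename_sketch("not_a_relfile"), None);
    println!("parsed relation file names");
}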
/// pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> { - lazy_static! { - static ref RELFILE_RE: Regex = - Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap(); - } + static RELFILE_RE: OnceCell = OnceCell::new(); + RELFILE_RE.get_or_init(|| { + Regex::new(r"^(?P\d+)(_(?P[a-z]+))?(\.(?P\d+))?$").unwrap() + }); + let caps = RELFILE_RE + .get() + .unwrap() .captures(fname) .ok_or(FilePathError::InvalidFileName)?; diff --git a/libs/postgres_ffi/src/waldecoder_handler.rs b/libs/postgres_ffi/src/waldecoder_handler.rs new file mode 100644 index 0000000000..b4d50375bd --- /dev/null +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -0,0 +1,254 @@ +//! +//! Basic WAL stream decoding. +//! +//! This understands the WAL page and record format, enough to figure out where the WAL record +//! boundaries are, and to reassemble WAL records that cross page boundaries. +//! +//! This functionality is needed by both the pageserver and the safekeepers. The pageserver needs +//! to look deeper into the WAL records to also understand which blocks they modify, the code +//! for that is in pageserver/src/walrecord.rs +//! +use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder}; +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC}; +use super::xlog_utils::*; +use crate::WAL_SEGMENT_SIZE; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use crc32c::*; +use log::*; +use std::cmp::min; +use std::num::NonZeroU32; +use utils::lsn::Lsn; + +pub trait WalStreamDecoderHandler { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>; + fn poll_decode_internal(&mut self) -> Result, WalDecodeError>; + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>; +} + +// +// This is a trick to support several postgres versions simultaneously. +// +// Page decoding code depends on postgres bindings, so it is compiled for each version. +// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits. +// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version. +// Other methods are internal and are not dispatched. +// +// It is similar to having several impl blocks for the same struct, +// but the impls here are in different modules, so need to use a trait. +// +impl WalStreamDecoderHandler for WalStreamDecoder { + fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { + let validate_impl = || { + if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 { + return Err(format!( + "invalid xlog page header: xlp_magic={}, expected {}", + hdr.xlp_magic, XLOG_PAGE_MAGIC + )); + } + if hdr.xlp_pageaddr != self.lsn.0 { + return Err(format!( + "invalid xlog page header: xlp_pageaddr={}, expected {}", + hdr.xlp_pageaddr, self.lsn + )); + } + match self.state { + State::WaitingForRecord => { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 { + return Err( + "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(), + ); + } + if hdr.xlp_rem_len != 0 { + return Err(format!( + "invalid xlog page header: xlp_rem_len={}, but it's not a contrecord", + hdr.xlp_rem_len + )); + } + } + State::ReassemblingRecord { contlen, .. 
} => { + if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 { + return Err( + "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found" + .into(), + ); + } + if hdr.xlp_rem_len != contlen.get() { + return Err(format!( + "invalid xlog page header: xlp_rem_len={}, expected {}", + hdr.xlp_rem_len, + contlen.get() + )); + } + } + State::SkippingEverything { .. } => { + panic!("Should not be validating page header in the SkippingEverything state"); + } + }; + Ok(()) + }; + validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn }) + } + + /// Attempt to decode another WAL record from the input that has been fed to the + /// decoder so far. + /// + /// Returns one of the following: + /// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself + /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function + /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. + /// + fn poll_decode_internal(&mut self) -> Result, WalDecodeError> { + // Run state machine that validates page headers, and reassembles records + // that cross page boundaries. + loop { + // parse and verify page boundaries as we go + // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason. + match self.state { + State::WaitingForRecord | State::ReassemblingRecord { .. } => { + if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 { + // parse long header + + if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { + return Ok(None); + } + + let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err( + |e| WalDecodeError { + msg: format!("long header deserialization failed {}", e), + lsn: self.lsn, + }, + )?; + + self.validate_page_header(&hdr.std)?; + + self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if self.lsn.block_offset() == 0 { + if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD { + return Ok(None); + } + + let hdr = + XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { + WalDecodeError { + msg: format!("header deserialization failed {}", e), + lsn: self.lsn, + } + })?; + + self.validate_page_header(&hdr)?; + + self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; + } + } + State::SkippingEverything { .. } => {} + } + // now read page contents + match &mut self.state { + State::WaitingForRecord => { + // need to have at least the xl_tot_len field + if self.inputbuf.remaining() < 4 { + return Ok(None); + } + + // peek xl_tot_len at the beginning of the record. + // FIXME: assumes little-endian + let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); + if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { + return Err(WalDecodeError { + msg: format!("invalid xl_tot_len {}", xl_tot_len), + lsn: self.lsn, + }); + } + // Fast path for the common case that the whole record fits on the page. + let pageleft = self.lsn.remaining_in_block() as u32; + if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft { + self.lsn += xl_tot_len as u64; + let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize); + return Ok(Some(self.complete_record(recordbuf)?)); + } else { + // Need to assemble the record from pieces. Remember the size of the + // record, and loop back. On next iterations, we will reach the branch + // below, and copy the part of the record that was on this or next page(s) + // to 'recordbuf'. 
Subsequent iterations will skip page headers, and + // append the continuations from the next pages to 'recordbuf'. + self.state = State::ReassemblingRecord { + recordbuf: BytesMut::with_capacity(xl_tot_len as usize), + contlen: NonZeroU32::new(xl_tot_len).unwrap(), + } + } + } + State::ReassemblingRecord { recordbuf, contlen } => { + // we're continuing a record, possibly from previous page. + let pageleft = self.lsn.remaining_in_block() as u32; + + // read the rest of the record, or as much as fits on this page. + let n = min(contlen.get(), pageleft) as usize; + + if self.inputbuf.remaining() < n { + return Ok(None); + } + + recordbuf.put(self.inputbuf.split_to(n)); + self.lsn += n as u64; + *contlen = match NonZeroU32::new(contlen.get() - n as u32) { + Some(x) => x, + None => { + // The record is now complete. + let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze(); + return Ok(Some(self.complete_record(recordbuf)?)); + } + } + } + State::SkippingEverything { skip_until_lsn } => { + assert!(*skip_until_lsn >= self.lsn); + let n = skip_until_lsn.0 - self.lsn.0; + if self.inputbuf.remaining() < n as usize { + return Ok(None); + } + self.inputbuf.advance(n as usize); + self.lsn += n; + self.state = State::WaitingForRecord; + } + } + } + } + + fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> { + // We now have a record in the 'recordbuf' local variable. + let xlogrec = + XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { + WalDecodeError { + msg: format!("xlog record deserialization failed {}", e), + lsn: self.lsn, + } + })?; + + let mut crc = 0; + crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); + crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]); + if crc != xlogrec.xl_crc { + return Err(WalDecodeError { + msg: "WAL record crc mismatch".into(), + lsn: self.lsn, + }); + } + + // XLOG_SWITCH records are special. If we see one, we need to skip + // to the next WAL segment. + let next_lsn = if xlogrec.is_xlog_switch_record() { + trace!("saw xlog switch record at {}", self.lsn); + self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64) + } else { + // Pad to an 8-byte boundary + self.lsn.align() + }; + self.state = State::SkippingEverything { + skip_until_lsn: next_lsn, + }; + + // We should return LSN of the next record, not the last byte of this record or + // the byte immediately after. Note that this handles both XLOG_SWITCH and usual + // records, the former "spans" until the next WAL segment (see test_xlog_switch). + Ok((next_lsn, recordbuf)) + } +} diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs new file mode 100644 index 0000000000..953723a8f0 --- /dev/null +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -0,0 +1,658 @@ +// +// This file contains common utilities for dealing with PostgreSQL WAL files and +// LSNs. +// +// Many of these functions have been copied from PostgreSQL, and rewritten in +// Rust. That's why they don't follow the usual Rust naming conventions, they +// have been named the same as the corresponding PostgreSQL functions instead. 
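The CRC check in `complete_record` above follows the PostgreSQL convention: CRC-32C is taken first over the record payload that follows `xl_crc`, then over the header bytes that precede it. A standalone sketch of that two-step computation, using the `crc32c` crate this patch already depends on; the 24-byte fixed header and the offset 20 of `xl_crc` match the constants referenced above:

use crc32c::crc32c_append;

const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; // 20: offset of xl_crc in the header
const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4; // 24-byte fixed header

// Compute the record CRC the way complete_record() verifies it: payload after xl_crc first,
// then the header bytes that precede xl_crc. The xl_crc field itself is never covered.
fn wal_record_crc(record: &[u8]) -> u32 {
    let crc = crc32c_append(0, &record[XLOG_RECORD_CRC_OFFS + 4..]);
    crc32c_append(crc, &record[..XLOG_RECORD_CRC_OFFS])
}

fn main() {
    // A fake record: 24-byte header (xl_crc left zero for now) plus some payload bytes.
    let mut record = vec![0u8; XLOG_SIZE_OF_XLOG_RECORD];
    record.extend_from_slice(b"payload bytes");
    let crc = wal_record_crc(&record);
    record[XLOG_RECORD_CRC_OFFS..XLOG_RECORD_CRC_OFFS + 4].copy_from_slice(&crc.to_le_bytes());
    // Re-computing over the same regions reproduces the stored value, because the
    // stored CRC bytes are excluded from the covered ranges.
    assert_eq!(wal_record_crc(&record), crc);
    println!("xl_crc = 0x{crc:08X}");
}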
+// + +use crc32c::crc32c_append; + +use super::super::waldecoder::WalStreamDecoder; +use super::bindings::{ + CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, + XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, +}; +use super::PG_MAJORVERSION; +use crate::pg_constants; +use crate::PG_TLI; +use crate::{uint32, uint64, Oid}; +use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; + +use bytes::BytesMut; +use bytes::{Buf, Bytes}; + +use log::*; + +use serde::Serialize; +use std::fs::File; +use std::io::prelude::*; +use std::io::ErrorKind; +use std::io::SeekFrom; +use std::path::{Path, PathBuf}; +use std::time::SystemTime; +use utils::bin_ser::DeserializeError; +use utils::bin_ser::SerializeError; + +use utils::lsn::Lsn; + +pub const XLOG_FNAME_LEN: usize = 24; +pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; +pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; +pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; + +pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); +pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); +#[allow(clippy::identity_op)] +pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; + +/// Interval of checkpointing metadata file. We should store metadata file to enforce +/// predicate that checkpoint.nextXid is larger than any XID in WAL. +/// But flushing checkpoint file for each transaction seems to be too expensive, +/// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform +/// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions. +/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE +/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG. 
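Forward-aligning `nextXid` on the interval described above is the usual round-up-to-a-power-of-two trick: add `INTERVAL - 1`, then clear the low bits. A small arithmetic sketch:

const XID_CHECKPOINT_INTERVAL: u32 = 1024; // must be a power of two for the mask trick

// Round xid up to the next multiple of XID_CHECKPOINT_INTERVAL (wrapping, as in update_next_xid).
fn align_up(xid: u32) -> u32 {
    xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1)
}

fn main() {
    assert_eq!(align_up(100), 1024);   // anything in 1..=1024 rounds up to 1024
    assert_eq!(align_up(1024), 1024);  // already-aligned values stay put
    assert_eq!(align_up(1025), 2048);
    assert_eq!(align_up(0), 0);
    println!("nextXid is advanced at most once per {XID_CHECKPOINT_INTERVAL} XIDs");
}

Note that `update_next_xid` below aligns `xid + 1`, which is why feeding it exactly 1024 advances `nextXid` to 2048 in `test_update_next_xid` further down.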
+const XID_CHECKPOINT_INTERVAL: u32 = 1024; + +pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { + (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo +} + +pub fn XLogSegNoOffsetToRecPtr( + segno: XLogSegNo, + offset: u32, + wal_segsz_bytes: usize, +) -> XLogRecPtr { + segno * (wal_segsz_bytes as u64) + (offset as u64) +} + +pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { + format!( + "{:>08X}{:>08X}{:>08X}", + tli, + logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), + logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) + ) +} + +pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { + let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); + let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; + let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; + (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) +} + +pub fn IsXLogFileName(fname: &str) -> bool { + return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); +} + +pub fn IsPartialXLogFileName(fname: &str) -> bool { + fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) +} + +/// If LSN points to the beginning of the page, then shift it to first record, +/// otherwise align on 8-bytes boundary (required for WAL records) +pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { + if lsn.0 % XLOG_BLCKSZ as u64 == 0 { + let hdr_size = if lsn.0 % seg_sz as u64 == 0 { + XLOG_SIZE_OF_XLOG_LONG_PHD + } else { + XLOG_SIZE_OF_XLOG_SHORT_PHD + }; + lsn + hdr_size as u64 + } else { + lsn.align() + } +} + +pub fn generate_pg_control( + pg_control_bytes: &[u8], + checkpoint_bytes: &[u8], + lsn: Lsn, +) -> anyhow::Result<(Bytes, u64)> { + let mut pg_control = ControlFileData::decode(pg_control_bytes)?; + let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + + // Generate new pg_control needed for bootstrap + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + + //reset some fields we don't want to preserve + //TODO Check this. + //We may need to determine the value from twophase data. + checkpoint.oldestActiveXid = 0; + + //save new values in pg_control + pg_control.checkPoint = 0; + pg_control.checkPointCopy = checkpoint; + pg_control.state = DBState_DB_SHUTDOWNED; + + Ok((pg_control.encode(), pg_control.system_identifier)) +} + +pub fn get_current_timestamp() -> TimestampTz { + to_pg_timestamp(SystemTime::now()) +} + +pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz { + const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ + const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ + const SECS_PER_DAY: u64 = 86400; + const USECS_PER_SEC: u64 = 1000000; + match time.duration_since(SystemTime::UNIX_EPOCH) { + Ok(n) => { + ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) + * USECS_PER_SEC + + n.subsec_micros() as u64) as i64 + } + Err(_) => panic!("SystemTime before UNIX EPOCH!"), + } +} + +// Returns (aligned) end_lsn of the last record in data_dir with WAL segments. +// start_lsn must point to some previously known record boundary (beginning of +// the next record). If no valid record after is found, start_lsn is returned +// back. +pub fn find_end_of_wal( + data_dir: &Path, + wal_seg_size: usize, + start_lsn: Lsn, // start reading WAL at this point; must point at record start_lsn. 
+) -> anyhow::Result { + let mut result = start_lsn; + let mut curr_lsn = start_lsn; + let mut buf = [0u8; XLOG_BLCKSZ]; + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + debug!("find_end_of_wal PG_VERSION: {}", pg_version); + + let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); + + // loop over segments + loop { + let segno = curr_lsn.segment_number(wal_seg_size); + let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); + let seg_file_path = data_dir.join(seg_file_name); + match open_wal_segment(&seg_file_path)? { + None => { + // no more segments + debug!( + "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist", + result, seg_file_path + ); + return Ok(result); + } + Some(mut segment) => { + let seg_offs = curr_lsn.segment_offset(wal_seg_size); + segment.seek(SeekFrom::Start(seg_offs as u64))?; + // loop inside segment + loop { + let bytes_read = segment.read(&mut buf)?; + if bytes_read == 0 { + break; // EOF + } + curr_lsn += bytes_read as u64; + decoder.feed_bytes(&buf[0..bytes_read]); + + // advance result past all completely read records + loop { + match decoder.poll_decode() { + Ok(Some(record)) => result = record.0, + Err(e) => { + debug!( + "find_end_of_wal reached end at {:?}, decode error: {:?}", + result, e + ); + return Ok(result); + } + Ok(None) => break, // need more data + } + } + } + } + } + } +} + +// Open .partial or full WAL segment file, if present. +fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result> { + let mut partial_path = seg_file_path.to_owned(); + partial_path.set_extension("partial"); + match File::open(partial_path) { + Ok(file) => Ok(Some(file)), + Err(e) => match e.kind() { + ErrorKind::NotFound => { + // .partial not found, try full + match File::open(seg_file_path) { + Ok(file) => Ok(Some(file)), + Err(e) => match e.kind() { + ErrorKind::NotFound => Ok(None), + _ => Err(e.into()), + }, + } + } + _ => Err(e.into()), + }, + } +} + +pub fn main() { + let mut data_dir = PathBuf::new(); + data_dir.push("."); + let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap(); + println!("wal_end={:?}", wal_end); +} + +impl XLogRecord { + pub fn from_slice(buf: &[u8]) -> Result { + use utils::bin_ser::LeSer; + XLogRecord::des(buf) + } + + pub fn from_bytes(buf: &mut B) -> Result { + use utils::bin_ser::LeSer; + XLogRecord::des_from(&mut buf.reader()) + } + + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + Ok(self.ser()?.into()) + } + + // Is this record an XLOG_SWITCH record? They need some special processing, + pub fn is_xlog_switch_record(&self) -> bool { + self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID + } +} + +impl XLogPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { + use utils::bin_ser::LeSer; + XLogPageHeaderData::des_from(&mut buf.reader()) + } +} + +impl XLogLongPageHeaderData { + pub fn from_bytes(buf: &mut B) -> Result { + use utils::bin_ser::LeSer; + XLogLongPageHeaderData::des_from(&mut buf.reader()) + } + + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + self.ser().map(|b| b.into()) + } +} + +pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); + +impl CheckPoint { + pub fn encode(&self) -> Result { + use utils::bin_ser::LeSer; + Ok(self.ser()?.into()) + } + + pub fn decode(buf: &[u8]) -> Result { + use utils::bin_ser::LeSer; + CheckPoint::des(buf) + } + + /// Update next XID based on provided new_xid and stored epoch. + /// Next XID should be greater than new_xid. 
This handles 32-bit + /// XID wraparound correctly. + /// + /// Returns 'true' if the XID was updated. + pub fn update_next_xid(&mut self, xid: u32) -> bool { + // nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround. + let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); + // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. + // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE + new_xid = + new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); + let full_xid = self.nextXid.value; + let old_xid = full_xid as u32; + if new_xid.wrapping_sub(old_xid) as i32 > 0 { + let mut epoch = full_xid >> 32; + if new_xid < old_xid { + // wrap-around + epoch += 1; + } + let nextXid = (epoch << 32) | new_xid as u64; + + if nextXid != self.nextXid.value { + self.nextXid = FullTransactionId { value: nextXid }; + return true; + } + } + false + } +} + +// +// Generate new, empty WAL segment. +// We need this segment to start compute node. +// +pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); + + let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + let hdr = XLogLongPageHeaderData { + std: { + XLogPageHeaderData { + xlp_magic: XLOG_PAGE_MAGIC as u16, + xlp_info: pg_constants::XLP_LONG_HEADER, + xlp_tli: PG_TLI, + xlp_pageaddr: pageaddr, + xlp_rem_len: 0, + ..Default::default() // Put 0 in padding fields. + } + }, + xlp_sysid: system_id, + xlp_seg_size: WAL_SEGMENT_SIZE as u32, + xlp_xlog_blcksz: XLOG_BLCKSZ as u32, + }; + + let hdr_bytes = hdr.encode()?; + seg_buf.extend_from_slice(&hdr_bytes); + + //zero out the rest of the file + seg_buf.resize(WAL_SEGMENT_SIZE, 0); + Ok(seg_buf.freeze()) +} + +#[repr(C)] +#[derive(Serialize)] +struct XlLogicalMessage { + db_id: Oid, + transactional: uint32, // bool, takes 4 bytes due to alignment in C structures + prefix_size: uint64, + message_size: uint64, +} + +impl XlLogicalMessage { + pub fn encode(&self) -> Bytes { + use utils::bin_ser::LeSer; + self.ser().unwrap().into() + } +} + +/// Create new WAL record for non-transactional logical message. +/// Used for creating artificial WAL for tests, as LogicalMessage +/// record is basically no-op. +/// +/// NOTE: This leaves the xl_prev field zero. The safekeeper and +/// pageserver tolerate that, but PostgreSQL does not. 
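`update_next_xid` above folds the 32-bit XID into the 64-bit `nextXid` by keeping the epoch in the high half and bumping it whenever the new low half wraps below the old one. A standalone sketch of just that epoch arithmetic, with made-up values:

// Fold a new 32-bit xid into a 64-bit "full" xid (epoch << 32 | xid), bumping the epoch
// when the 32-bit part wraps around, as update_next_xid() does above.
fn fold_xid(full_xid: u64, new_xid: u32) -> u64 {
    let old_xid = full_xid as u32;
    let mut epoch = full_xid >> 32;
    if new_xid < old_xid {
        // 32-bit wraparound happened between old_xid and new_xid
        epoch += 1;
    }
    (epoch << 32) | new_xid as u64
}

fn main() {
    // Same epoch: the low half simply advances.
    assert_eq!(fold_xid(0x0000_0001_0000_1000, 0x2000), 0x0000_0001_0000_2000);
    // The new xid is numerically smaller, so it belongs to the next epoch.
    assert_eq!(fold_xid(0x0000_0001_FFFF_FF00, 0x0000_0400), 0x0000_0002_0000_0400);
    println!("epoch is bumped on 32-bit wraparound");
}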
+pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { + let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); + prefix_bytes.write_all(prefix.as_bytes()).unwrap(); + prefix_bytes.push(0); + + let message_bytes = message.as_bytes(); + + let logical_message = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as u64, + message_size: message_bytes.len() as u64, + }; + + let mainrdata = logical_message.encode(); + let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); + // only short mainrdata is supported for now + assert!(mainrdata_len <= 255); + let mainrdata_len = mainrdata_len as u8; + + let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; + data.extend_from_slice(&mainrdata); + data.extend_from_slice(&prefix_bytes); + data.extend_from_slice(message_bytes); + + let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); + + let mut header = XLogRecord { + xl_tot_len: total_len as u32, + xl_xid: 0, + xl_prev: 0, + xl_info: 0, + xl_rmid: 21, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: 0, // crc will be calculated later + }; + + let header_bytes = header.encode().expect("failed to encode header"); + let crc = crc32c_append(0, &data); + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut wal: Vec = Vec::new(); + wal.extend_from_slice(&header.encode().expect("failed to encode header")); + wal.extend_from_slice(&data); + + // WAL start position must be aligned at 8 bytes, + // this will add padding for the next WAL record. + const PADDING: usize = 8; + let padding_rem = wal.len() % PADDING; + if padding_rem != 0 { + wal.resize(wal.len() + PADDING - padding_rem, 0); + } + + wal +} + +#[cfg(test)] +mod tests { + use super::super::PG_MAJORVERSION; + use super::*; + use regex::Regex; + use std::cmp::min; + use std::fs; + use std::{env, str::FromStr}; + use utils::const_assert; + + fn init_logging() { + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( + format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), + )) + .is_test(true) + .try_init(); + } + + fn test_end_of_wal(test_name: &str) { + use wal_craft::*; + + let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + + // Craft some WAL + let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join(".."); + let cfg = Conf { + pg_version, + pg_distrib_dir: top_path.join("pg_install"), + datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), + }; + if cfg.datadir.exists() { + fs::remove_dir_all(&cfg.datadir).unwrap(); + } + cfg.initdb().unwrap(); + let srv = cfg.start_server().unwrap(); + let (intermediate_lsns, expected_end_of_wal_partial) = + C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns: Vec = intermediate_lsns + .iter() + .map(|&lsn| u64::from(lsn).into()) + .collect(); + let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + srv.kill(); + + // Check find_end_of_wal on the initial WAL + let last_segment = cfg + .wal_dir() + .read_dir() + .unwrap() + .map(|f| f.unwrap().file_name().into_string().unwrap()) + .filter(|fname| IsXLogFileName(fname)) + .max() + .unwrap(); + check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + for start_lsn in intermediate_lsns + .iter() + .chain(std::iter::once(&expected_end_of_wal)) + { + // Erase all WAL before `start_lsn` to ensure it's not used by 
`find_end_of_wal`. + // We assume that `start_lsn` is non-decreasing. + info!( + "Checking with start_lsn={}, erasing WAL before it", + start_lsn + ); + for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { + let fname = file.file_name().into_string().unwrap(); + if !IsXLogFileName(&fname) { + continue; + } + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + if seg_start_lsn > u64::from(*start_lsn) { + continue; + } + let mut f = File::options().write(true).open(file.path()).unwrap(); + const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + f.write_all( + &ZEROS[0..min( + WAL_SEGMENT_SIZE, + (u64::from(*start_lsn) - seg_start_lsn) as usize, + )], + ) + .unwrap(); + } + check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal); + } + } + + fn check_pg_waldump_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + expected_end_of_wal: Lsn, + ) { + // Get the actual end of WAL by pg_waldump + let waldump_output = cfg + .pg_waldump("000000010000000000000001", last_segment) + .unwrap() + .stderr; + let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); + let caps = match Regex::new(r"invalid record length at (.+):") + .unwrap() + .captures(waldump_output) + { + Some(caps) => caps, + None => { + error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output); + panic!(); + } + }; + let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); + info!( + "waldump erred on {}, expected wal end at {}", + waldump_wal_end, expected_end_of_wal + ); + assert_eq!(waldump_wal_end, expected_end_of_wal); + } + + fn check_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + start_lsn: Lsn, + expected_end_of_wal: Lsn, + ) { + // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) + // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); + // info!( + // "find_end_of_wal returned wal_end={} with non-partial WAL segment", + // wal_end + // ); + // assert_eq!(wal_end, expected_end_of_wal_non_partial); + + // Rename file to partial to actually find last valid lsn, then rename it back. + fs::rename( + cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(format!("{}.partial", last_segment)), + ) + .unwrap(); + let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); + info!( + "find_end_of_wal returned wal_end={} with partial WAL segment", + wal_end + ); + assert_eq!(wal_end, expected_end_of_wal); + fs::rename( + cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir().join(last_segment), + ) + .unwrap(); + } + + const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); + + #[test] + pub fn test_find_end_of_wal_simple() { + init_logging(); + test_end_of_wal::("test_find_end_of_wal_simple"); + } + + #[test] + pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { + init_logging(); + test_end_of_wal::( + "test_find_end_of_wal_crossing_segment_followed_by_small_one", + ); + } + + #[test] + pub fn test_find_end_of_wal_last_crossing_segment() { + init_logging(); + test_end_of_wal::( + "test_find_end_of_wal_last_crossing_segment", + ); + } + + /// Check the math in update_next_xid + /// + /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, + /// currently 1024. 
+ #[test] + pub fn test_update_next_xid() { + let checkpoint_buf = [0u8; std::mem::size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + checkpoint.nextXid = FullTransactionId { value: 10 }; + assert_eq!(checkpoint.nextXid.value, 10); + + // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL + // boundary + checkpoint.update_next_xid(100); + assert_eq!(checkpoint.nextXid.value, 1024); + + // No change + checkpoint.update_next_xid(500); + assert_eq!(checkpoint.nextXid.value, 1024); + checkpoint.update_next_xid(1023); + assert_eq!(checkpoint.nextXid.value, 1024); + + // The function returns the *next* XID, given the highest XID seen so + // far. So when we pass 1024, the nextXid gets bumped up to the next + // XID_CHECKPOINT_INTERVAL boundary. + checkpoint.update_next_xid(1024); + assert_eq!(checkpoint.nextXid.value, 2048); + } + + #[test] + pub fn test_encode_logical_message() { + let expected = [ + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, + 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, + 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + ]; + let actual = encode_logical_message("prefix", "message"); + assert_eq!(expected, actual[..]); + } +} diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml new file mode 100644 index 0000000000..4c35c5a650 --- /dev/null +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "wal_craft" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0" +clap = "4.0" +env_logger = "0.9" +log = "0.4" +once_cell = "1.13.0" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres_ffi = { path = "../" } +tempfile = "3.2" +workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs new file mode 100644 index 0000000000..e87ca27e90 --- /dev/null +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -0,0 +1,139 @@ +use anyhow::*; +use clap::{value_parser, Arg, ArgMatches, Command}; +use std::{path::PathBuf, str::FromStr}; +use wal_craft::*; + +fn main() -> Result<()> { + env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info")) + .init(); + let arg_matches = cli().get_matches(); + + let wal_craft = |arg_matches: &ArgMatches, client| { + let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + .get_one::("type") + .map(|s| s.as_str()) + .context("'type' is required")? + { + Simple::NAME => Simple::craft(client)?, + LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => { + LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)? + } + WalRecordCrossingSegmentFollowedBySmallOne::NAME => { + WalRecordCrossingSegmentFollowedBySmallOne::craft(client)? 
+ } + LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, + a => panic!("Unknown --type argument: {a}"), + }; + for lsn in intermediate_lsns { + println!("intermediate_lsn = {lsn}"); + } + println!("end_of_wal = {end_of_wal_lsn}"); + Ok(()) + }; + + match arg_matches.subcommand() { + None => panic!("No subcommand provided"), + Some(("print-postgres-config", _)) => { + for cfg in REQUIRED_POSTGRES_CONFIG.iter() { + println!("{cfg}"); + } + Ok(()) + } + + Some(("with-initdb", arg_matches)) => { + let cfg = Conf { + pg_version: *arg_matches + .get_one::("pg-version") + .context("'pg-version' is required")?, + pg_distrib_dir: arg_matches + .get_one::("pg-distrib-dir") + .context("'pg-distrib-dir' is required")? + .to_owned(), + datadir: arg_matches + .get_one::("datadir") + .context("'datadir' is required")? + .to_owned(), + }; + cfg.initdb()?; + let srv = cfg.start_server()?; + wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?; + srv.kill(); + Ok(()) + } + Some(("in-existing", arg_matches)) => wal_craft( + arg_matches, + &mut postgres::Config::from_str( + arg_matches + .get_one::("connection") + .context("'connection' is required")?, + ) + .context( + "'connection' argument value could not be parsed as a postgres connection string", + )? + .connect(postgres::NoTls)?, + ), + Some(_) => panic!("Unknown subcommand"), + } +} + +fn cli() -> Command { + let type_arg = &Arg::new("type") + .help("Type of WAL to craft") + .value_parser([ + Simple::NAME, + LastWalRecordXlogSwitch::NAME, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME, + WalRecordCrossingSegmentFollowedBySmallOne::NAME, + LastWalRecordCrossingSegment::NAME, + ]) + .required(true); + + Command::new("Postgres WAL crafter") + .about("Crafts Postgres databases with specific WAL properties") + .subcommand( + Command::new("print-postgres-config") + .about("Print the configuration required for PostgreSQL server before running this script") + ) + .subcommand( + Command::new("with-initdb") + .about("Craft WAL in a new data directory first initialized with initdb") + .arg(type_arg) + .arg( + Arg::new("datadir") + .help("Data directory for the Postgres server") + .value_parser(value_parser!(PathBuf)) + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .value_parser(value_parser!(PathBuf)) + .help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)") + .default_value("/usr/local") + ) + .arg( + Arg::new("pg-version") + .long("pg-version") + .help("Postgres version to use for the initial tenant") + .value_parser(value_parser!(u32)) + .required(true) + + ) + ) + .subcommand( + Command::new("in-existing") + .about("Craft WAL at an existing recently created Postgres database. 
Note that server may append new WAL entries on shutdown.") + .arg(type_arg) + .arg( + Arg::new("connection") + .help("Connection string to the Postgres database to populate") + .required(true) + ) + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs new file mode 100644 index 0000000000..c4404b37ba --- /dev/null +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -0,0 +1,433 @@ +use anyhow::*; +use core::time::Duration; +use log::*; +use once_cell::sync::Lazy; +use postgres::types::PgLsn; +use postgres::Client; +use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; +use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +use std::cmp::Ordering; +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::time::Instant; +use tempfile::{tempdir, TempDir}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Conf { + pub pg_version: u32, + pub pg_distrib_dir: PathBuf, + pub datadir: PathBuf, +} + +pub struct PostgresServer { + process: std::process::Child, + _unix_socket_dir: TempDir, + client_config: postgres::Config, +} + +pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { + vec![ + "wal_keep_size=50MB", // Ensure old WAL is not removed + "shared_preload_libraries=neon", // can only be loaded at startup + // Disable background processes as much as possible + "wal_writer_delay=10s", + "autovacuum=off", + ] +}); + +impl Conf { + pub fn pg_distrib_dir(&self) -> anyhow::Result { + let path = self.pg_distrib_dir.clone(); + + match self.pg_version { + 14 => Ok(path.join(format!("v{}", self.pg_version))), + 15 => Ok(path.join(format!("v{}", self.pg_version))), + _ => bail!("Unsupported postgres version: {}", self.pg_version), + } + } + + fn pg_bin_dir(&self) -> anyhow::Result { + Ok(self.pg_distrib_dir()?.join("bin")) + } + + fn pg_lib_dir(&self) -> anyhow::Result { + Ok(self.pg_distrib_dir()?.join("lib")) + } + + pub fn wal_dir(&self) -> PathBuf { + self.datadir.join("pg_wal") + } + + fn new_pg_command(&self, command: impl AsRef) -> Result { + let path = self.pg_bin_dir()?.join(command); + ensure!(path.exists(), "Command {:?} does not exist", path); + let mut cmd = Command::new(path); + cmd.env_clear() + .env("LD_LIBRARY_PATH", self.pg_lib_dir()?) + .env("DYLD_LIBRARY_PATH", self.pg_lib_dir()?); + Ok(cmd) + } + + pub fn initdb(&self) -> Result<()> { + if let Some(parent) = self.datadir.parent() { + info!("Pre-creating parent directory {:?}", parent); + // Tests may be run concurrently and there may be a race to create `test_output/`. + // std::fs::create_dir_all is guaranteed to have no races with another thread creating directories. + std::fs::create_dir_all(parent)?; + } + info!( + "Running initdb in {:?} with user \"postgres\"", + self.datadir + ); + let output = self + .new_pg_command("initdb")? 
+ .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .output()?; + debug!("initdb output: {:?}", output); + ensure!( + output.status.success(), + "initdb failed, stdout and stderr follow:\n{}{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + Ok(()) + } + + pub fn start_server(&self) -> Result { + info!("Starting Postgres server in {:?}", self.datadir); + let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| { + format!( + "Failed to create pg.log file in directory {}", + self.datadir.display() + ) + })?; + let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols) + let unix_socket_dir_path = unix_socket_dir.path().to_owned(); + let server_process = self + .new_pg_command("postgres")? + .args(&["-c", "listen_addresses="]) + .arg("-k") + .arg(unix_socket_dir_path.as_os_str()) + .arg("-D") + .arg(self.datadir.as_os_str()) + .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) + .stderr(Stdio::from(log_file)) + .spawn()?; + let server = PostgresServer { + process: server_process, + _unix_socket_dir: unix_socket_dir, + client_config: { + let mut c = postgres::Config::new(); + c.host_path(&unix_socket_dir_path); + c.user("postgres"); + c.connect_timeout(Duration::from_millis(1000)); + c + }, + }; + Ok(server) + } + + pub fn pg_waldump( + &self, + first_segment_name: &str, + last_segment_name: &str, + ) -> Result { + let first_segment_file = self.datadir.join(first_segment_name); + let last_segment_file = self.datadir.join(last_segment_name); + info!( + "Running pg_waldump for {} .. {}", + first_segment_file.display(), + last_segment_file.display() + ); + let output = self + .new_pg_command("pg_waldump")? + .args(&[ + &first_segment_file.as_os_str(), + &last_segment_file.as_os_str(), + ]) + .output()?; + debug!("waldump output: {:?}", output); + Ok(output) + } +} + +impl PostgresServer { + pub fn connect_with_timeout(&self) -> Result { + let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap(); + while Instant::now() < retry_until { + use std::result::Result::Ok; + if let Ok(client) = self.client_config.connect(postgres::NoTls) { + return Ok(client); + } + std::thread::sleep(Duration::from_millis(100)); + } + bail!("Connection timed out"); + } + + pub fn kill(mut self) { + self.process.kill().unwrap(); + self.process.wait().unwrap(); + } +} + +impl Drop for PostgresServer { + fn drop(&mut self) { + use std::result::Result::Ok; + match self.process.try_wait() { + Ok(Some(_)) => return, + Ok(None) => { + warn!("Server was not terminated, will be killed"); + } + Err(e) => { + error!("Unable to get status of the server: {}, will be killed", e); + } + } + let _ = self.process.kill(); + } +} + +pub trait PostgresClientExt: postgres::GenericClient { + fn pg_current_wal_insert_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_insert_lsn()", &[])? + .get(0)) + } + fn pg_current_wal_flush_lsn(&mut self) -> Result { + Ok(self + .query_one("SELECT pg_current_wal_flush_lsn()", &[])? 
+ .get(0)) + } +} + +impl PostgresClientExt for C {} + +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { + client.execute("create extension if not exists neon_test_utils", &[])?; + + let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); + ensure!(wal_keep_size == "50MB"); + let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0); + ensure!(wal_writer_delay == "10s"); + let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0); + ensure!(autovacuum == "off"); + + let wal_segment_size = client.query_one( + "select cast(setting as bigint) as setting, unit \ + from pg_settings where name = 'wal_segment_size'", + &[], + )?; + ensure!( + wal_segment_size.get::<_, String>("unit") == "B", + "Unexpected wal_segment_size unit" + ); + ensure!( + wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64, + "Unexpected wal_segment_size in bytes" + ); + + Ok(()) +} + +pub trait Crafter { + const NAME: &'static str; + + /// Generates WAL using the client `client`. Returns a pair of: + /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. + /// May include or exclude Lsn(0) and the end-of-wal. + /// * The expected end-of-wal LSN. + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; +} + +fn craft_internal( + client: &mut C, + f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, +) -> Result<(Vec, PgLsn)> { + ensure_server_config(client)?; + + let initial_lsn = client.pg_current_wal_insert_lsn()?; + info!("LSN initial = {}", initial_lsn); + + let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; + let last_lsn = match last_lsn { + None => client.pg_current_wal_insert_lsn()?, + Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) { + Ordering::Less => bail!("Some records were inserted after the crafted WAL"), + Ordering::Equal => last_lsn, + Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), + }, + }; + if !intermediate_lsns.starts_with(&[initial_lsn]) { + intermediate_lsns.insert(0, initial_lsn); + } + + // Some records may be not flushed, e.g. non-transactional logical messages. + client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; + match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { + Ordering::Less => bail!("Some records were flushed after the crafted WAL"), + Ordering::Equal => {} + Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + } + Ok((intermediate_lsns, last_lsn)) +} + +pub struct Simple; +impl Crafter for Simple { + const NAME: &'static str = "simple"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_internal(client, |client, _| { + client.execute("CREATE table t(x int)", &[])?; + Ok((Vec::new(), None)) + }) + } +} + +pub struct LastWalRecordXlogSwitch; +impl Crafter for LastWalRecordXlogSwitch { + const NAME: &'static str = "last_wal_record_xlog_switch"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. 
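+        // (Sketch of what this crafter produces: the WAL ends with an XLOG_SWITCH
+        // record, and the reported end-of-wal is the next segment boundary,
+        // LSN 0/2000000 below, rather than the end of the switch record itself.)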
+ ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch <= next_segment, + "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + } +} + +pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { + const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. + ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. + // We will use logical message as the padding. We start with detecting how much WAL + // it takes for one logical message, considering all alignments and headers. + let base_wal_advance = { + let before_lsn = client.pg_current_wal_insert_lsn()?; + // Small non-empty message bigger than few bytes is more likely than an empty + // message to have the same format as the big padding message. + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", + &[], + )?; + // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. + (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize + + XLOG_SIZE_OF_XLOG_RECORD + }; + let mut remaining_lsn = + XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; + if remaining_lsn < base_wal_advance { + remaining_lsn += XLOG_BLCKSZ; + } + let repeats = 10 + remaining_lsn - base_wal_advance; + info!( + "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", + client.pg_current_wal_insert_lsn()?, + remaining_lsn, + base_wal_advance, + repeats + ); + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch < next_segment, + "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + ensure!( + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + "XLOG_SWITCH message ended not on page boundary: {}", + after_xlog_switch + ); + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + } +} + +fn craft_single_logical_message( + client: &mut impl postgres::GenericClient, + transactional: bool, +) -> Result<(Vec, PgLsn)> { + craft_internal(client, |client, initial_lsn| { + ensure!( + initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), + "Initial LSN is too far in the future" + ); + + let message_lsn: PgLsn = client + .query_one( + "select pg_logical_emit_message($1, 'big-16mb-msg', \ + concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn", + &[&transactional], + )? + .get("message_lsn"); + ensure!( + message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192), + "Logical message did not cross the segment boundary" + ); + ensure!( + message_lsn < PgLsn::from(0x0400_0000), + "Logical message crossed two segments" + ); + + if transactional { + // Transactional logical messages are part of a transaction, so the one above is + // followed by a small COMMIT record. 
+ + let after_message_lsn = client.pg_current_wal_insert_lsn()?; + ensure!( + message_lsn < after_message_lsn, + "No record found after the emitted message" + ); + Ok((vec![message_lsn], Some(after_message_lsn))) + } else { + Ok((Vec::new(), Some(message_lsn))) + } + }) +} + +pub struct WalRecordCrossingSegmentFollowedBySmallOne; +impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { + const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_single_logical_message(client, true) + } +} + +pub struct LastWalRecordCrossingSegment; +impl Crafter for LastWalRecordCrossingSegment { + const NAME: &'static str = "last_wal_record_crossing_segment"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_single_logical_message(client, false) + } +} diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml new file mode 100644 index 0000000000..4d48e431b4 --- /dev/null +++ b/libs/pq_proto/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "pq_proto" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0" +bytes = "1.0.1" +pin-project-lite = "0.2.7" +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +rand = "0.8.3" +serde = { version = "1.0", features = ["derive"] } +tokio = { version = "1.17", features = ["macros"] } +tracing = "0.1" + +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/zenith_utils/src/pq_proto.rs b/libs/pq_proto/src/lib.rs similarity index 63% rename from zenith_utils/src/pq_proto.rs rename to libs/pq_proto/src/lib.rs index 355b38fc95..2e311dd6e3 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/libs/pq_proto/src/lib.rs @@ -2,16 +2,23 @@ //! //! on message formats. -use crate::sync::{AsyncishRead, SyncFuture}; +// Tools for calling certain async methods in sync contexts. +pub mod sync; + use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_protocol::PG_EPOCH; use serde::{Deserialize, Serialize}; -use std::collections::HashMap; -use std::future::Future; -use std::io::{self, Cursor}; -use std::str; -use std::time::{Duration, SystemTime}; +use std::{ + borrow::Cow, + collections::HashMap, + fmt, + future::Future, + io::{self, Cursor}, + str, + time::{Duration, SystemTime}, +}; +use sync::{AsyncishRead, SyncFuture}; use tokio::io::AsyncReadExt; use tracing::{trace, warn}; @@ -25,8 +32,10 @@ pub const TEXT_OID: Oid = 25; #[derive(Debug)] pub enum FeMessage { StartupPacket(FeStartupPacket), - Query(FeQueryMessage), // Simple query - Parse(FeParseMessage), // Extended query protocol + // Simple query. + Query(Bytes), + // Extended query protocol. + Parse(FeParseMessage), Describe(FeDescribeMessage), Bind(FeBindMessage), Execute(FeExecuteMessage), @@ -47,16 +56,91 @@ pub enum FeStartupPacket { StartupMessage { major_version: u32, minor_version: u32, - params: HashMap, + params: StartupMessageParams, }, } +#[derive(Debug)] +pub struct StartupMessageParams { + params: HashMap, +} + +impl StartupMessageParams { + /// Get parameter's value by its name. + pub fn get(&self, name: &str) -> Option<&str> { + self.params.get(name).map(|s| s.as_str()) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + /// [`None`] means that there's no `options` in [`Self`]. 
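+    ///
+    /// A quick illustration with made-up values (the escape is left in place,
+    /// as described above):
+    ///
+    /// ```
+    /// # use pq_proto::StartupMessageParams;
+    /// let params = StartupMessageParams::new([("options", "-c foo\\ bar")]);
+    /// let opts: Vec<_> = params.options_raw().unwrap().collect();
+    /// assert_eq!(opts, ["-c", "foo\\ bar"]);
+    /// ```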
+ pub fn options_raw(&self) -> Option> { + // See `postgres: pg_split_opts`. + let mut last_was_escape = false; + let iter = self + .get("options")? + .split(move |c: char| { + // We split by non-escaped whitespace symbols. + let should_split = c.is_ascii_whitespace() && !last_was_escape; + last_was_escape = c == '\\' && !last_was_escape; + should_split + }) + .filter(|s| !s.is_empty()); + + Some(iter) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_escaped(&self) -> Option>> { + // See `postgres: pg_split_opts`. + let iter = self.options_raw()?.map(|s| { + let mut preserve_next_escape = false; + let escape = |c| { + // We should remove '\\' unless it's preceded by '\\'. + let should_remove = c == '\\' && !preserve_next_escape; + preserve_next_escape = should_remove; + should_remove + }; + + match s.contains('\\') { + true => Cow::Owned(s.replace(escape, "")), + false => Cow::Borrowed(s), + } + }); + + Some(iter) + } + + // This function is mostly useful in tests. + #[doc(hidden)] + pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { + Self { + params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + } + } +} + #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, } +impl fmt::Display for CancelKeyData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let hi = (self.backend_pid as u64) << 32; + let lo = self.cancel_key as u64; + let id = hi | lo; + + // This format is more compact and might work better for logs. + f.debug_tuple("CancelKeyData") + .field(&format_args!("{:x}", id)) + .finish() + } +} + use rand::distributions::{Distribution, Standard}; impl Distribution for Standard { fn sample(&self, rng: &mut R) -> CancelKeyData { @@ -67,11 +151,6 @@ impl Distribution for Standard { } } -#[derive(Debug)] -pub struct FeQueryMessage { - pub body: Bytes, -} - // We only support the simple case of Parse on unnamed prepared statement and // no params #[derive(Debug)] @@ -87,7 +166,7 @@ pub struct FeDescribeMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeBindMessage {} +pub struct FeBindMessage; // we only support unnamed prepared stmt or portal #[derive(Debug)] @@ -98,7 +177,22 @@ pub struct FeExecuteMessage { // we only support unnamed prepared stmt and portal #[derive(Debug)] -pub struct FeCloseMessage {} +pub struct FeCloseMessage; + +/// Retry a read on EINTR +/// +/// This runs the enclosed expression, and if it returns +/// Err(io::ErrorKind::Interrupted), retries it. +macro_rules! retry_read { + ( $x:expr ) => { + loop { + match $x { + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + res => break res, + } + } + }; +} impl FeMessage { /// Read one message from the stream. @@ -107,7 +201,7 @@ impl FeMessage { /// /// ``` /// # use std::io; - /// # use zenith_utils::pq_proto::FeMessage; + /// # use pq_proto::FeMessage; /// # /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> { /// # Ok(()) @@ -141,27 +235,25 @@ impl FeMessage { // Each libpq message begins with a message type byte, followed by message length // If the client closes the connection, return None. But if the client closes the // connection in the middle of a message, we will return an error. 
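+        // Wire format of a regular frontend message, per the PostgreSQL
+        // frontend/backend protocol: a 1-byte type tag, then a big-endian u32
+        // length that counts itself (hence the subtraction of 4 below), then the body.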
- let tag = match stream.read_u8().await { + let tag = match retry_read!(stream.read_u8().await) { Ok(b) => b, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), }; - let len = stream.read_u32().await?; - // The message length includes itself, so it better be at least 4 - let bodylen = len + // The message length includes itself, so it better be at least 4. + let len = retry_read!(stream.read_u32().await)? .checked_sub(4) - .context("invalid message length: parsing u32")?; + .context("invalid message length")?; - // Read message body - let mut body_buf: Vec = vec![0; bodylen as usize]; - stream.read_exact(&mut body_buf).await?; + let body = { + let mut buffer = vec![0u8; len as usize]; + stream.read_exact(&mut buffer).await?; + Bytes::from(buffer) + }; - let body = Bytes::from(body_buf); - - // Parse it match tag { - b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))), + b'Q' => Ok(Some(FeMessage::Query(body))), b'P' => Ok(Some(FeParseMessage::parse(body)?)), b'D' => Ok(Some(FeDescribeMessage::parse(body)?)), b'E' => Ok(Some(FeExecuteMessage::parse(body)?)), @@ -207,17 +299,18 @@ impl FeStartupPacket { // reading 4 bytes, to be precise), return None to indicate that the connection // was closed. This matches the PostgreSQL server's behavior, which avoids noise // in the log if the client opens connection but closes it immediately. - let len = match stream.read_u32().await { + let len = match retry_read!(stream.read_u32().await) { Ok(len) => len as usize, Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None), Err(e) => return Err(e.into()), }; + #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { bail!("invalid message length"); } - let request_code = stream.read_u32().await?; + let request_code = retry_read!(stream.read_u32().await)?; // the rest of startup packet are params let params_len = len - 8; @@ -225,9 +318,9 @@ impl FeStartupPacket { stream.read_exact(params_bytes.as_mut()).await?; // Parse params depending on request code - let most_sig_16_bits = request_code >> 16; - let least_sig_16_bits = request_code & ((1 << 16) - 1); - let message = match (most_sig_16_bits, least_sig_16_bits) { + let req_hi = request_code >> 16; + let req_lo = request_code & ((1 << 16) - 1); + let message = match (req_hi, req_lo) { (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { ensure!(params_len == 8, "expected 8 bytes for CancelRequest params"); let mut cursor = Cursor::new(params_bytes); @@ -236,172 +329,122 @@ impl FeStartupPacket { cancel_key: cursor.read_i32().await?, }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest, + (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + // Requested upgrade to SSL (aka TLS) + FeStartupPacket::SslRequest + } (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { bail!("Unrecognized request code {}", unrecognized_code) } + // TODO bail if protocol major_version is not 3? (major_version, minor_version) => { - // TODO bail if protocol major_version is not 3? 
- // Parse null-terminated (String) pairs of param name / param value - let params_str = str::from_utf8(¶ms_bytes).unwrap(); - let mut params_tokens = params_str.split('\0'); - let mut params: HashMap = HashMap::new(); - while let Some(name) = params_tokens.next() { - let value = params_tokens + // Parse pairs of null-terminated strings (key, value). + // See `postgres: ProcessStartupPacket, build_startup_packet`. + let mut tokens = str::from_utf8(¶ms_bytes) + .context("StartupMessage params: invalid utf-8")? + .strip_suffix('\0') // drop packet's own null terminator + .context("StartupMessage params: missing null terminator")? + .split_terminator('\0'); + + let mut params = HashMap::new(); + while let Some(name) = tokens.next() { + let value = tokens .next() - .context("expected even number of params in StartupMessage")?; - if name == "options" { - // deprecated way of passing params as cmd line args - for cmdopt in value.split(' ') { - let nameval: Vec<&str> = cmdopt.split('=').collect(); - if nameval.len() == 2 { - params.insert(nameval[0].to_string(), nameval[1].to_string()); - } - } - } else { - params.insert(name.to_string(), value.to_string()); - } + .context("StartupMessage params: key without value")?; + + params.insert(name.to_owned(), value.to_owned()); } + FeStartupPacket::StartupMessage { major_version, minor_version, - params, + params: StartupMessageParams { params }, } } }; + Ok(Some(FeMessage::StartupPacket(message))) }) } } impl FeParseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let _pstmt_name = read_null_terminated(&mut buf)?; - let query_string = read_null_terminated(&mut buf)?; - let nparams = buf.get_i16(); - + fn parse(mut buf: Bytes) -> anyhow::Result { // FIXME: the rust-postgres driver uses a named prepared statement // for copy_out(). We're not prepared to handle that correctly. For // now, just ignore the statement name, assuming that the client never // uses more than one prepared statement at a time. 
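+        // For reference, the Parse ('P') body is: destination prepared statement
+        // name (cstring), query string (cstring), then an i16 count of parameter
+        // data types, which we require to be zero here.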
- /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Parse", - )); - } - */ - if nparams != 0 { - bail!("query params not implemented"); - } + let _pstmt_name = read_cstr(&mut buf)?; + let query_string = read_cstr(&mut buf)?; + let nparams = buf.get_i16(); + + ensure!(nparams == 0, "query params not implemented"); Ok(FeMessage::Parse(FeParseMessage { query_string })) } } impl FeDescribeMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let kind = buf.get_u8(); - let _pstmt_name = read_null_terminated(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented in Describe", - )); - } - */ - - if kind != b'S' { - bail!("only prepared statmement Describe is implemented"); - } + ensure!( + kind == b'S', + "only prepared statemement Describe is implemented" + ); Ok(FeMessage::Describe(FeDescribeMessage { kind })) } } impl FeExecuteMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; + fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; let maxrows = buf.get_i32(); - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } - - if maxrows != 0 { - bail!("row limit in Execute message not supported"); - } + ensure!(portal_name.is_empty(), "named portals not implemented"); + ensure!(maxrows == 0, "row limit in Execute message not implemented"); Ok(FeMessage::Execute(FeExecuteMessage { maxrows })) } } impl FeBindMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { - let portal_name = read_null_terminated(&mut buf)?; - let _pstmt_name = read_null_terminated(&mut buf)?; - - if !portal_name.is_empty() { - bail!("named portals not implemented"); - } + fn parse(mut buf: Bytes) -> anyhow::Result { + let portal_name = read_cstr(&mut buf)?; + let _pstmt_name = read_cstr(&mut buf)?; // FIXME: see FeParseMessage::parse - /* - if !pstmt_name.is_empty() { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "named prepared statements not implemented", - )); - } - */ + ensure!(portal_name.is_empty(), "named portals not implemented"); - Ok(FeMessage::Bind(FeBindMessage {})) + Ok(FeMessage::Bind(FeBindMessage)) } } impl FeCloseMessage { - pub fn parse(mut buf: Bytes) -> anyhow::Result { + fn parse(mut buf: Bytes) -> anyhow::Result { let _kind = buf.get_u8(); - let _pstmt_or_portal_name = read_null_terminated(&mut buf)?; + let _pstmt_or_portal_name = read_cstr(&mut buf)?; // FIXME: we do nothing with Close - - Ok(FeMessage::Close(FeCloseMessage {})) + Ok(FeMessage::Close(FeCloseMessage)) } } -fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result { - let mut result = BytesMut::new(); - - loop { - if !buf.has_remaining() { - bail!("no null-terminator in string"); - } - - let byte = buf.get_u8(); - - if byte == 0 { - break; - } - result.put_u8(byte); - } - Ok(result.freeze()) -} - // Backend #[derive(Debug)] pub enum BeMessage<'a> { AuthenticationOk, - AuthenticationMD5Password(&'a [u8; 4]), + AuthenticationMD5Password([u8; 4]), + AuthenticationSasl(BeAuthenticationSaslMessage<'a>), AuthenticationCleartextPassword, BackendKeyData(CancelKeyData), BindComplete, @@ -416,7 +459,7 @@ pub enum BeMessage<'a> { // None means column is NULL DataRow(&'a [Option<&'a [u8]>]), 
ErrorResponse(&'a str), - // single byte - used in response to SSLRequest/GSSENCRequest + /// Single byte - used in response to SSLRequest/GSSENCRequest. EncryptionResponse(bool), NoData, ParameterDescription, @@ -425,10 +468,17 @@ pub enum BeMessage<'a> { ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), - NoticeResponse(String), + NoticeResponse(&'a str), KeepAlive(WalSndKeepAlive), } +#[derive(Debug)] +pub enum BeAuthenticationSaslMessage<'a> { + Methods(&'a [&'a str]), + Continue(&'a [u8]), + Final(&'a [u8]), +} + #[derive(Debug)] pub enum BeParameterStatusMessage<'a> { Encoding(&'a str), @@ -441,7 +491,7 @@ impl BeParameterStatusMessage<'static> { } } -// One row desciption in RowDescription packet. +// One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { pub name: &'a [u8], @@ -480,6 +530,18 @@ impl RowDescriptor<'_> { formatcode: 0, } } + + pub const fn text_col(name: &[u8]) -> RowDescriptor { + RowDescriptor { + name, + tableoid: 0, + attnum: 0, + typoid: TEXT_OID, + typlen: -1, + typmod: 0, + formatcode: 0, + } + } } #[derive(Debug)] @@ -510,49 +572,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri formatcode: 0, }]); -// Safe usize -> i32|i16 conversion, from rust-postgres -trait FromUsize: Sized { - fn from_usize(x: usize) -> Result; -} - -macro_rules! from_usize { - ($t:ty) => { - impl FromUsize for $t { - #[inline] - fn from_usize(x: usize) -> io::Result<$t> { - if x > <$t>::max_value() as usize { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "value too large to transmit", - )) - } else { - Ok(x as $t) - } - } - } - }; -} - -from_usize!(i32); - /// Call f() to write body of the message and prepend it with 4-byte len as /// prescribed by the protocol. -fn write_body(buf: &mut BytesMut, f: F) -> io::Result<()> -where - F: FnOnce(&mut BytesMut) -> io::Result<()>, -{ +fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { let base = buf.len(); buf.extend_from_slice(&[0; 4]); - f(buf)?; + let res = f(buf); - let size = i32::from_usize(buf.len() - base)?; + let size = i32::try_from(buf.len() - base).expect("message too big to transmit"); (&mut buf[base..]).put_slice(&size.to_be_bytes()); - Ok(()) + + res } /// Safe write of s into buf as cstring (String in the protocol). -pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { if s.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, @@ -564,21 +599,17 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { Ok(()) } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] - }; - std::str::from_utf8(without_null).map_err(|e| e.into()) +fn read_cstr(buf: &mut Bytes) -> anyhow::Result { + let pos = buf.iter().position(|x| *x == 0); + let result = buf.split_to(pos.context("missing terminator")?); + buf.advance(1); // drop the null terminator + Ok(result) } impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut - // here as msg len preceeds its body and it is handy to write it down first + // here as msg len precedes its body and it is handy to write it down first // and then fill the length. 
With Write we would have to either calc it // manually or have one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { @@ -587,18 +618,14 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(0); // Specifies that the authentication was successful. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationCleartextPassword => { buf.put_u8(b'R'); write_body(buf, |buf| { buf.put_i32(3); // Specifies that clear text password is required. - Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + }); } BeMessage::AuthenticationMD5Password(salt) => { @@ -606,9 +633,32 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(5); // Specifies that an MD5-encrypted password is required. buf.put_slice(&salt[..]); + }); + } + + BeMessage::AuthenticationSasl(msg) => { + buf.put_u8(b'R'); + write_body(buf, |buf| { + use BeAuthenticationSaslMessage::*; + match msg { + Methods(methods) => { + buf.put_i32(10); // Specifies that SASL auth method is used. + for method in methods.iter() { + write_cstr(method.as_bytes(), buf)?; + } + buf.put_u8(0); // zero terminator for the list + } + Continue(extra) => { + buf.put_i32(11); // Continue SASL auth. + buf.put_slice(extra); + } + Final(extra) => { + buf.put_i32(12); // Send final SASL message. + buf.put_slice(extra); + } + } Ok::<_, io::Error>(()) - }) - .unwrap(); // write into BytesMut can't fail + })?; } BeMessage::BackendKeyData(key_data) => { @@ -616,77 +666,64 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { buf.put_i32(key_data.backend_pid); buf.put_i32(key_data.cancel_key); - Ok(()) - }) - .unwrap(); + }); } BeMessage::BindComplete => { buf.put_u8(b'2'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CloseComplete => { buf.put_u8(b'3'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CommandComplete(cmd) => { buf.put_u8(b'C'); - write_body(buf, |buf| { - write_cstr(cmd, buf)?; - Ok::<_, io::Error>(()) - })?; + write_body(buf, |buf| write_cstr(cmd, buf))?; } BeMessage::CopyData(data) => { buf.put_u8(b'd'); write_body(buf, |buf| { buf.put_slice(data); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::CopyDone => { buf.put_u8(b'c'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyFail => { buf.put_u8(b'f'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::CopyInResponse => { buf.put_u8(b'G'); write_body(buf, |buf| { - buf.put_u8(1); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(1); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyOutResponse => { buf.put_u8(b'H'); write_body(buf, |buf| { - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::CopyBothResponse => { buf.put_u8(b'W'); write_body(buf, |buf| { // doesn't matter, used only for replication - buf.put_u8(0); /* copy_is_binary */ - buf.put_i16(0); /* numAttributes */ - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(0); // copy_is_binary + buf.put_i16(0); // numAttributes + }); } BeMessage::DataRow(vals) => { @@ -701,9 +738,7 @@ impl<'a> BeMessage<'a> { buf.put_i32(-1); } } - Ok::<_, io::Error>(()) - 
}) - .unwrap(); + }); } // ErrorResponse is a zero-terminated array of zero-terminated fields. @@ -718,18 +753,17 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'E'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("ERROR"), buf)?; + buf.put_slice(b"ERROR\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } // NoticeResponse has the same format as ErrorResponse. From doc: "The frontend should display the @@ -742,23 +776,22 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'N'); write_body(buf, |buf| { buf.put_u8(b'S'); // severity - write_cstr(&Bytes::from("NOTICE"), buf)?; + buf.put_slice(b"NOTICE\0"); buf.put_u8(b'C'); // SQLSTATE error code - write_cstr(&Bytes::from("CXX000"), buf)?; + buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message write_cstr(error_msg.as_bytes(), buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) - }) - .unwrap(); + })?; } BeMessage::NoData => { buf.put_u8(b'n'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::EncryptionResponse(should_negotiate) => { @@ -783,9 +816,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'S'); write_body(buf, |buf| { buf.put_slice(&buffer[..cnt]); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParameterDescription => { @@ -793,23 +824,19 @@ impl<'a> BeMessage<'a> { write_body(buf, |buf| { // we don't support params, so always 0 buf.put_i16(0); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::ParseComplete => { buf.put_u8(b'1'); - write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); + write_body(buf, |_| {}); } BeMessage::ReadyForQuery => { buf.put_u8(b'Z'); write_body(buf, |buf| { buf.put_u8(b'I'); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::RowDescription(rows) => { @@ -837,9 +864,7 @@ impl<'a> BeMessage<'a> { buf.put_u64(body.wal_end); buf.put_i64(body.timestamp); buf.put_slice(body.data); - Ok::<_, io::Error>(()) - }) - .unwrap(); + }); } BeMessage::KeepAlive(req) => { @@ -848,20 +873,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1u8 } else { 0u8 }); - Ok::<_, io::Error>(()) - }) - .unwrap(); + buf.put_u8(if req.request_reply { 1 } else { 0 }); + }); } } Ok(()) } } -// Zenith extension of postgres replication protocol -// See ZENITH_STATUS_UPDATE_TAG_BYTE -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -pub struct ZenithFeedback { +// Neon extension of postgres replication protocol +// See NEON_STATUS_UPDATE_TAG_BYTE +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, // Parts of StandbyStatusUpdate we resend to compute via safekeeper @@ -871,13 +894,13 @@ pub struct ZenithFeedback { pub ps_replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback. +// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. // Do not remove previously available fields because this might be backwards incompatible. 
-pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ZenithFeedback { - pub fn empty() -> ZenithFeedback { - ZenithFeedback { +impl ReplicationFeedback { + pub fn empty() -> ReplicationFeedback { + ReplicationFeedback { current_timeline_size: 0, ps_writelsn: 0, ps_applylsn: 0, @@ -886,7 +909,7 @@ impl ZenithFeedback { } } - // Serialize ZenithFeedback using custom format + // Serialize ReplicationFeedback using custom format // to support protocol extensibility. // // Following layout is used: @@ -897,18 +920,18 @@ impl ZenithFeedback { // uint32 - value length in bytes // value itself pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys - write_cstr(&Bytes::from("current_timeline_size"), buf)?; + buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); - write_cstr(&Bytes::from("ps_writelsn"), buf)?; + buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.ps_writelsn); - write_cstr(&Bytes::from("ps_flushlsn"), buf)?; + buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.ps_flushlsn); - write_cstr(&Bytes::from("ps_applylsn"), buf)?; + buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.ps_applylsn); @@ -918,64 +941,61 @@ impl ZenithFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; - write_cstr(&Bytes::from("ps_replytime"), buf)?; + buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); Ok(()) } - // Deserialize ZenithFeedback message - pub fn parse(mut buf: Bytes) -> ZenithFeedback { - let mut zf = ZenithFeedback::empty(); + // Deserialize ReplicationFeedback message + pub fn parse(mut buf: Bytes) -> ReplicationFeedback { + let mut rf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); - let mut i = 0; - while i < nfields { - i += 1; - let key_cstr = read_null_terminated(&mut buf).unwrap(); - let key = cstr_to_str(&key_cstr).unwrap(); - match key { - "current_timeline_size" => { + for _ in 0..nfields { + let key = read_cstr(&mut buf).unwrap(); + match key.as_ref() { + b"current_timeline_size" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.current_timeline_size = buf.get_u64(); + rf.current_timeline_size = buf.get_u64(); } - "ps_writelsn" => { + b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_writelsn = buf.get_u64(); + rf.ps_writelsn = buf.get_u64(); } - "ps_flushlsn" => { + b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_flushlsn = buf.get_u64(); + rf.ps_flushlsn = buf.get_u64(); } - "ps_applylsn" => { + b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - zf.ps_applylsn = buf.get_u64(); + rf.ps_applylsn = buf.get_u64(); } - "ps_replytime" => { + b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { let len = buf.get_i32(); warn!( - "ZenithFeedback parse. unknown key {} of len {}. Skip it.", - key, len + "ReplicationFeedback parse. unknown key {} of len {len}. 
Skip it.", + String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } } } - trace!("ZenithFeedback parsed is {:?}", zf); - zf + trace!("ReplicationFeedback parsed is {:?}", rf); + rf } } @@ -984,43 +1004,70 @@ mod tests { use super::*; #[test] - fn test_zenithfeedback_serialization() { - let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values - zf.current_timeline_size = 12345678; + fn test_replication_feedback_serialization() { + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); - let zf_parsed = ZenithFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); } #[test] - fn test_zenithfeedback_unknown_key() { - let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values - zf.current_timeline_size = 12345678; + fn test_replication_feedback_unknown_key() { + let mut rf = ReplicationFeedback::empty(); + // Fill rf with some values + rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); - zf.serialize(&mut data).unwrap(); + rf.serialize(&mut data).unwrap(); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1; + *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; } - write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap(); + data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ZenithFeedback::parse(data.freeze()); - assert_eq!(zf, zf_parsed); + let rf_parsed = ReplicationFeedback::parse(data.freeze()); + assert_eq!(rf, rf_parsed); + } + + #[test] + fn test_startup_message_params_options_escaped() { + fn split_options(params: &StartupMessageParams) -> Vec> { + params + .options_escaped() + .expect("options are None") + .collect() + } + + let make_params = |options| StartupMessageParams::new([("options", options)]); + + let params = StartupMessageParams::new([]); + assert!(matches!(params.options_escaped(), None)); + + let params = make_params(""); + assert!(split_options(¶ms).is_empty()); + + let params = make_params("foo"); + assert_eq!(split_options(¶ms), ["foo"]); + + let params = make_params(" foo bar "); + assert_eq!(split_options(¶ms), ["foo", "bar"]); + + let params = make_params("foo\\ bar \\ \\\\ baz\\ lol"); + assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]); } // Make sure that `read` is sync/async callable diff --git a/zenith_utils/src/sync.rs b/libs/pq_proto/src/sync.rs similarity index 99% rename from zenith_utils/src/sync.rs rename to libs/pq_proto/src/sync.rs index 5e61480bc3..b7ff1fb70b 100644 --- a/zenith_utils/src/sync.rs +++ b/libs/pq_proto/src/sync.rs @@ -29,7 +29,7 @@ impl SyncFuture { /// Example: /// /// ``` - /// # use 
zenith_utils::sync::SyncFuture; + /// # use pq_proto::sync::SyncFuture; /// # use std::future::Future; /// # use tokio::io::AsyncReadExt; /// # diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml new file mode 100644 index 0000000000..f54d91905c --- /dev/null +++ b/libs/remote_storage/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "remote_storage" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = { version = "1.0", features = ["backtrace"] } +async-trait = "0.1" +metrics = { version = "0.1", path = "../metrics" } +utils = { version = "0.1", path = "../utils" } +once_cell = "1.13.0" +rusoto_core = "0.48" +rusoto_s3 = "0.48" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +toml_edit = { version = "0.14", features = ["easy"] } +tracing = "0.1.27" + +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] +tempfile = "3.2" diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs new file mode 100644 index 0000000000..4bdd2b9608 --- /dev/null +++ b/libs/remote_storage/src/lib.rs @@ -0,0 +1,451 @@ +//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. +//! No other modules from this tree are supposed to be used directly by the external code. +//! +//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: +//! * [`local_fs`] allows to use local file system as an external storage +//! * [`s3_bucket`] uses AWS S3 bucket as an external storage +//! +mod local_fs; +mod s3_bucket; + +use std::{ + collections::HashMap, + fmt::{Debug, Display}, + num::{NonZeroU32, NonZeroUsize}, + ops::Deref, + path::{Path, PathBuf}, + pin::Pin, + sync::Arc, +}; + +use anyhow::{bail, Context}; + +use tokio::io; +use toml_edit::Item; +use tracing::info; + +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; + +/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. +/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency +/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach. +/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed. +pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50; +pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; +/// Currently, sync happens with AWS S3, that has two limits on requests per second: +/// ~200 RPS for IAM services +/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html +/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests +/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ +pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; + +const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; + +#[derive(Clone, PartialEq, Eq)] +pub struct RemoteObjectId(String); + +/// +/// A key that refers to an object in remote storage. It works much like a Path, +/// but it's a separate datatype so that you don't accidentally mix local paths +/// and remote keys. 
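+///
+/// For example, both `"a/b/c"` and `"a/b/c/"` have the object name `"c"`;
+/// see `object_name` below and the unit tests at the bottom of this file.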
+/// +impl RemoteObjectId { + // Needed to retrieve last component for RemoteObjectId. + // In other words a file name + /// Turn a/b/c or a/b/c/ into c + pub fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } +} + +impl Debug for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + Debug::fmt(&self.0, fmt) + } +} + +impl Display for RemoteObjectId { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Display::fmt(&self.0, fmt) + } +} + +/// Storage (potentially remote) API to manage its state. +/// This storage tries to be unaware of any layered repository context, +/// providing basic CRUD operations for storage files. +#[async_trait::async_trait] +pub trait RemoteStorage: Send + Sync + 'static { + /// Attempts to derive the storage path out of the local path, if the latter is correct. + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; + + /// Gets the download path of the given storage file. + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result; + + /// Lists all items the storage has right now. + async fn list(&self) -> anyhow::Result>; + + /// Lists all top level subdirectories for a given prefix + /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id + /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) + /// so this method doesnt need to. + async fn list_prefixes( + &self, + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result>; + + /// Streams the local file contents into remote into the remote storage entry. + async fn upload( + &self, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + from_size_bytes: usize, + to: &RemoteObjectId, + metadata: Option, + ) -> anyhow::Result<()>; + + /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. + async fn download(&self, from: &RemoteObjectId) -> Result; + + /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Returns the metadata, if any was stored with the file previously. + async fn download_byte_range( + &self, + from: &RemoteObjectId, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result; + + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>; + + /// Downcast to LocalFs implementation. For tests. + fn as_local(&self) -> Option<&LocalFs> { + None + } +} + +pub struct Download { + pub download_stream: Pin>, + /// Extra key-value data, associated with the current remote file. 
+ pub metadata: Option, +} + +impl Debug for Download { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Download") + .field("metadata", &self.metadata) + .finish() + } +} + +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"), + } + } +} + +impl std::error::Error for DownloadError {} + +/// Every storage, currently supported. +/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. +#[derive(Clone)] +pub struct GenericRemoteStorage(Arc); + +impl Deref for GenericRemoteStorage { + type Target = dyn RemoteStorage; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl GenericRemoteStorage { + pub fn new(storage: impl RemoteStorage) -> Self { + Self(Arc::new(storage)) + } + + pub fn from_config( + working_directory: PathBuf, + storage_config: &RemoteStorageConfig, + ) -> anyhow::Result { + Ok(match &storage_config.storage { + RemoteStorageKind::LocalFs(root) => { + info!("Using fs root '{}' as a remote storage", root.display()); + GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?) + } + RemoteStorageKind::AwsS3(s3_config) => { + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); + GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?) + } + }) + } + + /// Takes storage object contents and its size and uploads to remote storage, + /// mapping `from_path` to the corresponding remote object id in the storage. + /// + /// The storage object does not have to be present on the `from_path`, + /// this path is used for the remote object id conversion only. + pub async fn upload_storage_object( + &self, + from: Box, + from_size_bytes: usize, + from_path: &Path, + ) -> anyhow::Result<()> { + let target_storage_path = self.remote_object_id(from_path).with_context(|| { + format!( + "Failed to get the storage path for source local path '{}'", + from_path.display() + ) + })?; + + self.upload(from, from_size_bytes, &target_storage_path, None) + .await + .with_context(|| { + format!( + "Failed to upload from '{}' to storage path '{:?}'", + from_path.display(), + target_storage_path + ) + }) + } + + /// Downloads the storage object into the `to_path` provided. + /// `byte_range` could be specified to dowload only a part of the file, if needed. 
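+    /// The range is interpreted as `(start_inclusive, end_exclusive)`, mirroring
+    /// [`RemoteStorage::download_byte_range`].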
+ pub async fn download_storage_object( + &self, + byte_range: Option<(u64, Option)>, + to_path: &Path, + ) -> Result { + let remote_object_path = self + .remote_object_id(to_path) + .with_context(|| { + format!( + "Failed to get the storage path for target local path '{}'", + to_path.display() + ) + }) + .map_err(DownloadError::BadInput)?; + + match byte_range { + Some((start, end)) => { + self.download_byte_range(&remote_object_path, start, end) + .await + } + None => self.download(&remote_object_path).await, + } + } +} + +/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry. +/// Immutable, cannot be changed once the file is created. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StorageMetadata(HashMap); + +fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { + if prefix == path { + anyhow::bail!( + "Prefix and the path are equal, cannot strip: '{}'", + prefix.display() + ) + } else { + path.strip_prefix(prefix).with_context(|| { + format!( + "Path '{}' is not prefixed with '{}'", + path.display(), + prefix.display(), + ) + }) + } +} + +/// External backup storage configuration, enough for creating a client for that storage. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RemoteStorageConfig { + /// Max allowed number of concurrent sync operations between the API user and the remote storage. + pub max_concurrent_syncs: NonZeroUsize, + /// Max allowed errors before the sync task is considered failed and evicted. + pub max_sync_errors: NonZeroU32, + /// The storage connection configuration. + pub storage: RemoteStorageKind, +} + +/// A kind of a remote storage to connect to, with its connection configuration. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), +} + +/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq)] +pub struct S3Config { + /// Name of the bucket to connect to. + pub bucket_name: String, + /// The region where the bucket is located at. + pub bucket_region: String, + /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once. + pub prefix_in_bucket: Option, + /// A base URL to send S3 requests to. + /// By default, the endpoint is derived from a region name, assuming it's + /// an AWS S3 region name, erroring on wrong region name. + /// Endpoint provides a way to support other S3 flavors and their regions. + /// + /// Example: `http://127.0.0.1:5000` + pub endpoint: Option, + /// AWS S3 has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. 
+ pub concurrency_limit: NonZeroUsize, +} + +impl Debug for S3Config { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("S3Config") + .field("bucket_name", &self.bucket_name) + .field("bucket_region", &self.bucket_region) + .field("prefix_in_bucket", &self.prefix_in_bucket) + .field("concurrency_limit", &self.concurrency_limit) + .finish() + } +} + +impl RemoteStorageConfig { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + let local_path = toml.get("local_path"); + let bucket_name = toml.get("bucket_name"); + let bucket_region = toml.get("bucket_region"); + + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), + ) + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; + + let storage = match (local_path, bucket_name, bucket_region) { + (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + (_, Some(_), None) => { + bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") + } + (_, None, Some(_)) => { + bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") + } + (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) + .transpose()?, + endpoint: toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?, + concurrency_limit, + }), + (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( + parse_toml_string("local_path", local_path)?, + )), + (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + }; + + Ok(RemoteStorageConfig { + max_concurrent_syncs, + max_sync_errors, + storage, + }) + } +} + +// Helper functions to parse a toml Item +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + +fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { + let s = item + .as_str() + .with_context(|| format!("configure option {name} is not a string"))?; + Ok(s.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn object_name() { + let k = RemoteObjectId("a/b/c".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/b/c/".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = RemoteObjectId("a/".to_owned()); + assert_eq!(k.object_name(), Some("a")); + + // XXX is 
it impossible to have an empty key? + let k = RemoteObjectId("".to_owned()); + assert_eq!(k.object_name(), None); + + let k = RemoteObjectId("/".to_owned()); + assert_eq!(k.object_name(), None); + } +} diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs new file mode 100644 index 0000000000..2f824cc453 --- /dev/null +++ b/libs/remote_storage/src/local_fs.rs @@ -0,0 +1,932 @@ +//! Local filesystem acting as a remote storage. +//! Multiple API users can use the same "storage" of this kind by using different storage roots. +//! +//! This storage used in tests, but can also be used in cases when a certain persistent +//! volume is mounted to the local FS. + +use std::{ + future::Future, + path::{Path, PathBuf}, + pin::Pin, +}; + +use anyhow::{bail, ensure, Context}; +use tokio::{ + fs, + io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, +}; +use tracing::*; +use utils::crashsafe::path_with_suffix_extension; + +use crate::{Download, DownloadError, RemoteObjectId}; + +use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; + +const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; + +/// Convert a Path in the remote storage into a RemoteObjectId +fn remote_object_id_from_path(path: &Path) -> anyhow::Result { + Ok(RemoteObjectId( + path.to_str() + .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))? + .to_string(), + )) +} + +pub struct LocalFs { + working_directory: PathBuf, + storage_root: PathBuf, +} + +impl LocalFs { + /// Attempts to create local FS storage, along with its root directory. + pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result { + if !root.exists() { + std::fs::create_dir_all(&root).with_context(|| { + format!( + "Failed to create all directories in the given root path '{}'", + root.display(), + ) + })?; + } + Ok(Self { + working_directory, + storage_root: root, + }) + } + + /// + /// Get the absolute path in the local filesystem to given remote object. + /// + /// This is public so that it can be used in tests. Should not be used elsewhere. 
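+    /// Relative object ids are joined onto the storage root; absolute paths are
+    /// accepted only if they already point inside the storage root.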
+ /// + pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let path = PathBuf::from(&remote_object_id.0); + if path.is_relative() { + Ok(self.storage_root.join(path)) + } else if path.starts_with(&self.storage_root) { + Ok(path) + } else { + bail!( + "Path '{}' does not belong to the current storage", + path.display() + ) + } + } + + async fn read_storage_metadata( + &self, + file_path: &Path, + ) -> anyhow::Result> { + let metadata_path = storage_metadata_path(file_path); + if metadata_path.exists() && metadata_path.is_file() { + let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| { + format!( + "Failed to read metadata from the local storage at '{}'", + metadata_path.display() + ) + })?; + + serde_json::from_str(&metadata_string) + .with_context(|| { + format!( + "Failed to deserialize metadata from the local storage at '{}'", + metadata_path.display() + ) + }) + .map(|metadata| Some(StorageMetadata(metadata))) + } else { + Ok(None) + } + } +} + +#[async_trait::async_trait] +impl RemoteStorage for LocalFs { + /// Convert a "local" path into a "remote path" + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let path = self.storage_root.join( + strip_path_prefix(&self.working_directory, local_path) + .context("local path does not belong to this storage")?, + ); + remote_object_id_from_path(&path) + } + + fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { + let storage_path = PathBuf::from(&remote_object_id.0); + let relative_path = strip_path_prefix(&self.storage_root, &storage_path) + .context("local path does not belong to this storage")?; + Ok(self.working_directory.join(relative_path)) + } + + async fn list(&self) -> anyhow::Result> { + get_all_files(&self.storage_root, true).await + } + + async fn list_prefixes( + &self, + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { + let path = match prefix { + Some(prefix) => Path::new(&prefix.0), + None => &self.storage_root, + }; + get_all_files(path, false).await + } + + async fn upload( + &self, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from_size_bytes: usize, + to: &RemoteObjectId, + metadata: Option, + ) -> anyhow::Result<()> { + let target_file_path = self.resolve_in_storage(to)?; + create_target_directory(&target_file_path).await?; + // We need this dance with sort of durable rename (without fsyncs) + // to prevent partial uploads. This was really hit when pageserver shutdown + // cancelled the upload and partial file was left on the fs + let temp_file_path = + path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX); + let mut destination = io::BufWriter::new( + fs::OpenOptions::new() + .write(true) + .create(true) + .open(&temp_file_path) + .await + .with_context(|| { + format!( + "Failed to open target fs destination at '{}'", + target_file_path.display() + ) + })?, + ); + + let from_size_bytes = from_size_bytes as u64; + let mut buffer_to_read = from.take(from_size_bytes); + + let bytes_read = io::copy(&mut buffer_to_read, &mut destination) + .await + .with_context(|| { + format!( + "Failed to upload file (write temp) to the local storage at '{}'", + temp_file_path.display() + ) + })?; + + if bytes_read < from_size_bytes { + bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes"); + } + // Check if there is any extra data after the given size. 
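+        // (take back the inner reader and probe a single byte; reading anything here
+        // means the stream is longer than `from_size_bytes`)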
+ let mut from = buffer_to_read.into_inner(); + let extra_read = from.read(&mut [1]).await?; + ensure!( + extra_read == 0, + "Provided stream was larger than expected: expected {from_size_bytes} bytes", + ); + + destination.flush().await.with_context(|| { + format!( + "Failed to upload (flush temp) file to the local storage at '{}'", + temp_file_path.display() + ) + })?; + + fs::rename(temp_file_path, &target_file_path) + .await + .with_context(|| { + format!( + "Failed to upload (rename) file to the local storage at '{}'", + target_file_path.display() + ) + })?; + + if let Some(storage_metadata) = metadata { + let storage_metadata_path = storage_metadata_path(&target_file_path); + fs::write( + &storage_metadata_path, + serde_json::to_string(&storage_metadata.0) + .context("Failed to serialize storage metadata as json")?, + ) + .await + .with_context(|| { + format!( + "Failed to write metadata to the local storage at '{}'", + storage_metadata_path.display() + ) + })?; + } + + Ok(()) + } + + async fn download(&self, from: &RemoteObjectId) -> Result { + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? { + let source = io::BufReader::new( + fs::OpenOptions::new() + .read(true) + .open(&file_path) + .await + .with_context(|| { + format!( + "Failed to open source file '{}' to use in the download", + file_path.display() + ) + }) + .map_err(DownloadError::Other)?, + ); + + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; + Ok(Download { + metadata, + download_stream: Box::pin(source), + }) + } else { + Err(DownloadError::NotFound) + } + } + + async fn download_byte_range( + &self, + from: &RemoteObjectId, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + if let Some(end_exclusive) = end_exclusive { + if end_exclusive <= start_inclusive { + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})"))); + }; + if start_inclusive == end_exclusive.saturating_sub(1) { + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); + } + } + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? { + let mut source = io::BufReader::new( + fs::OpenOptions::new() + .read(true) + .open(&file_path) + .await + .with_context(|| { + format!( + "Failed to open source file '{}' to use in the download", + file_path.display() + ) + }) + .map_err(DownloadError::Other)?, + ); + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; + + Ok(match end_exclusive { + Some(end_exclusive) => Download { + metadata, + download_stream: Box::pin(source.take(end_exclusive - start_inclusive)), + }, + None => Download { + metadata, + download_stream: Box::pin(source), + }, + }) + } else { + Err(DownloadError::NotFound) + } + } + + async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> { + let file_path = self.resolve_in_storage(path)?; + if file_path.exists() && file_path.is_file() { + Ok(fs::remove_file(file_path).await?) 
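+            // Note: a `.metadata` sidecar written during upload, if any, is not removed here.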
+ } else { + bail!( + "File '{}' either does not exist or is not a file", + file_path.display() + ) + } + } + + fn as_local(&self) -> Option<&LocalFs> { + Some(self) + } +} + +fn storage_metadata_path(original_path: &Path) -> PathBuf { + path_with_suffix_extension(original_path, "metadata") +} + +fn get_all_files<'a, P>( + directory_path: P, + recursive: bool, +) -> Pin>> + Send + Sync + 'a>> +where + P: AsRef + Send + Sync + 'a, +{ + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? { + let file_type = dir_entry.file_type().await?; + let entry_path = dir_entry.path(); + if file_type.is_symlink() { + debug!("{:?} us a symlink, skipping", entry_path) + } else if file_type.is_dir() { + if recursive { + paths.extend(get_all_files(&entry_path, true).await?.into_iter()) + } else { + paths.push(remote_object_id_from_path(&dir_entry.path())?) + } + } else { + paths.push(remote_object_id_from_path(&dir_entry.path())?); + } + } + Ok(paths) + } else { + bail!("Path '{}' is not a directory", directory_path.display()) + } + } else { + Ok(Vec::new()) + } + }) +} + +async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> { + let target_dir = match target_file_path.parent() { + Some(parent_dir) => parent_dir, + None => bail!( + "File path '{}' has no parent directory", + target_file_path.display() + ), + }; + if !target_dir.exists() { + fs::create_dir_all(target_dir).await?; + } + Ok(()) +} + +fn file_exists(file_path: &Path) -> anyhow::Result { + if file_path.exists() { + ensure!( + file_path.is_file(), + "file path '{}' is not a file", + file_path.display() + ); + Ok(true) + } else { + Ok(false) + } +} + +#[cfg(test)] +mod pure_tests { + use tempfile::tempdir; + + use super::*; + + #[test] + fn storage_path_positive() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage_root = PathBuf::from("somewhere").join("else"); + let storage = LocalFs { + working_directory: workdir.clone(), + storage_root: storage_root.clone(), + }; + + let local_path = workdir + .join("timelines") + .join("some_timeline") + .join("file_name"); + let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); + + let actual_path = PathBuf::from( + storage + .remote_object_id(&local_path) + .expect("Matching path should map to storage path normally") + .0, + ); + assert_eq!( + expected_path, + actual_path, + "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" + ); + + Ok(()) + } + + #[test] + fn storage_path_negatives() -> anyhow::Result<()> { + #[track_caller] + fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { + match storage.remote_object_id(mismatching_path) { + Ok(wrong_path) => panic!( + "Expected path '{}' to error, but got storage path: {:?}", + mismatching_path.display(), + wrong_path, + ), + Err(e) => format!("{:?}", e), + } + } + + let workdir = tempdir()?.path().to_owned(); + let storage_root = PathBuf::from("somewhere").join("else"); + let storage = LocalFs { + working_directory: workdir.clone(), + storage_root, + }; + + let error_string = storage_path_error(&storage, &workdir); + assert!(error_string.contains("does not belong to this storage")); + assert!(error_string.contains(workdir.to_str().unwrap())); + + let 
mismatching_path_str = "/something/else"; + let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); + assert!( + error_message.contains(mismatching_path_str), + "Error should mention wrong path" + ); + assert!( + error_message.contains(workdir.to_str().unwrap()), + "Error should mention server workdir" + ); + assert!(error_message.contains("does not belong to this storage")); + + Ok(()) + } + + #[test] + fn local_path_positive() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let storage_root = PathBuf::from("somewhere").join("else"); + let storage = LocalFs { + working_directory: workdir.clone(), + storage_root: storage_root.clone(), + }; + + let name = "not a metadata"; + let local_path = workdir.join("timelines").join("some_timeline").join(name); + assert_eq!( + local_path, + storage + .local_path(&remote_object_id_from_path( + &storage_root.join(local_path.strip_prefix(&workdir)?) + )?) + .expect("For a valid input, valid local path should be parsed"), + "Should be able to parse metadata out of the correctly named remote delta file" + ); + + let local_metadata_path = workdir + .join("timelines") + .join("some_timeline") + .join("metadata"); + let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?; + assert_eq!( + local_metadata_path, + storage + .local_path(&remote_metadata_path) + .expect("For a valid input, valid local path should be parsed"), + "Should be able to parse metadata out of the correctly named remote metadata file" + ); + + Ok(()) + } + + #[test] + fn local_path_negatives() -> anyhow::Result<()> { + #[track_caller] + fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String { + match storage.local_path(storage_path) { + Ok(wrong_path) => panic!( + "Expected local path input {:?} to cause an error, but got file path: {:?}", + storage_path, wrong_path, + ), + Err(e) => format!("{:?}", e), + } + } + + let storage_root = PathBuf::from("somewhere").join("else"); + let storage = LocalFs { + working_directory: tempdir()?.path().to_owned(), + storage_root, + }; + + let totally_wrong_path = "wrong_wrong_wrong"; + let error_message = + local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string())); + assert!(error_message.contains(totally_wrong_path)); + + Ok(()) + } + + #[test] + fn download_destination_matches_original_path() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); + + let storage_root = PathBuf::from("somewhere").join("else"); + let dummy_storage = LocalFs { + working_directory: workdir, + storage_root, + }; + + let storage_path = dummy_storage.remote_object_id(&original_path)?; + let download_destination = dummy_storage.local_path(&storage_path)?; + + assert_eq!( + original_path, download_destination, + "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path" + ); + + Ok(()) + } +} + +#[cfg(test)] +mod fs_tests { + use super::*; + + use std::{collections::HashMap, io::Write}; + use tempfile::tempdir; + + async fn read_and_assert_remote_file_contents( + storage: &LocalFs, + #[allow(clippy::ptr_arg)] + // have to use &PathBuf due to `storage.local_path` parameter requirements + remote_storage_path: &RemoteObjectId, + expected_metadata: Option<&StorageMetadata>, + ) -> anyhow::Result { + let mut download = storage + .download(remote_storage_path) + .await + .map_err(|e| 
anyhow::anyhow!("Download failed: {e}"))?; + ensure!( + download.metadata.as_ref() == expected_metadata, + "Unexpected metadata returned for the downloaded file" + ); + + let mut contents = String::new(); + download + .download_stream + .read_to_string(&mut contents) + .await + .context("Failed to read remote file contents into string")?; + Ok(contents) + } + + #[tokio::test] + async fn upload_file() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let storage = create_storage()?; + + let (file, size) = create_file_for_upload( + &storage.working_directory.join("whatever"), + "whatever_contents", + ) + .await?; + let target_path = "/somewhere/else"; + match storage + .upload( + Box::new(file), + size, + &RemoteObjectId(target_path.to_string()), + None, + ) + .await + { + Ok(()) => panic!("Should not allow storing files with wrong target path"), + Err(e) => { + let message = format!("{:?}", e); + assert!(message.contains(target_path)); + assert!(message.contains("does not belong to the current storage")); + } + } + assert!(storage.list().await?.is_empty()); + + let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?; + assert_eq!( + storage.list().await?, + vec![target_path_1.clone()], + "Should list a single file after first upload" + ); + + let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?; + assert_eq!( + list_files_sorted(&storage).await?, + vec![target_path_1.clone(), target_path_2.clone()], + "Should list a two different files after second upload" + ); + + Ok(()) + } + + #[tokio::test] + async fn upload_file_negatives() -> anyhow::Result<()> { + let storage = create_storage()?; + + let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?; + let content = std::io::Cursor::new(b"12345"); + + // Check that you get an error if the size parameter doesn't match the actual + // size of the stream. + storage + .upload(Box::new(content.clone()), 0, &id, None) + .await + .expect_err("upload with zero size succeeded"); + storage + .upload(Box::new(content.clone()), 4, &id, None) + .await + .expect_err("upload with too short size succeeded"); + storage + .upload(Box::new(content.clone()), 6, &id, None) + .await + .expect_err("upload with too large size succeeded"); + + // Correct size is 5, this should succeed. 
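+        // (the cursor wraps the 5-byte payload b"12345", so the declared size matches the stream exactly)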
+ storage.upload(Box::new(content), 5, &id, None).await?; + + Ok(()) + } + + fn create_storage() -> anyhow::Result { + LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) + } + + #[tokio::test] + async fn download_file() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage = create_storage()?; + let upload_name = "upload_1"; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + + let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + assert_eq!( + dummy_contents(upload_name), + contents, + "We should upload and download the same contents" + ); + + let non_existing_path = "somewhere/else"; + match storage.download(&RemoteObjectId(non_existing_path.to_string())).await { + Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys + other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), + } + Ok(()) + } + + #[tokio::test] + async fn download_file_range_positive() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage = create_storage()?; + let upload_name = "upload_1"; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + assert_eq!( + dummy_contents(upload_name), + full_range_download_contents, + "Download full range should return the whole upload" + ); + + let uploaded_bytes = dummy_contents(upload_name).into_bytes(); + let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); + + let mut first_part_download = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .await?; + assert!( + first_part_download.metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); + + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut first_part_download.download_stream, + &mut first_part_remote, + ) + .await?; + first_part_remote.flush().await?; + let first_part_remote = first_part_remote.into_inner().into_inner(); + assert_eq!( + first_part_local, + first_part_remote.as_slice(), + "First part bytes should be returned when requested" + ); + + let mut second_part_download = storage + .download_byte_range( + &upload_target, + first_part_local.len() as u64, + Some((first_part_local.len() + second_part_local.len()) as u64), + ) + .await?; + assert!( + second_part_download.metadata.is_none(), + "No metadata should be returned for no metadata upload" + ); + + let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut second_part_download.download_stream, + &mut second_part_remote, + ) + .await?; + second_part_remote.flush().await?; + let second_part_remote = second_part_remote.into_inner().into_inner(); + assert_eq!( + second_part_local, + second_part_remote.as_slice(), + "Second part bytes should be returned when requested" + ); + + Ok(()) + } + + #[tokio::test] + async fn download_file_range_negative() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage = create_storage()?; + let upload_name = "upload_1"; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + + let start = 1_000_000_000; + let end = start + 1; + match storage + .download_byte_range( + &upload_target, + start, + Some(end), // 
exclusive end + ) + .await + { + Ok(_) => panic!("Should not allow downloading wrong ranges"), + Err(e) => { + let error_string = e.to_string(); + assert!(error_string.contains("zero bytes")); + assert!(error_string.contains(&start.to_string())); + assert!(error_string.contains(&end.to_string())); + } + } + + let start = 10000; + let end = 234; + assert!(start > end, "Should test an incorrect range"); + match storage + .download_byte_range(&upload_target, start, Some(end)) + .await + { + Ok(_) => panic!("Should not allow downloading wrong ranges"), + Err(e) => { + let error_string = e.to_string(); + assert!(error_string.contains("Invalid range")); + assert!(error_string.contains(&start.to_string())); + assert!(error_string.contains(&end.to_string())); + } + } + + Ok(()) + } + + #[tokio::test] + async fn delete_file() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage = create_storage()?; + let upload_name = "upload_1"; + let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + + storage.delete(&upload_target).await?; + assert!(storage.list().await?.is_empty()); + + match storage.delete(&upload_target).await { + Ok(()) => panic!("Should not allow deleting non-existing storage files"), + Err(e) => { + let error_string = e.to_string(); + assert!(error_string.contains("does not exist")); + assert!(error_string.contains(&upload_target.0)); + } + } + Ok(()) + } + + #[tokio::test] + async fn file_with_metadata() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let storage = create_storage()?; + let upload_name = "upload_1"; + let metadata = StorageMetadata(HashMap::from([ + ("one".to_string(), "1".to_string()), + ("two".to_string(), "2".to_string()), + ])); + let upload_target = + upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; + + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; + assert_eq!( + dummy_contents(upload_name), + full_range_download_contents, + "We should upload and download the same contents" + ); + + let uploaded_bytes = dummy_contents(upload_name).into_bytes(); + let (first_part_local, _) = uploaded_bytes.split_at(3); + + let mut partial_download_with_metadata = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .await?; + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut partial_download_with_metadata.download_stream, + &mut first_part_remote, + ) + .await?; + first_part_remote.flush().await?; + let first_part_remote = first_part_remote.into_inner().into_inner(); + assert_eq!( + first_part_local, + first_part_remote.as_slice(), + "First part bytes should be returned when requested" + ); + + assert_eq!( + partial_download_with_metadata.metadata, + Some(metadata), + "We should get the same metadata back for partial download" + ); + + Ok(()) + } + + async fn upload_dummy_file( + workdir: &Path, + storage: &LocalFs, + name: &str, + metadata: Option, + ) -> anyhow::Result { + let timeline_path = workdir.join("timelines").join("some_timeline"); + let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; + let storage_path = storage.storage_root.join(relative_timeline_path).join(name); + let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string()); + + let from_path = storage.working_directory.join(name); + let (file, size) = create_file_for_upload(&from_path, 
&dummy_contents(name)).await?; + + storage + .upload(Box::new(file), size, &remote_object_id, metadata) + .await?; + remote_object_id_from_path(&storage_path) + } + + async fn create_file_for_upload( + path: &Path, + contents: &str, + ) -> anyhow::Result<(io::BufReader, usize)> { + std::fs::create_dir_all(path.parent().unwrap())?; + let mut file_for_writing = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(path)?; + write!(file_for_writing, "{}", contents)?; + drop(file_for_writing); + let file_size = path.metadata()?.len() as usize; + Ok(( + io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?), + file_size, + )) + } + + fn dummy_contents(name: &str) -> String { + format!("contents for {name}") + } + + async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { + let mut files = storage.list().await?; + files.sort_by(|a, b| a.0.cmp(&b.0)); + Ok(files) + } +} diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs new file mode 100644 index 0000000000..74632430cd --- /dev/null +++ b/libs/remote_storage/src/s3_bucket.rs @@ -0,0 +1,620 @@ +//! AWS S3 storage wrapper around `rusoto` library. +//! +//! Respects `prefix_in_bucket` property from [`S3Config`], +//! allowing multiple api users to independently work with the same S3 bucket, if +//! their bucket prefixes are both specified and different. + +use std::path::{Path, PathBuf}; + +use anyhow::Context; +use rusoto_core::{ + credential::{InstanceMetadataProvider, StaticProvider}, + HttpClient, Region, RusotoError, +}; +use rusoto_s3::{ + DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, + S3Client, StreamingBody, S3, +}; +use tokio::{io, sync::Semaphore}; +use tokio_util::io::ReaderStream; +use tracing::debug; + +use crate::{ + strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config, + REMOTE_STORAGE_PREFIX_SEPARATOR, +}; + +use super::StorageMetadata; + +pub(super) mod metrics { + use metrics::{register_int_counter_vec, IntCounterVec}; + use once_cell::sync::Lazy; + + static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + pub fn inc_get_object() { + S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); + } + + pub fn inc_get_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["get_object"]) + .inc(); + } + + pub fn inc_put_object() { + S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); + } + + pub fn inc_put_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["put_object"]) + .inc(); + } + + pub fn inc_delete_object() { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_delete_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_list_objects() { + S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); + } + + pub fn inc_list_objects_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["list_objects"]) + .inc(); + } +} + +fn download_destination( + id: &RemoteObjectId, + workdir: &Path, + prefix_to_strip: Option<&str>, +) -> 
PathBuf { + let path_without_prefix = match prefix_to_strip { + Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| { + panic!( + "Could not strip prefix '{}' from S3 object key '{}'", + prefix, id.0 + ) + }), + None => &id.0, + }; + + workdir.join( + path_without_prefix + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect::(), + ) +} + +/// AWS S3 storage. +pub struct S3Bucket { + workdir: PathBuf, + client: S3Client, + bucket_name: String, + prefix_in_bucket: Option, + // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. + // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. + // The helps to ensure we don't exceed the thresholds. + concurrency_limiter: Semaphore, +} + +impl S3Bucket { + /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. + pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result { + debug!( + "Creating s3 remote storage for S3 bucket {}", + aws_config.bucket_name + ); + let region = match aws_config.endpoint.clone() { + Some(custom_endpoint) => Region::Custom { + name: aws_config.bucket_region.clone(), + endpoint: custom_endpoint, + }, + None => aws_config + .bucket_region + .parse::() + .context("Failed to parse the s3 region from config")?, + }; + let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?; + + let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok(); + let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok(); + // session token is used when authorizing through sso + // which is typically the case when testing locally on developer machine + let session_token = std::env::var("AWS_SESSION_TOKEN").ok(); + + let client = if access_key_id.is_none() && secret_access_key.is_none() { + debug!("Using IAM-based AWS access"); + S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region) + } else { + debug!( + "Using credentials-based AWS access. Session token is set: {}", + session_token.is_some() + ); + S3Client::new_with( + request_dispatcher, + StaticProvider::new( + access_key_id.unwrap_or_default(), + secret_access_key.unwrap_or_default(), + session_token, + None, + ), + region, + ) + }; + + let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] 
+ } + + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); + + Ok(Self { + client, + workdir, + bucket_name: aws_config.bucket_name.clone(), + prefix_in_bucket, + concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + }) + } + + async fn download_object(&self, request: GetObjectRequest) -> Result { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 download") + .map_err(DownloadError::Other)?; + + metrics::inc_get_object(); + + match self.client.get_object(request).await { + Ok(object_output) => match object_output.body { + None => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Got no body for the S3 object given" + ))) + } + Some(body) => Ok(Download { + metadata: object_output.metadata.map(StorageMetadata), + download_stream: Box::pin(io::BufReader::new(body.into_async_read())), + }), + }, + Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound), + Err(e) => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Failed to download S3 object: {e}" + ))) + } + } + } +} + +#[async_trait::async_trait] +impl RemoteStorage for S3Bucket { + fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { + let relative_path = strip_path_prefix(&self.workdir, local_path)?; + let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); + for segment in relative_path { + key.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + key.push_str(&segment.to_string_lossy()); + } + Ok(RemoteObjectId(key)) + } + + fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result { + Ok(download_destination( + storage_path, + &self.workdir, + self.prefix_in_bucket.as_deref(), + )) + } + + async fn list(&self) -> anyhow::Result> { + let mut document_keys = Vec::new(); + + let mut continuation_token = None; + loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + + let fetch_response = self + .client + .list_objects_v2(ListObjectsV2Request { + bucket: self.bucket_name.clone(), + prefix: self.prefix_in_bucket.clone(), + continuation_token, + ..ListObjectsV2Request::default() + }) + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; + document_keys.extend( + fetch_response + .contents + .unwrap_or_default() + .into_iter() + .filter_map(|o| Some(RemoteObjectId(o.key?))), + ); + + match fetch_response.continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(document_keys) + } + + /// See the doc for `RemoteStorage::list_prefixes` + /// Note: it wont include empty "directories" + async fn list_prefixes( + &self, + prefix: Option<&RemoteObjectId>, + ) -> anyhow::Result> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| p.0.clone()) + .or_else(|| self.prefix_in_bucket.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); + + let mut document_keys = Vec::new(); + + let mut continuation_token = None; + loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore 
got closed during S3 list")?; + + metrics::inc_list_objects(); + + let fetch_response = self + .client + .list_objects_v2(ListObjectsV2Request { + bucket: self.bucket_name.clone(), + prefix: list_prefix.clone(), + continuation_token, + delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), + ..ListObjectsV2Request::default() + }) + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; + + document_keys.extend( + fetch_response + .common_prefixes + .unwrap_or_default() + .into_iter() + .filter_map(|o| Some(RemoteObjectId(o.prefix?))), + ); + + match fetch_response.continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(document_keys) + } + + async fn upload( + &self, + from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from_size_bytes: usize, + to: &RemoteObjectId, + metadata: Option, + ) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 upload")?; + + metrics::inc_put_object(); + self.client + .put_object(PutObjectRequest { + body: Some(StreamingBody::new_with_size( + ReaderStream::new(from), + from_size_bytes, + )), + bucket: self.bucket_name.clone(), + key: to.0.to_owned(), + metadata: metadata.map(|m| m.0), + ..PutObjectRequest::default() + }) + .await + .map_err(|e| { + metrics::inc_put_object_fail(); + e + })?; + Ok(()) + } + + async fn download(&self, from: &RemoteObjectId) -> Result { + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.0.to_owned(), + ..GetObjectRequest::default() + }) + .await + } + + async fn download_byte_range( + &self, + from: &RemoteObjectId, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + // and needs both ends to be exclusive + let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); + let range = Some(match end_inclusive { + Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), + None => format!("bytes={}-", start_inclusive), + }); + + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.0.to_owned(), + range, + ..GetObjectRequest::default() + }) + .await + } + + async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 delete")?; + + metrics::inc_delete_object(); + + self.client + .delete_object(DeleteObjectRequest { + bucket: self.bucket_name.clone(), + key: remote_object_id.0.to_owned(), + ..DeleteObjectRequest::default() + }) + .await + .map_err(|e| { + metrics::inc_delete_object_fail(); + e + })?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use tempfile::tempdir; + + use super::*; + + #[test] + fn test_download_destination() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let local_path = workdir.join("one").join("two").join("test_name"); + let relative_path = local_path.strip_prefix(&workdir)?; + + let key = RemoteObjectId(format!( + "{}{}", + REMOTE_STORAGE_PREFIX_SEPARATOR, + relative_path + .iter() + .map(|segment| segment.to_str().unwrap()) + .collect::>() + .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), + )); + + assert_eq!( + local_path, + download_destination(&key, &workdir, None), + "Download destination should consist of s3 path joined with the workdir 
prefix" + ); + + Ok(()) + } + + #[test] + fn storage_path_positive() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + + let segment_1 = "matching"; + let segment_2 = "file"; + let local_path = &workdir.join(segment_1).join(segment_2); + + let storage = dummy_storage(workdir); + + let expected_key = RemoteObjectId(format!( + "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}", + storage.prefix_in_bucket.as_deref().unwrap_or_default(), + )); + + let actual_key = storage + .remote_object_id(local_path) + .expect("Matching path should map to S3 path normally"); + assert_eq!( + expected_key, + actual_key, + "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator" + ); + + Ok(()) + } + + #[test] + fn storage_path_negatives() -> anyhow::Result<()> { + #[track_caller] + fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { + match storage.remote_object_id(mismatching_path) { + Ok(wrong_key) => panic!( + "Expected path '{}' to error, but got S3 key: {:?}", + mismatching_path.display(), + wrong_key, + ), + Err(e) => e.to_string(), + } + } + + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); + + let error_message = storage_path_error(&storage, &workdir); + assert!( + error_message.contains("Prefix and the path are equal"), + "Message '{}' does not contain the required string", + error_message + ); + + let mismatching_path = PathBuf::from("somewhere").join("else"); + let error_message = storage_path_error(&storage, &mismatching_path); + assert!( + error_message.contains(mismatching_path.to_str().unwrap()), + "Error should mention wrong path" + ); + assert!( + error_message.contains(workdir.to_str().unwrap()), + "Error should mention server workdir" + ); + assert!( + error_message.contains("is not prefixed with"), + "Message '{}' does not contain a required string", + error_message + ); + + Ok(()) + } + + #[test] + fn local_path_positive() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let storage = dummy_storage(workdir.clone()); + let timeline_dir = workdir.join("timelines").join("test_timeline"); + let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?; + + let s3_key = create_s3_key( + &relative_timeline_path.join("not a metadata"), + storage.prefix_in_bucket.as_deref(), + ); + assert_eq!( + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), + storage + .local_path(&s3_key) + .expect("For a valid input, valid S3 info should be parsed"), + "Should be able to parse metadata out of the correctly named remote delta file" + ); + + let s3_key = create_s3_key( + &relative_timeline_path.join("metadata"), + storage.prefix_in_bucket.as_deref(), + ); + assert_eq!( + download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), + storage + .local_path(&s3_key) + .expect("For a valid input, valid S3 info should be parsed"), + "Should be able to parse metadata out of the correctly named remote metadata file" + ); + + Ok(()) + } + + #[test] + fn download_destination_matches_original_path() -> anyhow::Result<()> { + let workdir = tempdir()?.path().to_owned(); + let original_path = workdir + .join("timelines") + .join("some_timeline") + .join("some name"); + + let dummy_storage = dummy_storage(workdir); + + let key = dummy_storage.remote_object_id(&original_path)?; + let download_destination = dummy_storage.local_path(&key)?; + + assert_eq!( + 
original_path, download_destination, + "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path" + ); + + Ok(()) + } + + fn dummy_storage(workdir: PathBuf) -> S3Bucket { + S3Bucket { + workdir, + client: S3Client::new("us-east-1".parse().unwrap()), + bucket_name: "dummy-bucket".to_string(), + prefix_in_bucket: Some("dummy_prefix/".to_string()), + concurrency_limiter: Semaphore::new(1), + } + } + + fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId { + RemoteObjectId(relative_file_path.iter().fold( + prefix.unwrap_or_default().to_string(), + |mut path_string, segment| { + path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + path_string.push_str(segment.to_str().unwrap()); + path_string + }, + )) + } +} diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml new file mode 100644 index 0000000000..15bdecd71d --- /dev/null +++ b/libs/safekeeper_api/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "safekeeper_api" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde = { version = "1.0", features = ["derive"] } +serde_with = "2.0" +const_format = "0.2.21" + +utils = { path = "../utils" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs new file mode 100644 index 0000000000..0a391478da --- /dev/null +++ b/libs/safekeeper_api/src/lib.rs @@ -0,0 +1,10 @@ +use const_format::formatcp; + +/// Public API types +pub mod models; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); + +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs new file mode 100644 index 0000000000..85c6439367 --- /dev/null +++ b/libs/safekeeper_api/src/models.rs @@ -0,0 +1,24 @@ +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; + +use utils::{ + id::{NodeId, TenantId, TimelineId}, + lsn::Lsn, +}; + +#[serde_as] +#[derive(Serialize, Deserialize)] +pub struct TimelineCreateRequest { + #[serde_as(as = "DisplayFromStr")] + pub tenant_id: TenantId, + #[serde_as(as = "DisplayFromStr")] + pub timeline_id: TimelineId, + pub peer_ids: Option>, + pub pg_version: u32, + pub system_id: Option, + pub wal_seg_size: Option, + #[serde_as(as = "DisplayFromStr")] + pub commit_lsn: Lsn, + // If not passed, it is assigned to the beginning of commit_lsn segment. 
+ pub local_start_lsn: Option, +} diff --git a/libs/tenant_size_model/.gitignore b/libs/tenant_size_model/.gitignore new file mode 100644 index 0000000000..15a65bec1e --- /dev/null +++ b/libs/tenant_size_model/.gitignore @@ -0,0 +1,3 @@ +*.dot +*.png +*.svg diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml new file mode 100644 index 0000000000..1aabf5a4f9 --- /dev/null +++ b/libs/tenant_size_model/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "tenant_size_model" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tenant_size_model/Makefile b/libs/tenant_size_model/Makefile new file mode 100644 index 0000000000..1cffe81c10 --- /dev/null +++ b/libs/tenant_size_model/Makefile @@ -0,0 +1,13 @@ +all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png + +../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs + cargo build --bin tenant_size_model + +%.svg: %.dot + dot -Tsvg $< > $@ + +%.png: %.dot + dot -Tpng $< > $@ + +%.dot: ../../target/debug/tenant_size_model + ../../target/debug/tenant_size_model $* > $@ diff --git a/libs/tenant_size_model/README.md b/libs/tenant_size_model/README.md new file mode 100644 index 0000000000..b850130d67 --- /dev/null +++ b/libs/tenant_size_model/README.md @@ -0,0 +1,7 @@ +# Logical size + WAL pricing + +This is a simulator to calculate the tenant size in different scenarios, +using the "Logical size + WAL" method. Makefile produces diagrams used in a +private presentation: + +https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs new file mode 100644 index 0000000000..c7ec1e8870 --- /dev/null +++ b/libs/tenant_size_model/src/lib.rs @@ -0,0 +1,349 @@ +use std::borrow::Cow; +use std::collections::HashMap; + +/// Pricing model or history size builder. +/// +/// Maintains knowledge of the branches and their modifications. Generic over the branch name key +/// type. +pub struct Storage { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing it's latest state. + branches: HashMap, +} + +/// Snapshot of a branch. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Segment { + /// Previous segment index into ['Storage::segments`], if any. + parent: Option, + + /// Description of how did we get to this state. + /// + /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when + /// modifying a branch directly. + pub op: Cow<'static, str>, + + /// LSN before this state + start_lsn: u64, + + /// LSN at this state + pub end_lsn: u64, + + /// Logical size before this state + start_size: u64, + + /// Logical size at this state + pub end_size: u64, + + /// Indices to [`Storage::segments`] + /// + /// FIXME: this could be an Option + children_after: Vec, + + /// Determined by `retention_period` given to [`Storage::calculate`] + pub needed: bool, +} + +// +// +// +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had it's size calculated. 
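+/// Sizes are summed bottom-up over the tree; `total_children` counts this segment's
+/// own size only when it is retained as a snapshot (`SnapshotAfter`).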
+pub struct SegmentSize { + pub seg_id: usize, + + pub method: SegmentMethod, + + this_size: u64, + + pub children: Vec, +} + +impl SegmentSize { + fn total(&self) -> u64 { + self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) + } + + pub fn total_children(&self) -> u64 { + if self.method == SnapshotAfter { + self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) + } else { + self.children.iter().fold(0, |acc, x| acc + x.total()) + } + } +} + +/// Different methods to retain history from a particular state +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SegmentMethod { + SnapshotAfter, + Wal, + WalNeeded, + Skipped, +} + +use SegmentMethod::*; + +impl Storage { + /// Creates a new storage with the given default branch name. + pub fn new(initial_branch: K) -> Storage { + let init_segment = Segment { + op: "".into(), + needed: false, + parent: None, + start_lsn: 0, + end_lsn: 0, + start_size: 0, + end_size: 0, + children_after: Vec::new(), + }; + + Storage { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch, 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. + pub fn modify_branch( + &mut self, + branch: &Q, + op: Cow<'static, str>, + lsn_bytes: u64, + size_bytes: i64, + ) where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + op, + parent: Some(lastseg_id), + start_lsn: lastseg.end_lsn, + end_lsn: lastseg.end_lsn + lsn_bytes, + start_size: lastseg.end_size, + end_size: (lastseg.end_size as i64 + size_bytes) as u64, + children_after: Vec::new(), + needed: false, + }; + lastseg.children_after.push(newseg_id); + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "insert".into(), bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "update".into(), bytes, 0i64); + } + + pub fn delete(&mut self, branch: &Q, bytes: u64) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. 
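+    /// The new branch simply points at the parent's current last segment; no new
+    /// segment is created until the branch is modified.
+    ///
+    /// A minimal usage sketch (branch names are illustrative):
+    ///
+    /// ```ignore
+    /// let mut storage = Storage::new("main");
+    /// storage.insert("main", 1_000);
+    /// storage.branch("main", "child");
+    /// storage.update("child", 500);
+    /// let size = storage.calculate(1_000);
+    /// ```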
+ pub fn branch(&mut self, parent: &Q, name: K) + where + K: std::borrow::Borrow, + Q: std::hash::Hash + Eq, + { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name, branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> SegmentSize { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.end_lsn < cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Phase 2: For each oldest segment in a chain that needs to be retained, + // calculate if we should store snapshot or WAL + self.size_from_snapshot_later(0) + } + + fn size_from_wal(&self, seg_id: usize) -> SegmentSize { + let seg = &self.segments[seg_id]; + + let this_size = seg.end_lsn - seg.start_lsn; + + let mut children = Vec::new(); + + // try both ways + for &child_id in seg.children_after.iter() { + // try each child both ways + let child = &self.segments[child_id]; + let p1 = self.size_from_wal(child_id); + + let p = if !child.needed { + let p2 = self.size_from_snapshot_later(child_id); + if p1.total() < p2.total() { + p1 + } else { + p2 + } + } else { + p1 + }; + children.push(p); + } + SegmentSize { + seg_id, + method: if seg.needed { WalNeeded } else { Wal }, + this_size, + children, + } + } + + fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize { + // If this is needed, then it's time to do the snapshot and continue + // with wal method. 
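+        // If it is not needed yet, compare skipping this segment (snapshotting further
+        // down) against snapshotting right here, and keep whichever totals fewer bytes.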
+ let seg = &self.segments[seg_id]; + //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); + if seg.needed { + let mut children = Vec::new(); + + for &child_id in seg.children_after.iter() { + // try each child both ways + let child = &self.segments[child_id]; + let p1 = self.size_from_wal(child_id); + + let p = if !child.needed { + let p2 = self.size_from_snapshot_later(child_id); + if p1.total() < p2.total() { + p1 + } else { + p2 + } + } else { + p1 + }; + children.push(p); + } + SegmentSize { + seg_id, + method: WalNeeded, + this_size: seg.start_size, + children, + } + } else { + // If any of the direct children are "needed", need to be able to reconstruct here + let mut children_needed = false; + for &child in seg.children_after.iter() { + let seg = &self.segments[child]; + if seg.needed { + children_needed = true; + break; + } + } + + let method1 = if !children_needed { + let mut children = Vec::new(); + for child in seg.children_after.iter() { + children.push(self.size_from_snapshot_later(*child)); + } + Some(SegmentSize { + seg_id, + method: Skipped, + this_size: 0, + children, + }) + } else { + None + }; + + // If this a junction, consider snapshotting here + let method2 = if children_needed || seg.children_after.len() >= 2 { + let mut children = Vec::new(); + for child in seg.children_after.iter() { + children.push(self.size_from_wal(*child)); + } + Some(SegmentSize { + seg_id, + method: SnapshotAfter, + this_size: seg.end_size, + children, + }) + } else { + None + }; + + match (method1, method2) { + (None, None) => panic!(), + (Some(method), None) => method, + (None, Some(method)) => method, + (Some(method1), Some(method2)) => { + if method1.total() < method2.total() { + method1 + } else { + method2 + } + } + } + } + } + + pub fn into_segments(self) -> Vec { + self.segments + } +} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs new file mode 100644 index 0000000000..47c0e8122f --- /dev/null +++ b/libs/tenant_size_model/src/main.rs @@ -0,0 +1,268 @@ +//! Tenant size model testing ground. +//! +//! Has a number of scenarios and a `main` for invoking these by number, calculating the history +//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios +//! into pngs. + +use tenant_size_model::{Segment, SegmentSize, Storage}; + +// Main branch only. Some updates on it. +fn scenario_1() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Main branch only. Some updates on it. 
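+// (Unlike scenario 1, this also branches off a "child" with one update, then updates "main" once more.)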
+fn scenario_2() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + storage.update("main", 1_000); + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Like 2, but more updates on main +fn scenario_3() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +// Diverged branches +fn scenario_4() -> (Vec, SegmentSize) { + // Create main branch + let mut storage = Storage::new("main"); + + // Bulk load 5 GB of data to it + storage.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + storage.update("main", 1_000); + } + + // Branch + storage.branch("main", "child"); + storage.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + storage.update("main", 1_000); + } + + let size = storage.calculate(1000); + + (storage.into_segments(), size) +} + +fn scenario_5() -> (Vec, SegmentSize) { + let mut storage = Storage::new("a"); + storage.insert("a", 5000); + storage.branch("a", "b"); + storage.update("b", 4000); + storage.update("a", 2000); + storage.branch("a", "c"); + storage.insert("c", 4000); + storage.insert("a", 2000); + + let size = storage.calculate(5000); + + (storage.into_segments(), size) +} + +fn scenario_6() -> (Vec, SegmentSize) { + use std::borrow::Cow; + + const NO_OP: Cow<'static, str> = Cow::Borrowed(""); + + let branches = [ + Some(0x7ff1edab8182025f15ae33482edb590a_u128), + Some(0xb1719e044db05401a05a2ed588a3ad3f), + Some(0xb68d6691c895ad0a70809470020929ef), + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut storage = Storage::new(None); + + storage.branch(&None, branches[0]); // at 0 + storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064 + storage.branch(&branches[0], branches[1]); // at 108951064 + storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472 + storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424 + storage.branch(&branches[0], branches[2]); // at 283415424 + storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616 + storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400 + + let size = storage.calculate(100_000); + + (storage.into_segments(), size) +} + +fn main() { + let args: Vec = std::env::args().collect(); + + let scenario = if args.len() < 2 { "1" } else { &args[1] }; + + let (segments, size) = match scenario { + "1" => scenario_1(), + "2" => scenario_2(), + "3" => scenario_3(), + "4" => scenario_4(), + "5" => scenario_5(), + "6" => scenario_6(), + other => { + eprintln!("invalid scenario {}", other); + std::process::exit(1); + } + }; + + graphviz_tree(&segments, &size); +} + +fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { + use tenant_size_model::SegmentMethod::*; + + let seg_id = node.seg_id; + let seg 
= segments.get(seg_id).unwrap(); + let lsn = seg.end_lsn; + let size = seg.end_size; + let method = node.method; + + println!(" {{"); + println!(" node [width=0.1 height=0.1 shape=oval]"); + + let tenant_size = node.total_children(); + + let penwidth = if seg.needed { 6 } else { 3 }; + let x = match method { + SnapshotAfter => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), + Wal => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), + WalNeeded => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), + Skipped => + format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), + }; + + println!(" \"seg{seg_id}\" [{x}]"); + println!(" }}"); + + // Recurse. Much of the data is actually on the edge + for child in node.children.iter() { + let child_id = child.seg_id; + graphviz_recurse(segments, child); + + let edge_color = match child.method { + SnapshotAfter => "gray", + Wal => "black", + WalNeeded => "black", + Skipped => "gray", + }; + + println!(" {{"); + println!(" edge [] "); + print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); + print!("color={edge_color}"); + if child.method == WalNeeded { + print!(" penwidth=6"); + } + if child.method == Wal { + print!(" penwidth=3"); + } + + let next = segments.get(child_id).unwrap(); + + if next.op.is_empty() { + print!( + " label=\"{} / {}\"", + next.end_lsn - seg.end_lsn, + (next.end_size as i128 - seg.end_size as i128) + ); + } else { + print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); + } + println!("]"); + println!(" }}"); + } +} + +fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { + println!("digraph G {{"); + println!(" fontname=\"Helvetica,Arial,sans-serif\""); + println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); + println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); + println!(" graph [center=1 rankdir=LR]"); + println!(" edge [dir=none]"); + + graphviz_recurse(segments, tree); + + println!("}}"); +} + +#[test] +fn scenarios_return_same_size() { + type ScenarioFn = fn() -> (Vec, SegmentSize); + let truths: &[(u32, ScenarioFn, _)] = &[ + (line!(), scenario_1, 8000), + (line!(), scenario_2, 9000), + (line!(), scenario_3, 13000), + (line!(), scenario_4, 16000), + (line!(), scenario_5, 17000), + (line!(), scenario_6, 333_792_000), + ]; + + for (line, scenario, expected) in truths { + let (_, size) = scenario(); + assert_eq!(*expected, size.total_children(), "scenario on line {line}"); + } +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml new file mode 100644 index 0000000000..36a379b47a --- /dev/null +++ b/libs/utils/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "utils" +version = "0.1.0" +edition = "2021" + +[dependencies] +async-trait = "0.1" +anyhow = "1.0" +bincode = "1.3" +bytes = "1.0.1" +hyper = { version = "0.14.7", features = ["full"] } +routerify = "3" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +thiserror = "1.0" +tokio = { version = "1.17", features = ["macros"]} +tokio-rustls = "0.23" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +nix = "0.25" +signal-hook = "0.3.10" +rand = "0.8.3" +jsonwebtoken = "8" +hex = { version = "0.4.3", features = ["serde"] } +rustls = "0.20.2" +rustls-split = "0.3.0" +git-version = "0.3.5" +serde_with = "2.0" +once_cell = "1.13.0" +strum = 
"0.24" +strum_macros = "0.24" + +metrics = { path = "../metrics" } +pq_proto = { path = "../pq_proto" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[dev-dependencies] +byteorder = "1.4.3" +bytes = "1.0.1" +hex-literal = "0.3" +tempfile = "3.2" +criterion = "0.4" +rustls-pemfile = "1" + +[[bench]] +name = "benchmarks" +harness = false diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs new file mode 100644 index 0000000000..98d839ca55 --- /dev/null +++ b/libs/utils/benches/benchmarks.rs @@ -0,0 +1,22 @@ +#![allow(unused)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use utils::id; + +pub fn bench_id_stringify(c: &mut Criterion) { + // Can only use public methods. + let ttid = id::TenantTimelineId::generate(); + + c.bench_function("id.to_string", |b| { + b.iter(|| { + // FIXME measurement overhead? + //for _ in 0..1000 { + // ttid.tenant_id.to_string(); + //} + ttid.tenant_id.to_string(); + }) + }); +} + +criterion_group!(benches, bench_id_stringify); +criterion_main!(benches); diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh new file mode 100755 index 0000000000..9bd860affb --- /dev/null +++ b/libs/utils/scripts/restore_from_wal.sh @@ -0,0 +1,21 @@ +#!/bin/bash +PG_BIN=$1 +WAL_PATH=$2 +DATA_DIR=$3 +PORT=$4 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr $DATA_DIR +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 +$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start +$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate +cp $DATA_DIR/pg_wal/000000010000000000000001 . 
+cp $WAL_PATH/* $DATA_DIR/pg_wal/ +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +rm -f 000000010000000000000001 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh new file mode 100755 index 0000000000..ce58b349fc --- /dev/null +++ b/libs/utils/scripts/restore_from_wal_archive.sh @@ -0,0 +1,20 @@ +PG_BIN=$1 +WAL_PATH=$2 +DATA_DIR=$3 +PORT=$4 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr $DATA_DIR /tmp/pg_wals +mkdir /tmp/pg_wals +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 +cp $WAL_PATH/* /tmp/pg_wals +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +echo > $DATA_DIR/recovery.signal +rm -f $DATA_DIR/pg_wal/* +echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf diff --git a/zenith_utils/src/accum.rs b/libs/utils/src/accum.rs similarity index 96% rename from zenith_utils/src/accum.rs rename to libs/utils/src/accum.rs index d3ad61e514..0fb0190a92 100644 --- a/zenith_utils/src/accum.rs +++ b/libs/utils/src/accum.rs @@ -5,7 +5,7 @@ /// For example, to calculate the smallest value among some integers: /// /// ``` -/// use zenith_utils::accum::Accum; +/// use utils::accum::Accum; /// /// let values = [1, 2, 3]; /// diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs new file mode 100644 index 0000000000..b190b0d1c5 --- /dev/null +++ b/libs/utils/src/auth.rs @@ -0,0 +1,98 @@ +// For details about authentication see docs/authentication.md +// +// TODO: use ed25519 keys +// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 + +use serde; +use std::fs; +use std::path::Path; + +use anyhow::{bail, Result}; +use jsonwebtoken::{ + decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, +}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; + +use crate::id::TenantId; + +const JWT_ALGORITHM: Algorithm = Algorithm::RS256; + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum Scope { + Tenant, + PageServerApi, +} + +#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct Claims { + #[serde(default)] + #[serde_as(as = "Option")] + pub tenant_id: Option, + pub scope: Scope, +} + +impl Claims { + pub fn new(tenant_id: Option, scope: Scope) -> Self { + Self { tenant_id, scope } + } +} + +pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result<()> { + match (&claims.scope, tenant_id) { + (Scope::Tenant, None) => { + bail!("Attempt to access management api with tenant scope. Permission denied") + } + (Scope::Tenant, Some(tenant_id)) => { + if claims.tenant_id.unwrap() != tenant_id { + bail!("Tenant id mismatch. 
Permission denied") + } + Ok(()) + } + (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope + (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope + } +} + +pub struct JwtAuth { + decoding_key: DecodingKey, + validation: Validation, +} + +impl JwtAuth { + pub fn new(decoding_key: DecodingKey) -> Self { + let mut validation = Validation::new(JWT_ALGORITHM); + // The default 'required_spec_claims' is 'exp'. But we don't want to require + // expiration. + validation.required_spec_claims = [].into(); + Self { + decoding_key, + validation, + } + } + + pub fn from_key_path(key_path: &Path) -> Result { + let public_key = fs::read(key_path)?; + Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?)) + } + + pub fn decode(&self, token: &str) -> Result> { + Ok(decode(token, &self.decoding_key, &self.validation)?) + } +} + +impl std::fmt::Debug for JwtAuth { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("JwtAuth") + .field("validation", &self.validation) + .finish() + } +} + +// this function is used only for testing purposes in CLI e g generate tokens during init +pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { + let key = EncodingKey::from_rsa_pem(key_data)?; + Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?) +} diff --git a/zenith_utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs similarity index 98% rename from zenith_utils/src/bin_ser.rs rename to libs/utils/src/bin_ser.rs index 063d69557d..42b45eeea0 100644 --- a/zenith_utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -71,7 +71,7 @@ impl From for SerializeError { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn be_coder() -> impl Options { bincode::DefaultOptions::new() .with_big_endian() @@ -85,7 +85,7 @@ pub fn be_coder() -> impl Options { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. 
pub fn le_coder() -> impl Options { bincode::DefaultOptions::new() .with_little_endian() @@ -265,7 +265,7 @@ mod tests { use serde::{Deserialize, Serialize}; use std::io::Cursor; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct ShortStruct { a: u8, b: u32, @@ -286,7 +286,7 @@ mod tests { const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7]; const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff]; - #[derive(Debug, PartialEq, Serialize, Deserialize)] + #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct LongMsg { pub tag: u8, pub blockpos: u32, diff --git a/zenith_utils/src/crashsafe_dir.rs b/libs/utils/src/crashsafe.rs similarity index 56% rename from zenith_utils/src/crashsafe_dir.rs rename to libs/utils/src/crashsafe.rs index a7eab73a43..3726779cb2 100644 --- a/zenith_utils/src/crashsafe_dir.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,9 @@ use std::{ + borrow::Cow, + ffi::OsStr, fs::{self, File}, io, - path::Path, + path::{Path, PathBuf}, }; /// Similar to [`std::fs::create_dir`], except we fsync the @@ -10,16 +12,8 @@ pub fn create_dir(path: impl AsRef) -> io::Result<()> { let path = path.as_ref(); fs::create_dir(path)?; - File::open(path)?.sync_all()?; - - if let Some(parent) = path.parent() { - File::open(parent)?.sync_all() - } else { - Err(io::Error::new( - io::ErrorKind::InvalidInput, - "can't find parent", - )) - } + fsync_file_and_parent(path)?; + Ok(()) } /// Similar to [`std::fs::create_dir_all`], except we fsync all @@ -63,17 +57,60 @@ pub fn create_dir_all(path: impl AsRef) -> io::Result<()> { // Fsync the created directories from child to parent. for &path in dirs_to_create.iter() { - File::open(path)?.sync_all()?; + fsync(path)?; } // If we created any new directories, fsync the parent. if !dirs_to_create.is_empty() { - File::open(path)?.sync_all()?; + fsync(path)?; } Ok(()) } +/// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension, +/// or if there's no extension, creates one and puts a suffix there. 
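// For example (matching the unit tests further below):
//   path_with_suffix_extension("/foo/bar", "temp")     -> "/foo/bar.temp"
//   path_with_suffix_extension("/foo/bar.baz", "temp") -> "/foo/bar.baz.temp"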
+pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) -> PathBuf { + let new_extension = match original_path + .as_ref() + .extension() + .map(OsStr::to_string_lossy) + { + Some(extension) => Cow::Owned(format!("{extension}.{suffix}")), + None => Cow::Borrowed(suffix), + }; + original_path + .as_ref() + .with_extension(new_extension.as_ref()) +} + +pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> { + let parent = file_path.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("File {file_path:?} has no parent"), + ) + })?; + + fsync(file_path)?; + fsync(parent)?; + Ok(()) +} + +pub fn fsync(path: &Path) -> io::Result<()> { + File::open(path) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}"))) + .and_then(|file| { + file.sync_all().map_err(|e| { + io::Error::new( + e.kind(), + format!("Failed to sync file {path:?} data and metadata: {e}"), + ) + }) + }) + .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}"))) +} + #[cfg(test)] mod tests { use tempfile::tempdir; @@ -122,4 +159,33 @@ mod tests { let invalid_dir_path = file_path.join("folder"); create_dir_all(&invalid_dir_path).unwrap_err(); } + + #[test] + fn test_path_with_suffix_extension() { + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp").to_string_lossy(), + "/foo/bar.temp" + ); + let p = PathBuf::from("/foo/bar"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + "/foo/bar.baz.temp.temp" + ); + let p = PathBuf::from("/foo/bar.baz"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar.baz..temp" + ); + let p = PathBuf::from("/foo/bar/dir/"); + assert_eq!( + &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + "/foo/bar/dir..temp" + ); + } } diff --git a/zenith_utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs similarity index 87% rename from zenith_utils/src/http/endpoint.rs rename to libs/utils/src/http/endpoint.rs index 0be08f45e1..7a519929cf 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,29 +1,29 @@ use crate::auth::{self, Claims, JwtAuth}; use crate::http::error; -use crate::zid::ZTenantId; +use crate::id::TenantId; use anyhow::anyhow; use hyper::header::AUTHORIZATION; use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server}; -use lazy_static::lazy_static; +use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; +use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::RequestInfo; use routerify::{Middleware, Router, RouterBuilder, RouterService}; +use tokio::task::JoinError; use tracing::info; -use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; -use zenith_metrics::{Encoder, TextEncoder}; use std::future::Future; use std::net::TcpListener; use super::error::ApiError; -lazy_static! 
{ - static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!( - new_common_metric_name("serve_metrics_count"), +static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { + register_int_counter!( + "libmetrics_metric_handler_requests_total", "Number of metric requests made" ) - .expect("failed to define a metric"); -} + .expect("failed to define a metric") +}); async fn logger(res: Response, info: RequestInfo) -> Result, ApiError> { info!("{} {} {}", info.method(), info.uri().path(), res.status(),); @@ -36,7 +36,13 @@ async fn prometheus_metrics_handler(_req: Request) -> Result( }) } -pub fn check_permission(req: &Request, tenantid: Option) -> Result<(), ApiError> { +pub fn check_permission(req: &Request, tenant_id: Option) -> Result<(), ApiError> { match req.context::() { - Some(claims) => Ok(auth::check_permission(&claims, tenantid) + Some(claims) => Ok(auth::check_permission(&claims, tenant_id) .map_err(|err| ApiError::Forbidden(err.to_string()))?), None => Ok(()), // claims is None because auth is disabled } @@ -160,7 +166,7 @@ pub fn serve_thread_main( where S: Future + Send + Sync, { - info!("Starting a http endpoint at {}", listener.local_addr()?); + info!("Starting an HTTP endpoint at {}", listener.local_addr()?); // Create a Service from the router above to handle incoming requests. let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap(); diff --git a/zenith_utils/src/http/error.rs b/libs/utils/src/http/error.rs similarity index 73% rename from zenith_utils/src/http/error.rs rename to libs/utils/src/http/error.rs index 3262c33a51..b0ecb746d9 100644 --- a/zenith_utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,12 +1,11 @@ -use anyhow::anyhow; use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; #[derive(Debug, Error)] pub enum ApiError { - #[error("Bad request: {0}")] - BadRequest(String), + #[error("Bad request: {0:#?}")] + BadRequest(anyhow::Error), #[error("Forbidden: {0}")] Forbidden(String), @@ -14,19 +13,21 @@ pub enum ApiError { #[error("Unauthorized: {0}")] Unauthorized(String), + #[error("NotFound: {0}")] + NotFound(anyhow::Error), + + #[error("Conflict: {0}")] + Conflict(String), + #[error(transparent)] - InternalServerError(#[from] anyhow::Error), + InternalServerError(anyhow::Error), } impl ApiError { - pub fn from_err>(err: E) -> Self { - Self::InternalServerError(anyhow!(err)) - } - pub fn into_response(self) -> Response { match self { - ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status( - self.to_string(), + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause StatusCode::BAD_REQUEST, ), ApiError::Forbidden(_) => { @@ -36,6 +37,12 @@ impl ApiError { self.to_string(), StatusCode::UNAUTHORIZED, ), + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT) + } ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/zenith_utils/src/http/json.rs b/libs/utils/src/http/json.rs similarity index 57% rename from zenith_utils/src/http/json.rs rename to libs/utils/src/http/json.rs index f57e81649c..8981fdd1dd 100644 --- a/zenith_utils/src/http/json.rs +++ b/libs/utils/src/http/json.rs @@ -1,3 
+1,4 @@ +use anyhow::Context; use bytes::Buf; use hyper::{header, Body, Request, Response, StatusCode}; use serde::{Deserialize, Serialize}; @@ -9,20 +10,24 @@ pub async fn json_request Deserialize<'de>>( ) -> Result { let whole_body = hyper::body::aggregate(request.body_mut()) .await - .map_err(ApiError::from_err)?; - Ok(serde_json::from_reader(whole_body.reader()) - .map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))?) + .context("Failed to read request body") + .map_err(ApiError::BadRequest)?; + serde_json::from_reader(whole_body.reader()) + .context("Failed to parse json request") + .map_err(ApiError::BadRequest) } pub fn json_response( status: StatusCode, data: T, ) -> Result, ApiError> { - let json = serde_json::to_string(&data).map_err(ApiError::from_err)?; + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(header::CONTENT_TYPE, "application/json") .body(Body::from(json)) - .map_err(ApiError::from_err)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/zenith_utils/src/http/mod.rs b/libs/utils/src/http/mod.rs similarity index 75% rename from zenith_utils/src/http/mod.rs rename to libs/utils/src/http/mod.rs index 0bb53ef51d..74ed6bb5b2 100644 --- a/zenith_utils/src/http/mod.rs +++ b/libs/utils/src/http/mod.rs @@ -3,6 +3,6 @@ pub mod error; pub mod json; pub mod request; -/// Current fast way to apply simple http routing in various Zenith binaries. +/// Current fast way to apply simple http routing in various Neon binaries. /// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed. pub use routerify::{ext::RequestExt, RouterBuilder, RouterService}; diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs new file mode 100644 index 0000000000..7b96ccd584 --- /dev/null +++ b/libs/utils/src/http/request.rs @@ -0,0 +1,37 @@ +use std::str::FromStr; + +use super::error::ApiError; +use anyhow::anyhow; +use hyper::{body::HttpBody, Body, Request}; +use routerify::ext::RequestExt; + +pub fn get_request_param<'a>( + request: &'a Request, + param_name: &str, +) -> Result<&'a str, ApiError> { + match request.param(param_name) { + Some(arg) => Ok(arg), + None => Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in path param", + ))), + } +} + +pub fn parse_request_param( + request: &Request, + param_name: &str, +) -> Result { + match get_request_param(request, param_name)?.parse() { + Ok(v) => Ok(v), + Err(_) => Err(ApiError::BadRequest(anyhow!( + "failed to parse {param_name}", + ))), + } +} + +pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { + match request.body_mut().data().await { + Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), + None => Ok(()), + } +} diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs new file mode 100644 index 0000000000..7ce324614d --- /dev/null +++ b/libs/utils/src/id.rs @@ -0,0 +1,258 @@ +use std::{fmt, str::FromStr}; + +use hex::FromHex; +use rand::Rng; +use serde::{Deserialize, Serialize}; + +/// Neon ID is a 128-bit random ID. +/// Used to represent various identifiers. Provides handy utility methods and impls. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. 
+/// +/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. +/// Check the `serde_with::serde_as` documentation for options for more complex types. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] +struct Id([u8; 16]); + +impl Id { + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> Id { + let mut arr = [0u8; 16]; + buf.copy_to_slice(&mut arr); + Id::from(arr) + } + + pub fn as_arr(&self) -> [u8; 16] { + self.0 + } + + pub fn generate() -> Self { + let mut tli_buf = [0u8; 16]; + rand::thread_rng().fill(&mut tli_buf); + Id::from(tli_buf) + } + + fn hex_encode(&self) -> String { + static HEX: &[u8] = b"0123456789abcdef"; + + let mut buf = vec![0u8; self.0.len() * 2]; + for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { + chunk[0] = HEX[((b >> 4) & 0xf) as usize]; + chunk[1] = HEX[(b & 0xf) as usize]; + } + unsafe { String::from_utf8_unchecked(buf) } + } +} + +impl FromStr for Id { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + Self::from_hex(s) + } +} + +// this is needed for pretty serialization and deserialization of Id's using serde integration with hex crate +impl FromHex for Id { + type Error = hex::FromHexError; + + fn from_hex>(hex: T) -> Result { + let mut buf: [u8; 16] = [0u8; 16]; + hex::decode_to_slice(hex, &mut buf)?; + Ok(Id(buf)) + } +} + +impl AsRef<[u8]> for Id { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl From<[u8; 16]> for Id { + fn from(b: [u8; 16]) -> Self { + Id(b) + } +} + +impl From for u128 { + fn from(id: Id) -> Self { + u128::from_le_bytes(id.0) + } +} + +impl fmt::Display for Id { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.hex_encode()) + } +} + +impl fmt::Debug for Id { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.hex_encode()) + } +} + +macro_rules! id_newtype { + ($t:ident) => { + impl $t { + pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { + $t(Id::get_from_buf(buf)) + } + + pub fn as_arr(&self) -> [u8; 16] { + self.0.as_arr() + } + + pub fn generate() -> Self { + $t(Id::generate()) + } + + pub const fn from_array(b: [u8; 16]) -> Self { + $t(Id(b)) + } + } + + impl FromStr for $t { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result<$t, Self::Err> { + let value = Id::from_str(s)?; + Ok($t(value)) + } + } + + impl From<[u8; 16]> for $t { + fn from(b: [u8; 16]) -> Self { + $t(Id::from(b)) + } + } + + impl FromHex for $t { + type Error = hex::FromHexError; + + fn from_hex>(hex: T) -> Result { + Ok($t(Id::from_hex(hex)?)) + } + } + + impl AsRef<[u8]> for $t { + fn as_ref(&self) -> &[u8] { + &self.0 .0 + } + } + + impl From<$t> for u128 { + fn from(id: $t) -> Self { + u128::from(id.0) + } + } + + impl fmt::Display for $t { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } + } + + impl fmt::Debug for $t { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } + } + }; +} + +/// Neon timeline IDs are different from PostgreSQL timeline +/// IDs. They serve a similar purpose though: they differentiate +/// between different "histories" of the same cluster. However, +/// PostgreSQL timeline IDs are a bit cumbersome, because they are only +/// 32-bits wide, and they must be in ascending order in any given +/// timeline history. Those limitations mean that we cannot generate a +/// new PostgreSQL timeline ID by just generating a random number. 
And +/// that in turn is problematic for the "pull/push" workflow, where you +/// have a local copy of a Neon repository, and you periodically sync +/// the local changes with a remote server. When you work "detached" +/// from the remote server, you cannot create a PostgreSQL timeline ID +/// that's guaranteed to be different from all existing timelines in +/// the remote server. For example, if two people are having a clone of +/// the repository on their laptops, and they both create a new branch +/// with different name. What timeline ID would they assign to their +/// branches? If they pick the same one, and later try to push the +/// branches to the same remote server, they will get mixed up. +/// +/// To avoid those issues, Neon has its own concept of timelines that +/// is separate from PostgreSQL timelines, and doesn't have those +/// limitations. A Neon timeline is identified by a 128-bit ID, which +/// is usually printed out as a hex string. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// See [`Id`] for alternative ways to serialize it. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct TimelineId(Id); + +id_newtype!(TimelineId); + +/// Neon Tenant Id represents identifiar of a particular tenant. +/// Is used for distinguishing requests and data belonging to different users. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// See [`Id`] for alternative ways to serialize it. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] +pub struct TenantId(Id); + +id_newtype!(TenantId); + +/// Neon Connection Id identifies long-lived connections (for example a pagestream +/// connection with the page_service). Is used for better logging and tracing +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// See [`Id`] for alternative ways to serialize it. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] +pub struct ConnectionId(Id); + +id_newtype!(ConnectionId); + +// A pair uniquely identifying Neon instance. +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct TenantTimelineId { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, +} + +impl TenantTimelineId { + pub fn new(tenant_id: TenantId, timeline_id: TimelineId) -> Self { + TenantTimelineId { + tenant_id, + timeline_id, + } + } + + pub fn generate() -> Self { + Self::new(TenantId::generate(), TimelineId::generate()) + } + + pub fn empty() -> Self { + Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16])) + } +} + +impl fmt::Display for TenantTimelineId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}/{}", self.tenant_id, self.timeline_id) + } +} + +// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued +// by the console. 
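// Illustrative sketch (not part of this patch) of the hex-string serialization mentioned in
// the doc comments above, assuming `use serde_with::{serde_as, DisplayFromStr};`; the struct
// name is invented for the example:
//
//     #[serde_as]
//     #[derive(serde::Serialize, serde::Deserialize)]
//     struct TimelinePointer {
//         #[serde_as(as = "DisplayFromStr")]
//         tenant_id: TenantId,      // serialized as "ad50847381e248feaac9876cc71ae418"
//         #[serde_as(as = "DisplayFromStr")]
//         timeline_id: TimelineId,  // instead of an array of bytes
//     }
//
// Display/FromStr round-trips also work directly, e.g. `TenantId::from_str(&id.to_string())`.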
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] +#[serde(transparent)] +pub struct NodeId(pub u64); + +impl fmt::Display for NodeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs new file mode 100644 index 0000000000..11ee7ac7eb --- /dev/null +++ b/libs/utils/src/lib.rs @@ -0,0 +1,99 @@ +//! `utils` is intended to be a place to put code that is shared +//! between other crates in this repository. + +/// `Lsn` type implements common tasks on Log Sequence Numbers +pub mod lsn; +/// SeqWait allows waiting for a future sequence number to arrive +pub mod seqwait; + +/// A simple Read-Copy-Update implementation. +pub mod simple_rcu; + +/// append only ordered map implemented with a Vec +pub mod vec_map; + +pub mod bin_ser; +pub mod postgres_backend; +pub mod postgres_backend_async; + +// helper functions for creating and fsyncing +pub mod crashsafe; + +// common authentication routines +pub mod auth; + +// utility functions and helper traits for unified unique id generation/serialization etc. +pub mod id; +// http endpoint utils +pub mod http; + +// socket splitting utils +pub mod sock_split; + +// common log initialisation routine +pub mod logging; + +pub mod lock_file; + +// Misc +pub mod accum; +pub mod shutdown; + +// Utility for binding TcpListeners with proper socket options. +pub mod tcp_listener; + +// Utility for putting a raw file descriptor into non-blocking mode +pub mod nonblock; + +// Default signal handling +pub mod signals; + +/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages +/// +/// we have several cases: +/// * building locally from git repo +/// * building in CI from git repo +/// * building in docker (either in CI or locally) +/// +/// One thing to note is that .git is not available in docker (and it is bad to include it there). +/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. +/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. +/// Git version received from environment variable used as a fallback in git_version invocation. +/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. +/// So the build script will be run only when GIT_VERSION envvar has changed. +/// +/// Why not to use buildscript to get git commit sha directly without procmacro from different crate? +/// Caching and workspaces complicates that. In case `utils` is not +/// recompiled due to caching then version may become outdated. +/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro, +/// so if we changed the index state git_version will pick that up and rerun the macro. +/// +/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. +/// +/// ############################################################################################# +/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details. 
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036 +/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains +/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation. +/// The problem needs further investigation and regular `const` declaration instead of a macro. +#[macro_export] +macro_rules! project_git_version { + ($const_identifier:ident) => { + const $const_identifier: &str = git_version::git_version!( + prefix = "git:", + fallback = concat!( + "git-env:", + env!("GIT_VERSION", "Missing GIT_VERSION envvar") + ), + args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha + ); + }; +} + +/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime. +#[macro_export] +macro_rules! const_assert { + ($($args:tt)*) => { + const _: () = assert!($($args)*); + }; +} diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs new file mode 100644 index 0000000000..4fef65852b --- /dev/null +++ b/libs/utils/src/lock_file.rs @@ -0,0 +1,81 @@ +//! A module to create and read lock files. A lock file ensures that only one +//! process is running at a time, in a particular directory. +//! +//! File locking is done using [`fcntl::flock`], which means that holding the +//! lock on file only prevents acquiring another lock on it; all other +//! operations are still possible on files. Other process can still open, read, +//! write, or remove the file, for example. +//! If the file is removed while a process is holding a lock on it, +//! the process that holds the lock does not get any error or notification. +//! Furthermore, you can create a new file with the same name and lock the new file, +//! while the old process is still running. +//! Deleting the lock file while the locking process is still running is a bad idea! + +use std::{fs, os::unix::prelude::AsRawFd, path::Path}; + +use anyhow::Context; +use nix::fcntl; + +use crate::crashsafe; + +pub enum LockCreationResult { + Created { + new_lock_contents: String, + file: fs::File, + }, + AlreadyLocked { + existing_lock_contents: String, + }, + CreationFailed(anyhow::Error), +} + +/// Creates a lock file in the path given and writes the given contents into the file. +/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program. 
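// A usage sketch (illustrative only, not part of this patch; the path and contents are made
// up, and the obvious `use` items are assumed):
//
//     match create_lock_file(Path::new("/data/pageserver.pid"), std::process::id().to_string()) {
//         LockCreationResult::Created { file, .. } => {
//             // Keep the file (and therefore the flock) alive for the rest of the process.
//             Box::leak(Box::new(file));
//         }
//         LockCreationResult::AlreadyLocked { existing_lock_contents } => {
//             panic!("already running, lock held by: {existing_lock_contents}")
//         }
//         LockCreationResult::CreationFailed(e) => panic!("could not create lock file: {e}"),
//     }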
+pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult { + let lock_file = match fs::OpenOptions::new() + .create(true) // O_CREAT + .write(true) + .open(lock_file_path) + .context("Failed to open lock file") + { + Ok(file) => file, + Err(e) => return LockCreationResult::CreationFailed(e), + }; + + match fcntl::flock( + lock_file.as_raw_fd(), + fcntl::FlockArg::LockExclusiveNonblock, + ) { + Ok(()) => { + match lock_file + .set_len(0) + .context("Failed to truncate lockfile") + .and_then(|()| { + fs::write(lock_file_path, &contents).with_context(|| { + format!("Failed to write '{contents}' contents into lockfile") + }) + }) + .and_then(|()| { + crashsafe::fsync_file_and_parent(lock_file_path) + .context("Failed to fsync lockfile") + }) { + Ok(()) => LockCreationResult::Created { + new_lock_contents: contents, + file: lock_file, + }, + Err(e) => LockCreationResult::CreationFailed(e), + } + } + Err(nix::errno::Errno::EAGAIN) => { + match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") { + Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked { + existing_lock_contents, + }, + Err(e) => LockCreationResult::CreationFailed(e), + } + } + Err(e) => { + LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}")) + } + } +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs new file mode 100644 index 0000000000..3b1a1f5aff --- /dev/null +++ b/libs/utils/src/logging.rs @@ -0,0 +1,45 @@ +use std::str::FromStr; + +use anyhow::Context; +use strum_macros::{EnumString, EnumVariantNames}; + +#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[strum(serialize_all = "snake_case")] +pub enum LogFormat { + Plain, + Json, +} + +impl LogFormat { + pub fn from_config(s: &str) -> anyhow::Result { + use strum::VariantNames; + LogFormat::from_str(s).with_context(|| { + format!( + "Unrecognized log format. Please specify one of: {:?}", + LogFormat::VARIANTS + ) + }) + } +} + +pub fn init(log_format: LogFormat) -> anyhow::Result<()> { + let default_filter_str = "info"; + + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. 
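    // For example, `RUST_LOG=info,pageserver=debug` (standard tracing-subscriber EnvFilter
    // syntax; the crate name is only an illustration) keeps the default at info while
    // enabling debug output for a single crate.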
+ let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); + + let base_logger = tracing_subscriber::fmt() + .with_env_filter(env_filter) + .with_target(false) + .with_ansi(false) + .with_writer(std::io::stdout); + + match log_format { + LogFormat::Json => base_logger.json().init(), + LogFormat::Plain => base_logger.init(), + } + + Ok(()) +} diff --git a/zenith_utils/src/lsn.rs b/libs/utils/src/lsn.rs similarity index 94% rename from zenith_utils/src/lsn.rs rename to libs/utils/src/lsn.rs index c09d8c67ce..39fed8670d 100644 --- a/zenith_utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -13,12 +13,12 @@ use crate::seqwait::MonotonicCounter; pub const XLOG_BLCKSZ: u32 = 8192; /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)] +#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)] #[serde(transparent)] pub struct Lsn(pub u64); /// We tried to parse an LSN from a string, but failed -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("LsnParseError")] pub struct LsnParseError; @@ -26,6 +26,9 @@ impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); + /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h + pub const INVALID: Lsn = Lsn(0); + /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); @@ -63,6 +66,11 @@ impl Lsn { (self.0 % seg_sz as u64) as usize } + /// Compute LSN of the segment start. + pub fn segment_lsn(self, seg_sz: usize) -> Lsn { + Lsn(self.0 - (self.0 % seg_sz as u64)) + } + /// Compute the segment number pub fn segment_number(self, seg_sz: usize) -> u64 { self.0 / seg_sz as u64 @@ -103,6 +111,12 @@ impl Lsn { pub fn is_aligned(&self) -> bool { *self == self.align() } + + /// Return if the LSN is valid + /// mimics postgres XLogRecPtrIsInvalid macro + pub fn is_valid(self) -> bool { + self != Lsn::INVALID + } } impl From for Lsn { diff --git a/zenith_utils/src/nonblock.rs b/libs/utils/src/nonblock.rs similarity index 100% rename from zenith_utils/src/nonblock.rs rename to libs/utils/src/nonblock.rs diff --git a/zenith_utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs similarity index 89% rename from zenith_utils/src/postgres_backend.rs rename to libs/utils/src/postgres_backend.rs index 83792f2aca..89f7197718 100644 --- a/zenith_utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -3,23 +3,20 @@ //! implementation determining how to process the queries. Currently its API //! is rather narrow, but we can extend it once required. -use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use crate::sock_split::{BidiStream, ReadStream, WriteStream}; use anyhow::{bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; +use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; use rand::Rng; use serde::{Deserialize, Serialize}; use std::fmt; use std::io::{self, Write}; use std::net::{Shutdown, SocketAddr, TcpStream}; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; use tracing::*; -static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false); - pub trait Handler { /// Handle single query. 
/// postgres_backend will issue ReadyForQuery after calling this (this @@ -45,11 +42,15 @@ pub trait Handler { fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { bail!("JWT auth failed") } + + fn is_shutdown_requested(&self) -> bool { + false + } } /// PostgresBackend protocol state. /// XXX: The order of the constructors matters. -#[derive(Clone, Copy, PartialEq, PartialOrd)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] pub enum ProtoState { Initialization, Encrypted, @@ -62,7 +63,7 @@ pub enum AuthType { Trust, MD5, // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT - ZenithJWT, + NeonJWT, } impl FromStr for AuthType { @@ -72,8 +73,8 @@ impl FromStr for AuthType { match s { "Trust" => Ok(Self::Trust), "MD5" => Ok(Self::MD5), - "ZenithJWT" => Ok(Self::ZenithJWT), - _ => bail!("invalid value \"{}\" for auth type", s), + "NeonJWT" => Ok(Self::NeonJWT), + _ => bail!("invalid value \"{s}\" for auth type"), } } } @@ -83,7 +84,7 @@ impl fmt::Display for AuthType { f.write_str(match self { AuthType::Trust => "Trust", AuthType::MD5 => "MD5", - AuthType::ZenithJWT => "ZenithJWT", + AuthType::NeonJWT => "NeonJWT", }) } } @@ -162,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool { false } -// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations) -// PG protocol strings are always C strings. -fn cstr_to_str(b: &Bytes) -> Result<&str> { - let without_null = if b.last() == Some(&0) { - &b[..b.len() - 1] - } else { - &b[..] - }; +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); std::str::from_utf8(without_null).map_err(|e| e.into()) } @@ -274,7 +270,7 @@ impl PostgresBackend { let mut unnamed_query_string = Bytes::new(); - while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) { + while !handler.is_shutdown_requested() { match self.read_message() { Ok(message) => { if let Some(msg) = message { @@ -304,8 +300,8 @@ impl PostgresBackend { pub fn start_tls(&mut self) -> anyhow::Result<()> { match self.stream.take() { Some(Stream::Bidirectional(bidi_stream)) => { - let session = rustls::ServerSession::new(&self.tls_config.clone().unwrap()); - self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(session)?)); + let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?; + self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?)); Ok(()) } stream => { @@ -336,11 +332,11 @@ impl PostgresBackend { let have_tls = self.tls_config.is_some(); match msg { FeMessage::StartupPacket(m) => { - trace!("got startup message {:?}", m); + trace!("got startup message {m:?}"); match m { FeStartupPacket::SslRequest => { - info!("SSL requested"); + debug!("SSL requested"); self.write_message(&BeMessage::EncryptionResponse(have_tls))?; if have_tls { @@ -349,7 +345,7 @@ impl PostgresBackend { } } FeStartupPacket::GssEncRequest => { - info!("GSS requested"); + debug!("GSS requested"); self.write_message(&BeMessage::EncryptionResponse(false))?; } FeStartupPacket::StartupMessage { .. 
} => { @@ -375,13 +371,12 @@ impl PostgresBackend { } AuthType::MD5 => { rand::thread_rng().fill(&mut self.md5_salt); - let md5_salt = self.md5_salt; self.write_message(&BeMessage::AuthenticationMD5Password( - &md5_salt, + self.md5_salt, ))?; self.state = ProtoState::Authentication; } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { self.write_message(&BeMessage::AuthenticationCleartextPassword)?; self.state = ProtoState::Authentication; } @@ -408,7 +403,7 @@ impl PostgresBackend { bail!("auth failed: {}", e); } } - AuthType::ZenithJWT => { + AuthType::NeonJWT => { let (_, jwt_response) = m.split_last().context("protocol violation")?; if let Err(e) = handler.check_auth_jwt(self, jwt_response) { @@ -423,9 +418,9 @@ impl PostgresBackend { self.state = ProtoState::Established; } - FeMessage::Query(m) => { + FeMessage::Query(body) => { // remove null terminator - let query_string = cstr_to_str(&m.body)?; + let query_string = cstr_to_str(&body)?; trace!("got query {:?}", query_string); // xxx distinguish fatal and recoverable errors? @@ -434,8 +429,22 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - error!("query handler for '{}' failed: {:?}", query_string, e); - self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; + // + // We also don't want to log full stacktrace when the error is primitive, + // such as usual connection closed. + let short_error = format!("{:#}", e); + let root_cause = e.root_cause().to_string(); + if root_cause.contains("connection closed unexpectedly") + || root_cause.contains("Broken pipe (os error 32)") + { + error!( + "query handler for '{}' failed: {}", + query_string, short_error + ); + } else { + error!("query handler for '{}' failed: {:?}", query_string, e); + } + self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { return Ok(ProcessMsgResult::Break); @@ -471,7 +480,7 @@ impl PostgresBackend { self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; } // NOTE there is no ReadyForQuery message. This handler is used - // for basebackup and it uses CopyOut which doesnt require + // for basebackup and it uses CopyOut which doesn't require // ReadyForQuery message and backend just switches back to // processing mode after sending CopyDone or ErrorResponse. } @@ -494,8 +503,3 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } } - -// Set the flag to inform connections to cancel -pub fn set_pgbackend_shutdown_requested() { - PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); -} diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs new file mode 100644 index 0000000000..376819027b --- /dev/null +++ b/libs/utils/src/postgres_backend_async.rs @@ -0,0 +1,485 @@ +//! Server-side asynchronous Postgres connection, as limited as we need. +//! To use, create PostgresBackend and run() it, passing the Handler +//! implementation determining how to process the queries. Currently its API +//! is rather narrow, but we can extend it once required. 
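// Illustrative only, not part of this patch: a minimal `Handler` implementation for the
// async backend in this module. The struct and its behaviour are invented for the example
// and rely on the `Handler`/`PostgresBackend` items and imports defined below.

struct EchoHandler;

#[async_trait::async_trait]
impl Handler for EchoHandler {
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
        query_string: &str,
    ) -> anyhow::Result<()> {
        // The message loop sends ReadyForQuery and flushes after this returns, so a
        // handler only needs to queue its own response messages here.
        tracing::info!("received query: {query_string}");
        pgb.write_message(&BeMessage::ErrorResponse("queries are not supported here"))?;
        Ok(())
    }
}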
+ +use crate::postgres_backend::AuthType; +use anyhow::{bail, Context, Result}; +use bytes::{Bytes, BytesMut}; +use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use rand::Rng; +use std::future::Future; +use std::net::SocketAddr; +use std::pin::Pin; +use std::sync::Arc; +use std::task::Poll; +use tracing::{debug, error, trace}; + +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; +use tokio_rustls::TlsAcceptor; + +#[async_trait::async_trait] +pub trait Handler { + /// Handle single query. + /// postgres_backend will issue ReadyForQuery after calling this (this + /// might be not what we want after CopyData streaming, but currently we don't + /// care). + async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>; + + /// Called on startup packet receival, allows to process params. + /// + /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users + /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow + /// to override whole init logic in implementations. + fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> { + Ok(()) + } + + /// Check auth md5 + fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> { + bail!("MD5 auth failed") + } + + /// Check auth jwt + fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { + bail!("JWT auth failed") + } +} + +/// PostgresBackend protocol state. +/// XXX: The order of the constructors matters. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub enum ProtoState { + Initialization, + Encrypted, + Authentication, + Established, + Closed, +} + +#[derive(Clone, Copy)] +pub enum ProcessMsgResult { + Continue, + Break, +} + +/// Always-writeable sock_split stream. +/// May not be readable. See [`PostgresBackend::take_stream_in`] +pub enum Stream { + Unencrypted(BufReader), + Tls(Box>>), + Broken, +} + +impl AsyncWrite for Stream { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf), + Self::Broken => unreachable!(), + } + } + fn poll_flush( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx), + Self::Tls(stream) => Pin::new(stream).poll_flush(cx), + Self::Broken => unreachable!(), + } + } + fn poll_shutdown( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx), + Self::Broken => unreachable!(), + } + } +} +impl AsyncRead for Stream { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() { + Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf), + Self::Broken => unreachable!(), + } + } +} + +pub struct PostgresBackend { + stream: Stream, + // Output buffer. c.f. BeMessage::write why we are using BytesMut here. 
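    // (Assumed rationale, inferred from the methods below: write_message() only appends to
    // this buffer, and flush() writes the accumulated bytes to the socket and clears it.)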
+ buf_out: BytesMut, + + pub state: ProtoState, + + md5_salt: [u8; 4], + auth_type: AuthType, + + peer_addr: SocketAddr, + pub tls_config: Option>, +} + +pub fn query_from_cstring(query_string: Bytes) -> Vec { + let mut query_string = query_string.to_vec(); + if let Some(ch) = query_string.last() { + if *ch == 0 { + query_string.pop(); + } + } + query_string +} + +// Cast a byte slice to a string slice, dropping null terminator if there's one. +fn cstr_to_str(bytes: &[u8]) -> Result<&str> { + let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes); + std::str::from_utf8(without_null).map_err(|e| e.into()) +} + +impl PostgresBackend { + pub fn new( + socket: tokio::net::TcpStream, + auth_type: AuthType, + tls_config: Option>, + ) -> std::io::Result { + let peer_addr = socket.peer_addr()?; + + Ok(Self { + stream: Stream::Unencrypted(BufReader::new(socket)), + buf_out: BytesMut::with_capacity(10 * 1024), + state: ProtoState::Initialization, + md5_salt: [0u8; 4], + auth_type, + tls_config, + peer_addr, + }) + } + + pub fn get_peer_addr(&self) -> &SocketAddr { + &self.peer_addr + } + + /// Read full message or return None if connection is closed. + pub async fn read_message(&mut self) -> Result> { + use ProtoState::*; + match self.state { + Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await, + Authentication | Established => FeMessage::read_fut(&mut self.stream).await, + Closed => Ok(None), + } + } + + /// Flush output buffer into the socket. + pub async fn flush(&mut self) -> std::io::Result<&mut Self> { + self.stream.write_all(&self.buf_out).await?; + self.buf_out.clear(); + Ok(self) + } + + /// Write message into internal output buffer. + pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> { + BeMessage::write(&mut self.buf_out, message)?; + Ok(self) + } + + // Wrapper for run_message_loop() that shuts down socket when we are done + pub async fn run(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + let ret = self.run_message_loop(handler, shutdown_watcher).await; + let _ = self.stream.shutdown(); + ret + } + + async fn run_message_loop( + &mut self, + handler: &mut impl Handler, + shutdown_watcher: F, + ) -> Result<()> + where + F: Fn() -> S, + S: Future, + { + trace!("postgres backend to {:?} started", self.peer_addr); + + tokio::select!( + biased; + + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received during handshake"); + return Ok(()) + }, + + result = async { + while self.state < ProtoState::Established { + if let Some(msg) = self.read_message().await? { + trace!("got message {msg:?} during handshake"); + + match self.process_handshake_message(handler, msg).await? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + } else { + trace!("postgres backend to {:?} exited during handshake", self.peer_addr); + return Ok(()); + } + } + Ok::<(), anyhow::Error>(()) + } => { + // Handshake complete. + result?; + } + ); + + // Authentication completed + let mut query_string = Bytes::new(); + while let Some(msg) = tokio::select!( + biased; + _ = shutdown_watcher() => { + // We were requested to shut down. + tracing::info!("shutdown request received in run_message_loop"); + Ok(None) + }, + msg = self.read_message() => { msg }, + )? 
{ + trace!("got message {:?}", msg); + + let result = self.process_message(handler, msg, &mut query_string).await; + self.flush().await?; + match result? { + ProcessMsgResult::Continue => { + self.flush().await?; + continue; + } + ProcessMsgResult::Break => break, + } + } + + trace!("postgres backend to {:?} exited", self.peer_addr); + Ok(()) + } + + async fn start_tls(&mut self) -> anyhow::Result<()> { + if let Stream::Unencrypted(plain_stream) = + std::mem::replace(&mut self.stream, Stream::Broken) + { + let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap()); + let tls_stream = acceptor.accept(plain_stream).await?; + + self.stream = Stream::Tls(Box::new(tls_stream)); + return Ok(()); + }; + bail!("TLS already started"); + } + + async fn process_handshake_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + ) -> Result { + assert!(self.state < ProtoState::Established); + let have_tls = self.tls_config.is_some(); + match msg { + FeMessage::StartupPacket(m) => { + trace!("got startup message {m:?}"); + + match m { + FeStartupPacket::SslRequest => { + debug!("SSL requested"); + + self.write_message(&BeMessage::EncryptionResponse(have_tls))?; + if have_tls { + self.start_tls().await?; + self.state = ProtoState::Encrypted; + } + } + FeStartupPacket::GssEncRequest => { + debug!("GSS requested"); + self.write_message(&BeMessage::EncryptionResponse(false))?; + } + FeStartupPacket::StartupMessage { .. } => { + if have_tls && !matches!(self.state, ProtoState::Encrypted) { + self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?; + bail!("client did not connect with TLS"); + } + + // NB: startup() may change self.auth_type -- we are using that in proxy code + // to bypass auth for new users. + handler.startup(self, &m)?; + + match self.auth_type { + AuthType::Trust => { + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? + // The async python driver requires a valid server_version + .write_message(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion("14.1"), + ))? + .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + AuthType::MD5 => { + rand::thread_rng().fill(&mut self.md5_salt); + self.write_message(&BeMessage::AuthenticationMD5Password( + self.md5_salt, + ))?; + self.state = ProtoState::Authentication; + } + AuthType::NeonJWT => { + self.write_message(&BeMessage::AuthenticationCleartextPassword)?; + self.state = ProtoState::Authentication; + } + } + } + FeStartupPacket::CancelRequest { .. } => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + } + + FeMessage::PasswordMessage(m) => { + trace!("got password message '{:?}'", m); + + assert!(self.state == ProtoState::Authentication); + + match self.auth_type { + AuthType::Trust => unreachable!(), + AuthType::MD5 => { + let (_, md5_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_md5(self, md5_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + AuthType::NeonJWT => { + let (_, jwt_response) = m.split_last().context("protocol violation")?; + + if let Err(e) = handler.check_auth_jwt(self, jwt_response) { + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + bail!("auth failed: {}", e); + } + } + } + self.write_message(&BeMessage::AuthenticationOk)? + .write_message(&BeParameterStatusMessage::encoding())? 
+ .write_message(&BeMessage::ReadyForQuery)?; + self.state = ProtoState::Established; + } + + _ => { + self.state = ProtoState::Closed; + return Ok(ProcessMsgResult::Break); + } + } + Ok(ProcessMsgResult::Continue) + } + + async fn process_message( + &mut self, + handler: &mut impl Handler, + msg: FeMessage, + unnamed_query_string: &mut Bytes, + ) -> Result { + // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth + // TODO: change that to proper top-level match of protocol state with separate message handling for each state + assert!(self.state == ProtoState::Established); + + match msg { + FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => { + bail!("protocol violation"); + } + + FeMessage::Query(body) => { + // remove null terminator + let query_string = cstr_to_str(&body)?; + + trace!("got query {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + // ":?" uses the alternate formatting style, which makes anyhow display the + // full cause of the error, not just the top-level context + its trace. + // We don't want to send that in the ErrorResponse though, + // because it's not relevant to the compute node logs. + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + // TODO: untangle convoluted control flow + if e.to_string().contains("failed to run") { + return Ok(ProcessMsgResult::Break); + } + } + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Parse(m) => { + *unnamed_query_string = m.query_string; + self.write_message(&BeMessage::ParseComplete)?; + } + + FeMessage::Describe(_) => { + self.write_message(&BeMessage::ParameterDescription)? + .write_message(&BeMessage::NoData)?; + } + + FeMessage::Bind(_) => { + self.write_message(&BeMessage::BindComplete)?; + } + + FeMessage::Close(_) => { + self.write_message(&BeMessage::CloseComplete)?; + } + + FeMessage::Execute(_) => { + let query_string = cstr_to_str(unnamed_query_string)?; + trace!("got execute {:?}", query_string); + // xxx distinguish fatal and recoverable errors? + if let Err(e) = handler.process_query(self, query_string).await { + error!("query handler for '{}' failed: {:?}", query_string, e); + self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; + } + // NOTE there is no ReadyForQuery message. This handler is used + // for basebackup and it uses CopyOut which doesn't require + // ReadyForQuery message and backend just switches back to + // processing mode after sending CopyDone or ErrorResponse. 
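// --- Editorial sketch, not part of the patch -------------------------------------
// For orientation: the message loop above hands queries to a caller-supplied
// `Handler` and, for the simple-query path, sends ReadyForQuery itself once
// process_query returns. A minimal handler could look roughly like the following;
// the exact trait definition (use of async_trait, provided defaults for startup and
// the check_auth_* hooks) is assumed here rather than quoted from this patch.
struct EchoHandler;

#[async_trait::async_trait]
impl Handler for EchoHandler {
    async fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
        query_string: &str,
    ) -> anyhow::Result<()> {
        // Log the query and reply with an error; process_message() above then
        // emits ReadyForQuery so the client can send the next statement.
        tracing::info!("received query: {query_string}");
        pgb.write_message(&BeMessage::ErrorResponse("not implemented"))?;
        Ok(())
    }
}
// ----------------------------------------------------------------------------------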
+ } + + FeMessage::Sync => { + self.write_message(&BeMessage::ReadyForQuery)?; + } + + FeMessage::Terminate => { + return Ok(ProcessMsgResult::Break); + } + + // We prefer explicit pattern matching to wildcards, because + // this helps us spot the places where new variants are missing + FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => { + bail!("unexpected message type: {:?}", msg); + } + } + + Ok(ProcessMsgResult::Continue) + } +} diff --git a/zenith_utils/src/seqwait.rs b/libs/utils/src/seqwait.rs similarity index 82% rename from zenith_utils/src/seqwait.rs rename to libs/utils/src/seqwait.rs index bc32f51b13..bf330a482c 100644 --- a/zenith_utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -4,12 +4,13 @@ use std::cmp::{Eq, Ordering, PartialOrd}; use std::collections::BinaryHeap; use std::fmt::Debug; use std::mem; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Mutex; use std::time::Duration; +use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::time::timeout; /// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] +#[derive(Debug, PartialEq, Eq, thiserror::Error)] #[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached @@ -141,10 +142,10 @@ where /// /// This call won't complete until someone has called `advance` /// with a number greater than or equal to the one we're waiting for. - pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { + pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown), + Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown), Err(e) => Err(e), } } @@ -156,13 +157,18 @@ where /// /// If that hasn't happened after the specified timeout duration, /// [`SeqWaitError::Timeout`] will be returned. - pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> { + pub async fn wait_for_timeout( + &self, + num: V, + timeout_duration: Duration, + ) -> Result<(), SeqWaitError> { match self.queue_for_wait(num) { Ok(None) => Ok(()), - Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e { - std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout, - std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown, - }), + Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await { + Ok(Ok(())) => Ok(()), + Ok(Err(_)) => Err(SeqWaitError::Shutdown), + Err(_) => Err(SeqWaitError::Timeout), + }, Err(e) => Err(e), } } @@ -179,7 +185,7 @@ where } // Create a new channel. 
- let (tx, rx) = channel(); + let (tx, rx) = channel(()); internal.waiters.push(Waiter { wake_num: num, wake_channel: tx, @@ -234,8 +240,6 @@ where mod tests { use super::*; use std::sync::Arc; - use std::thread::sleep; - use std::thread::spawn; use std::time::Duration; impl MonotonicCounter for i32 { @@ -248,46 +252,54 @@ mod tests { } } - #[test] - fn seqwait() { + #[tokio::test] + async fn seqwait() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); let seq3 = Arc::clone(&seq); - spawn(move || { - seq2.wait_for(42).expect("wait_for 42"); + let jh1 = tokio::task::spawn(async move { + seq2.wait_for(42).await.expect("wait_for 42"); let old = seq2.advance(100); assert_eq!(old, 99); - seq2.wait_for(999).expect_err("no 999"); + seq2.wait_for_timeout(999, Duration::from_millis(100)) + .await + .expect_err("no 999"); }); - spawn(move || { - seq3.wait_for(42).expect("wait_for 42"); - seq3.wait_for(0).expect("wait_for 0"); + let jh2 = tokio::task::spawn(async move { + seq3.wait_for(42).await.expect("wait_for 42"); + seq3.wait_for(0).await.expect("wait_for 0"); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; let old = seq.advance(99); assert_eq!(old, 0); - seq.wait_for(100).expect("wait_for 100"); + seq.wait_for(100).await.expect("wait_for 100"); // Calling advance with a smaller value is a no-op assert_eq!(seq.advance(98), 100); assert_eq!(seq.load(), 100); + jh1.await.unwrap(); + jh2.await.unwrap(); + seq.shutdown(); } - #[test] - fn seqwait_timeout() { + #[tokio::test] + async fn seqwait_timeout() { let seq = Arc::new(SeqWait::new(0)); let seq2 = Arc::clone(&seq); - spawn(move || { + let jh = tokio::task::spawn(async move { let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout); + let res = seq2.wait_for_timeout(42, timeout).await; assert_eq!(res, Err(SeqWaitError::Timeout)); }); - sleep(Duration::from_secs(1)); + tokio::time::sleep(Duration::from_millis(200)).await; // This will attempt to wake, but nothing will happen // because the waiter already dropped its Receiver. let old = seq.advance(99); - assert_eq!(old, 0) + assert_eq!(old, 0); + jh.await.unwrap(); + + seq.shutdown(); } } diff --git a/zenith_utils/src/shutdown.rs b/libs/utils/src/shutdown.rs similarity index 100% rename from zenith_utils/src/shutdown.rs rename to libs/utils/src/shutdown.rs diff --git a/zenith_utils/src/signals.rs b/libs/utils/src/signals.rs similarity index 100% rename from zenith_utils/src/signals.rs rename to libs/utils/src/signals.rs diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs new file mode 100644 index 0000000000..177a839d75 --- /dev/null +++ b/libs/utils/src/simple_rcu.rs @@ -0,0 +1,289 @@ +//! +//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat +//! similar to a lock, but it allows readers to "hold on" to an old value of the Rcu +//! without blocking writers, and allows writing a new value without blocking +//! readers. When you update the value, the new value is immediately visible +//! to new readers, but the update waits until all existing readers have +//! finished, so that no one sees the old value anymore. +//! +//! This implementation isn't wait-free; it uses an RwLock that is held for a +//! short duration when the value is read or updated. +//! +//! # Examples +//! +//! Read a value and do things with it while holding the guard: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! { +//! let read = rcu.read(); +//!
println!("the current value is {}", *read); +//! // exiting the scope drops the read-guard, and allows concurrent writers +//! // to finish. +//! } +//! ``` +//! +//! Increment the value by one, and wait for old readers to finish: +//! +//! ``` +//! # let rcu = utils::simple_rcu::Rcu::new(1); +//! let write_guard = rcu.lock_for_write(); +//! +//! // NB: holding `write_guard` blocks new readers and writers. Keep this section short! +//! let new_value = *write_guard + 1; +//! +//! let waitlist = write_guard.store_and_unlock(new_value); // consumes `write_guard` +//! +//! // Concurrent reads and writes are now possible again. Wait for all the readers +//! // that still observe the old value to finish. +//! waitlist.wait(); +//! ``` +//! +#![warn(missing_docs)] + +use std::ops::Deref; +use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; +use std::sync::{Arc, Weak}; +use std::sync::{Mutex, RwLock, RwLockWriteGuard}; + +/// +/// Rcu allows multiple readers to read and hold onto a value without blocking +/// (for very long). Storing to the Rcu updates the value, making new readers +/// immediately see the new value, but it also waits for all current readers to +/// finish. +/// +pub struct Rcu { + inner: RwLock>, +} + +struct RcuInner { + current_cell: Arc>, + old_cells: Vec>>, +} + +/// +/// RcuCell holds one value. It can be the latest one, or an old one. +/// +struct RcuCell { + value: V, + + /// A dummy channel. We never send anything to this channel. The point is + /// that when the RcuCell is dropped, any cloned Senders will be notified + /// that the channel is closed. Updaters can use this to wait out until the + /// RcuCell has been dropped, i.e. until the old value is no longer in use. + /// + /// We never do anything with the receiver, we just need to hold onto it so + /// that the Senders will be notified when it's dropped. But because it's + /// not Sync, we need a Mutex on it. + watch: (SyncSender<()>, Mutex>), +} + +impl RcuCell { + fn new(value: V) -> Self { + let (watch_sender, watch_receiver) = sync_channel(0); + RcuCell { + value, + watch: (watch_sender, Mutex::new(watch_receiver)), + } + } +} + +impl Rcu { + /// Create a new `Rcu`, initialized to `starting_val` + pub fn new(starting_val: V) -> Self { + let inner = RcuInner { + current_cell: Arc::new(RcuCell::new(starting_val)), + old_cells: Vec::new(), + }; + Self { + inner: RwLock::new(inner), + } + } + + /// + /// Read current value. Any store() calls will block until the returned + /// guard object is dropped. + /// + pub fn read(&self) -> RcuReadGuard { + let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell); + RcuReadGuard { cell: current_cell } + } + + /// + /// Lock the current value for updating. Returns a guard object that can be + /// used to read the current value, and to store a new value. + /// + /// Note: holding the write-guard blocks concurrent readers, so you should + /// finish the update and drop the guard quickly! Multiple writers can be + /// waiting on the RcuWriteGuard::store step at the same time, however. 
+ /// + pub fn lock_for_write(&self) -> RcuWriteGuard<'_, V> { + let inner = self.inner.write().unwrap(); + RcuWriteGuard { inner } + } +} + +/// +/// Read guard returned by `read` +/// +pub struct RcuReadGuard { + cell: Arc>, +} + +impl Deref for RcuReadGuard { + type Target = V; + + fn deref(&self) -> &V { + &self.cell.value + } +} + +/// +/// Write guard returned by `lock_for_write` +/// +/// NB: Holding this guard blocks all concurrent `read` and `lock_for_write` calls, so +/// it should only be held for a short duration! +/// +/// Calling `store_and_unlock` consumes the guard, making new reads and new writes possible +/// again. +/// +pub struct RcuWriteGuard<'a, V> { + inner: RwLockWriteGuard<'a, RcuInner>, +} + +impl<'a, V> Deref for RcuWriteGuard<'a, V> { + type Target = V; + + fn deref(&self) -> &V { + &self.inner.current_cell.value + } +} + +impl<'a, V> RcuWriteGuard<'a, V> { + /// + /// Store a new value. The new value will be written to the Rcu immediately, + /// and will be immediately seen by any `read` calls that start afterwards. + /// + /// Returns a list of readers that can see old values. You can call `wait()` + /// on it to wait for them to finish. + /// + pub fn store_and_unlock(mut self, new_val: V) -> RcuWaitList { + let new_cell = Arc::new(RcuCell::new(new_val)); + + let mut watches = Vec::new(); + { + let old = std::mem::replace(&mut self.inner.current_cell, new_cell); + self.inner.old_cells.push(Arc::downgrade(&old)); + + // cleanup old cells that no longer have any readers, and collect + // the watches for any that do. + self.inner.old_cells.retain(|weak| { + if let Some(cell) = weak.upgrade() { + watches.push(cell.watch.0.clone()); + true + } else { + false + } + }); + } + RcuWaitList(watches) + } +} + +/// +/// List of readers who can still see old values. +/// +pub struct RcuWaitList(Vec>); + +impl RcuWaitList { + /// + /// Wait for old readers to finish. + /// + pub fn wait(mut self) { + // after all the old_cells are no longer in use, we're done + for w in self.0.iter_mut() { + // This will block until the Receiver is closed. That happens when + // the RcuCell is dropped. + #[allow(clippy::single_match)] + match w.send(()) { + Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"), + Err(_) => { + // closed, which means that the cell has been dropped, and + // its value is no longer in use + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{Arc, Mutex}; + use std::thread::{sleep, spawn}; + use std::time::Duration; + + #[test] + fn two_writers() { + let rcu = Rcu::new(1); + + let read1 = rcu.read(); + assert_eq!(*read1, 1); + + let write2 = rcu.lock_for_write(); + assert_eq!(*write2, 1); + let wait2 = write2.store_and_unlock(2); + + let read2 = rcu.read(); + assert_eq!(*read2, 2); + + let write3 = rcu.lock_for_write(); + assert_eq!(*write3, 2); + let wait3 = write3.store_and_unlock(3); + + // new reader can see the new value, and old readers continue to see the old values. + let read3 = rcu.read(); + assert_eq!(*read3, 3); + assert_eq!(*read2, 2); + assert_eq!(*read1, 1); + + let log = Arc::new(Mutex::new(Vec::new())); + // Wait for the old readers to finish in separate threads.
+ let log_clone = Arc::clone(&log); + let thread2 = spawn(move || { + wait2.wait(); + log_clone.lock().unwrap().push("wait2 done"); + }); + let log_clone = Arc::clone(&log); + let thread3 = spawn(move || { + wait3.wait(); + log_clone.lock().unwrap().push("wait3 done"); + }); + + // without this sleep the test can pass on accident if the writer is slow + sleep(Duration::from_millis(500)); + + // Release first reader. This allows first write to finish, but calling + // wait() on the second one would still block. + log.lock().unwrap().push("dropping read1"); + drop(read1); + thread2.join().unwrap(); + + sleep(Duration::from_millis(500)); + + // Release second reader, and finish second writer. + log.lock().unwrap().push("dropping read2"); + drop(read2); + thread3.join().unwrap(); + + assert_eq!( + log.lock().unwrap().as_slice(), + &[ + "dropping read1", + "wait2 done", + "dropping read2", + "wait3 done" + ] + ); + } +} diff --git a/zenith_utils/src/sock_split.rs b/libs/utils/src/sock_split.rs similarity index 86% rename from zenith_utils/src/sock_split.rs rename to libs/utils/src/sock_split.rs index c62963e113..5e4598daf1 100644 --- a/zenith_utils/src/sock_split.rs +++ b/libs/utils/src/sock_split.rs @@ -4,7 +4,7 @@ use std::{ sync::Arc, }; -use rustls::Session; +use rustls::Connection; /// Wrapper supporting reads of a shared TcpStream. pub struct ArcTcpRead(Arc); @@ -56,7 +56,7 @@ impl BufStream { pub enum ReadStream { Tcp(BufReader), - Tls(rustls_split::ReadHalf), + Tls(rustls_split::ReadHalf), } impl io::Read for ReadStream { @@ -79,7 +79,7 @@ impl ReadStream { pub enum WriteStream { Tcp(Arc), - Tls(rustls_split::WriteHalf), + Tls(rustls_split::WriteHalf), } impl WriteStream { @@ -107,11 +107,11 @@ impl io::Write for WriteStream { } } -type TlsStream = rustls::StreamOwned; +type TlsStream = rustls::StreamOwned; pub enum BidiStream { Tcp(BufStream), - /// This variant is boxed, because [`rustls::ServerSession`] is quite larger than [`BufStream`]. + /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`]. Tls(Box>), } @@ -127,7 +127,7 @@ impl BidiStream { if how == Shutdown::Read { tls_boxed.sock.get_ref().shutdown(how) } else { - tls_boxed.sess.send_close_notify(); + tls_boxed.conn.send_close_notify(); let res = tls_boxed.flush(); tls_boxed.sock.get_ref().shutdown(how)?; res @@ -154,19 +154,23 @@ impl BidiStream { // TODO would be nice to avoid the Arc here let socket = Arc::try_unwrap(reader.into_inner().0).unwrap(); - let (read_half, write_half) = - rustls_split::split(socket, tls_boxed.sess, read_buf_cfg, write_buf_cfg); + let (read_half, write_half) = rustls_split::split( + socket, + Connection::Server(tls_boxed.conn), + read_buf_cfg, + write_buf_cfg, + ); (ReadStream::Tls(read_half), WriteStream::Tls(write_half)) } } } - pub fn start_tls(self, mut session: rustls::ServerSession) -> io::Result { + pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result { match self { Self::Tcp(mut stream) => { - session.complete_io(&mut stream)?; - assert!(!session.is_handshaking()); - Ok(Self::Tls(Box::new(TlsStream::new(session, stream)))) + conn.complete_io(&mut stream)?; + assert!(!conn.is_handshaking()); + Ok(Self::Tls(Box::new(TlsStream::new(conn, stream)))) } Self::Tls { .. 
} => Err(io::Error::new( io::ErrorKind::InvalidInput, diff --git a/zenith_utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs similarity index 100% rename from zenith_utils/src/tcp_listener.rs rename to libs/utils/src/tcp_listener.rs diff --git a/zenith_utils/src/vec_map.rs b/libs/utils/src/vec_map.rs similarity index 99% rename from zenith_utils/src/vec_map.rs rename to libs/utils/src/vec_map.rs index 558721c724..9953b447c8 100644 --- a/zenith_utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,11 +1,9 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; -use serde::{Deserialize, Serialize}; - /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. -#[derive(Clone, Debug, Serialize, Deserialize)] +#[derive(Clone, Debug)] pub struct VecMap(Vec<(K, V)>); impl Default for VecMap { diff --git a/zenith_utils/tests/bin_ser_test.rs b/libs/utils/tests/bin_ser_test.rs similarity index 92% rename from zenith_utils/tests/bin_ser_test.rs rename to libs/utils/tests/bin_ser_test.rs index ada43a1189..b995b61b78 100644 --- a/zenith_utils/tests/bin_ser_test.rs +++ b/libs/utils/tests/bin_ser_test.rs @@ -2,9 +2,9 @@ use bytes::{Buf, BytesMut}; use hex_literal::hex; use serde::Deserialize; use std::io::Read; -use zenith_utils::bin_ser::LeSer; +use utils::bin_ser::LeSer; -#[derive(Debug, PartialEq, Deserialize)] +#[derive(Debug, PartialEq, Eq, Deserialize)] pub struct HeaderData { magic: u16, info: u16, diff --git a/zenith_utils/tests/cert.pem b/libs/utils/tests/cert.pem similarity index 100% rename from zenith_utils/tests/cert.pem rename to libs/utils/tests/cert.pem diff --git a/zenith_utils/tests/key.pem b/libs/utils/tests/key.pem similarity index 100% rename from zenith_utils/tests/key.pem rename to libs/utils/tests/key.pem diff --git a/zenith_utils/tests/ssl_test.rs b/libs/utils/tests/ssl_test.rs similarity index 77% rename from zenith_utils/tests/ssl_test.rs rename to libs/utils/tests/ssl_test.rs index ef2bf1ed4a..248400c2c1 100644 --- a/zenith_utils/tests/ssl_test.rs +++ b/libs/utils/tests/ssl_test.rs @@ -7,10 +7,9 @@ use std::{ use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; -use rustls::Session; +use once_cell::sync::Lazy; -use zenith_utils::postgres_backend::{AuthType, Handler, PostgresBackend}; +use utils::postgres_backend::{AuthType, Handler, PostgresBackend}; fn make_tcp_pair() -> (TcpStream, TcpStream) { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); @@ -20,18 +19,20 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) { (server_stream, client_stream) } -lazy_static! 
{ - static ref KEY: rustls::PrivateKey = { - let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::internal::pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone() - }; - static ref CERT: rustls::Certificate = { - let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::internal::pemfile::certs(&mut cursor).unwrap()[0].clone() - }; -} +static KEY: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("key.pem")); + rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) +}); + +static CERT: Lazy = Lazy::new(|| { + let mut cursor = Cursor::new(include_bytes!("cert.pem")); + rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) +}); #[test] +// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274), +// we resize the vector so doing some modifications after all +#[allow(clippy::read_zero_byte_vec)] fn ssl() { let (mut client_sock, server_sock) = make_tcp_pair(); @@ -45,17 +46,23 @@ fn ssl() { let ssl_response = client_sock.read_u8().unwrap(); assert_eq!(b'S', ssl_response); - let mut cfg = rustls::ClientConfig::new(); - cfg.root_store.add(&CERT).unwrap(); + let cfg = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&CERT).unwrap(); + store + }) + .with_no_client_auth(); let client_config = Arc::new(cfg); - let dns_name = webpki::DNSNameRef::try_from_ascii_str("localhost").unwrap(); - let mut session = rustls::ClientSession::new(&client_config, dns_name); + let dns_name = "localhost".try_into().unwrap(); + let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap(); - session.complete_io(&mut client_sock).unwrap(); - assert!(!session.is_handshaking()); + conn.complete_io(&mut client_sock).unwrap(); + assert!(!conn.is_handshaking()); - let mut stream = rustls::Stream::new(&mut session, &mut client_sock); + let mut stream = rustls::Stream::new(&mut conn, &mut client_sock); // StartupMessage stream.write_u32::(9).unwrap(); @@ -105,8 +112,10 @@ fn ssl() { } let mut handler = TestHandler { got_query: false }; - let mut cfg = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - cfg.set_single_cert(vec![CERT.clone()], KEY.clone()) + let cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) .unwrap(); let tls_config = Some(Arc::new(cfg)); @@ -209,8 +218,10 @@ fn server_forces_ssl() { } let mut handler = TestHandler; - let mut cfg = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - cfg.set_single_cert(vec![CERT.clone()], KEY.clone()) + let cfg = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone()) .unwrap(); let tls_config = Some(Arc::new(cfg)); diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml deleted file mode 100644 index a3fda0b246..0000000000 --- a/monitoring/docker-compose.yml +++ /dev/null @@ -1,25 +0,0 @@ -version: "3" -services: - - prometheus: - container_name: prometheus - image: prom/prometheus:latest - volumes: - - ./prometheus.yaml:/etc/prometheus/prometheus.yml - # ports: - # - "9090:9090" - # TODO: find a proper portable solution - network_mode: "host" - - grafana: - image: grafana/grafana:latest - volumes: - - ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml - environment: - - GF_AUTH_ANONYMOUS_ENABLED=true - - 
GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - - GF_AUTH_DISABLE_LOGIN_FORM=true - # ports: - # - "3000:3000" - # TODO: find a proper portable solution - network_mode: "host" diff --git a/monitoring/grafana.yaml b/monitoring/grafana.yaml deleted file mode 100644 index eac8879e6c..0000000000 --- a/monitoring/grafana.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: 1 - -datasources: -- name: Prometheus - type: prometheus - access: proxy - orgId: 1 - url: http://localhost:9090 - basicAuth: false - isDefault: false - version: 1 - editable: false diff --git a/monitoring/prometheus.yaml b/monitoring/prometheus.yaml deleted file mode 100644 index ba55d53737..0000000000 --- a/monitoring/prometheus.yaml +++ /dev/null @@ -1,5 +0,0 @@ -scrape_configs: - - job_name: 'default' - scrape_interval: 10s - static_configs: - - targets: ['localhost:9898'] diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cfcb453732..a38978512d 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -3,54 +3,76 @@ name = "pageserver" version = "0.1.0" edition = "2021" +[features] +default = [] +# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost but allows tests to exercise outage conditions +testing = ["fail/failpoints"] + +profiling = ["pprof"] + [dependencies] -bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" } -chrono = "0.4.19" -rand = "0.8.3" -regex = "1.4.5" -bytes = { version = "1.0.1", features = ['serde'] } -byteorder = "1.4.3" -futures = "0.3.13" -hyper = "0.14" -lazy_static = "1.4.0" -log = "0.4.14" -clap = "3.0" -daemonize = "0.4.1" -tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -tokio-stream = "0.1.8" +amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" } anyhow = { version = "1.0", features = ["backtrace"] } -crc32c = "0.6.0" -thiserror = "1.0" -hex = { version = "0.4.3", features = ["serde"] } -tar = "0.4.33" -humantime = "2.1.0" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1" -toml_edit = { version = "0.13", features = ["easy"] } -scopeguard = "1.1.0" +async-stream = "0.3" async-trait = "0.1" +byteorder = "1.4.3" +bytes = "1.0.1" +chrono = "0.4.19" +clap = { version = "4.0", features = ["string"] } +close_fds = "0.3.2" const_format = "0.2.21" -tracing = "0.1.27" -tracing-futures = "0.2" -signal-hook = "0.3.10" -url = "2" -nix = "0.23" -once_cell = "1.8.0" +crc32c = "0.6.0" crossbeam-utils = "0.8.5" fail = "0.5.0" +futures = "0.3.13" +git-version = "0.3.5" +hex = "0.4.3" +humantime = "2.1.0" +humantime-serde = "1.1.1" +hyper = "0.14" +itertools = "0.10.3" +nix = "0.25" +num-traits = "0.2.15" +once_cell = "1.13.0" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-types = { git = 
"https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } +rand = "0.8.3" +regex = "1.4.5" +rstar = "0.9.3" +scopeguard = "1.1.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +serde_with = "2.0" +signal-hook = "0.3.10" +svg_fmt = "0.4.1" +tar = "0.4.33" +thiserror = "1.0" +tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-util = { version = "0.7.3", features = ["io", "io-util"] } +toml_edit = { version = "0.14", features = ["easy"] } +tracing = "0.1.36" +url = "2" +walkdir = "2.3.2" -rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } -async-compression = {version = "0.3", features = ["zstd", "tokio"]} - -postgres_ffi = { path = "../postgres_ffi" } -zenith_metrics = { path = "../zenith_metrics" } -zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } +etcd_broker = { path = "../libs/etcd_broker" } +metrics = { path = "../libs/metrics" } +pageserver_api = { path = "../libs/pageserver_api" } +postgres_ffi = { path = "../libs/postgres_ffi" } +pq_proto = { path = "../libs/pq_proto" } +remote_storage = { path = "../libs/remote_storage" } +tenant_size_model = { path = "../libs/tenant_size_model" } +utils = { path = "../libs/utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] +criterion = "0.4" hex-literal = "0.3" tempfile = "3.2" + +[[bench]] +name = "bench_layer_map" +harness = false diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs new file mode 100644 index 0000000000..25d5ecd643 --- /dev/null +++ b/pageserver/benches/bench_layer_map.rs @@ -0,0 +1,5866 @@ +use anyhow::Result; +use pageserver::repository::{Key, Value}; +use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; +use pageserver::tenant::layer_map::LayerMap; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::ValueReconstructResult; +use pageserver::tenant::storage_layer::ValueReconstructState; +use std::cmp::{max, min}; +use std::ops::Range; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::Arc; +use utils::id::{TenantId, TimelineId}; +use utils::lsn::Lsn; + +use criterion::{criterion_group, criterion_main, Criterion}; + +struct DummyDelta { + key_range: Range, + lsn_range: Range, +} + +impl Layer for DummyDelta { + fn get_tenant_id(&self) -> TenantId { + TenantId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_timeline_id(&self) -> TimelineId { + TimelineId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn filename(&self) -> PathBuf { + todo!() + } + + fn local_path(&self) -> Option { + todo!() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + panic!() + } + + fn is_incremental(&self) -> bool { + true + } + + fn is_in_memory(&self) -> bool { + false + } + + fn iter(&self) -> Box> + '_> { + panic!() + } + + fn key_iter(&self) -> Box + '_> { 
+ panic!("Not implemented") + } + + fn delete(&self) -> Result<()> { + panic!() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} + +struct DummyImage { + key_range: Range, + lsn: Lsn, +} + +impl Layer for DummyImage { + fn get_tenant_id(&self) -> TenantId { + TenantId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_timeline_id(&self) -> TimelineId { + TimelineId::from_str("00000000000000000000000000000000").unwrap() + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + // End-bound is exclusive + self.lsn..(self.lsn + 1) + } + + fn filename(&self) -> PathBuf { + todo!() + } + + fn local_path(&self) -> Option { + todo!() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + panic!() + } + + fn is_incremental(&self) -> bool { + false + } + + fn is_in_memory(&self) -> bool { + false + } + + fn iter(&self) -> Box> + '_> { + panic!() + } + + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + + fn delete(&self) -> Result<()> { + panic!() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} + +fn build_layer_map() -> LayerMap { + let mut layer_map = LayerMap::default(); + + let mut min_lsn = Lsn(u64::MAX); + let mut max_lsn = Lsn(0); + + for fname in TEST_LAYER_FILENAMES { + if let Some(imgfilename) = ImageFileName::parse_str(fname) { + let layer = DummyImage { + key_range: imgfilename.key_range, + lsn: imgfilename.lsn, + }; + layer_map.insert_historic(Arc::new(layer)); + min_lsn = min(min_lsn, imgfilename.lsn); + max_lsn = max(max_lsn, imgfilename.lsn); + } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { + let layer = DummyDelta { + key_range: deltafilename.key_range, + lsn_range: deltafilename.lsn_range.clone(), + }; + layer_map.insert_historic(Arc::new(layer)); + min_lsn = min(min_lsn, deltafilename.lsn_range.start); + max_lsn = max(max_lsn, deltafilename.lsn_range.end); + } else { + panic!("unexpected filename {fname}"); + } + } + + println!("min: {min_lsn}, max: {max_lsn}"); + + layer_map +} + +fn large_layer_map(c: &mut Criterion) { + let layer_map = build_layer_map(); + + c.bench_function("search", |b| { + b.iter(|| { + let result = layer_map.search( + // Just an arbitrary point + Key::from_hex("000000067F000080000009E014000001B011").unwrap(), + // This LSN is higher than any of the LSNs in the tree + Lsn::from_str("D0/80208AE1").unwrap(), + ); + result.unwrap(); + }); + }); + + // test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs. + c.bench_function("search_rel_dir", |b| { + b.iter(|| { + let result = layer_map.search( + Key::from_hex("000000067F00008000000000000000000001").unwrap(), + // This LSN is higher than any of the LSNs in the tree + Lsn::from_str("D0/80208AE1").unwrap(), + ); + result.unwrap(); + }); + }); +} + +criterion_group!(benches, large_layer_map); +criterion_main!(benches); + +// A list of layer filenames, extracted from our performance test environment, from +// a project where we have run pgbench many timmes. The pgbench database was initialized +// between each test run. 
+const TEST_LAYER_FILENAMES: &[&str] = &[ +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006CF69CD8B0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000006F949B7C08", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000071F15CF6B0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000072AEE2BFE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000756884A510", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000077B1836CA0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007D41715570", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000007F12B83FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000083D5DE3FD0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000873B520940", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000890CF51FE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008C71903720", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000008E43487FF0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009445A06DC8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096187D1FC8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__00000096E85806C0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009921F3B4A8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009B5229DFE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__0000009EBB11FFC0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000A93DDE5FE0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000AD3698E000", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B3AC039FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000B8606C92A0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BC59629F98", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BD25E66810", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000BEF683BFD0", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C14270A078", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C3687EDFE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C6C7BD8140", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000C896B8DFD8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CB82C2FF68", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CD51009FE8", +"000000000000000000000000000000000000-000000067F00008000000032090100000000__000000CF7E08BFD0", +"000000000000000000000000000000000000-000000067F00008000000540090100000000__0000006AEF261AF8", +"000000000000000000000000000000000000-000000067F00008000000560090100000000__0000006DA30DA180", +"000000000000000000000000000000000000-000000067F00008000000580090100000000__0000006FAFE25518", 
+"000000000000000000000000000000000000-000000067F000080000005E0090100000000__00000073AF75E930", +"000000000000000000000000000000000000-000000067F00008000000620090100000000__00000078B2CB1C68", +"000000000000000000000000000000000000-000000067F00008000000640090100000000__0000007B9877EF40", +"000000000000000000000000000000000000-000000067F00008000000680090100000000__00000080E477E868", +"000000000000000000000000000000000000-000000067F000080000006C0090100000000__00000085BE169568", +"000000000000000000000000000000000000-000000067F00008000000700090100000000__0000008AF15FEF50", +"000000000000000000000000000000000000-000000067F00008000000740090100000000__000000902186B1D0", +"000000000000000000000000000000000000-000000067F00008000000760090100000000__00000092CA5E4EA8", +"000000000000000000000000000000000000-000000067F000080000007E0090100000000__0000009D34F8D4D8", +"000000000000000000000000000000000000-000000067F00008000000820090100000000__000000A29F1D8950", +"000000000000000000000000000000000000-000000067F00008000000860090100000000__000000A434813A68", +"000000000000000000000000000000000000-000000067F000080000008C0090100000000__000000AAEBE534F8", +"000000000000000000000000000000000000-000000067F00008000000960090100000000__000000B6C2E92A88", +"000000000000000000000000000000000000-000000067F00008000000A20090100000000__000000C5745579F0", +"000000000000000000000000000000000000-000000067F00008000000A60090100000000__000000CA2C877DC8", +"000000000000000000000000000000000000-030000000000000000000000000000000002__000000AFB4666000", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF7DC97FD1-000000CF801FC221", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FC221-000000CF801FDB61", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF801FDB61-000000CF80201FA1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80201FA1-000000CF80203CC1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF80203CC1-000000CF802067C1", +"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000CF802067C1-000000CF80208AE1", +"000000067F000032AC000040040000000000-000000067F000080000005400C0000007DD8__0000006A5C770149-0000006ACEF98449", +"000000067F000032AC000040040000000000-000000067F000080000005600C0000008077__0000006CF7781D19-0000006D69B48989", +"000000067F000032AC000040040000000000-000000067F000080000005800C0000007A49__0000006F95E72491-0000006FA8EDF3B9", +"000000067F000032AC000040040000000000-000000067F000080000005A00C0000007614__000000723877FF21-00000072A0D7CEA1", +"000000067F000032AC000040040000000000-000000067F000080000005C00C0000016516__00000072A0D7CEA1-0000007318DDE691", +"000000067F000032AC000040040000000000-000000067F000080000006000C0000008FB7__00000075687C3009-00000075E915EBC9", +"000000067F000032AC000040040000000000-000000067F000080000006200C0000009441__0000007805801C41-00000078859FEA11", +"000000067F000032AC000040040000000000-000000067F000080000006400C0000007987__0000007AA1DF6639-0000007B14D5C521", +"000000067F000032AC000040040000000000-000000067F000080000006600C0000009381__0000007D41EA8D51-0000007DC21DE569", +"000000067F000032AC000040040000000000-000000067F000080000006800C0000007D6A__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000032AC000040040000000000-000000067F000080000006801400000044E4__00000081AFAF5FD1-0000008215AFE5A9", 
+"000000067F000032AC000040040000000000-000000067F000080000006C00C00000090F5__00000084A325AA01-00000085239DFB81", +"000000067F000032AC000040040000000000-000000067F000080000006E00C00000096C8__000000873C9A2551-00000087BC75E5B1", +"000000067F000032AC000040040000000000-000000067F000080000007000C000000955C__00000089D6B8EE99-0000008A56BBF739", +"000000067F000032AC000040040000000000-000000067F000080000007200C000000933D__0000008C72843D41-0000008CF2BFFC89", +"000000067F000032AC000040040000000000-000000067F000080000007400C00000090E9__0000008F10E3E189-0000008F915DE591", +"000000067F000032AC000040040000000000-000000067F000080000007600C0000008180__00000091A6DD7A79-0000009228F7FA79", +"000000067F000032AC000040040000000000-000000067F000080000007800C000000974C__0000009446B52FD1-00000094D67DF4F9", +"000000067F000032AC000040040000000000-000000067F000080000007A00C000000974B__00000096E85829C9-00000098A7ADFC91", +"000000067F000032AC000040040000000000-000000067F000080000007C00C0000007EA5__000000997F5D23C9-00000099F1C9FC71", +"000000067F000032AC000040040000000000-000000067F000080000007E00C00000092CD__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000032AC000040040000000000-000000067F000080000008000C00000081F6__0000009EBBC72771-000000A154401909", +"000000067F000032AC000040040000000000-000000067F000080000008200C000000974D__000000A154401909-000000A1E407F839", +"000000067F000032AC000040040000000000-000000067F0000800000082014000000393C__000000A323C9E001-000000A37A60B1A9", +"000000067F000032AC000040040000000000-000000067F000080000008600C0000009747__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000032AC000040040000000000-000000067F000080000008801C0000009703__000000A5A081B661-000000A6503DE919", +"000000067F000032AC000040040000000000-000000067F000080000008801C00000CF6B0__000000A6F001F909-000000A91D97FD49", +"000000067F000032AC000040040000000000-000000067F000080000008C00C0000002330__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000032AC000040040000000000-000000067F000080000008E00C00000077B3__000000AB6533BFD9-000000ABF63DF511", +"000000067F000032AC000040040000000000-000000067F000080000008E02A000000529F__000000AF5D587FE1-000000AFB4666001", +"000000067F000032AC000040040000000000-000000067F000080000009004000000047E0__000000B18495C001-000000B1FA75F501", +"000000067F000032AC000040040000000000-000000067F00008000000920140000005289__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000032AC000040040000000000-000000067F000080000009400C000008DEA4__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000032AC000040040000000000-000000067F000080000009600C000000974F__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000032AC000040040000000000-000000067F000080000009600C0000055A74__000000B808718889-000000B8606C92A1", +"000000067F000032AC000040040000000000-000000067F000080000009800C0000009748__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000032AC000040040000000000-000000067F000080000009800C000010EC71__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000032AC000040040000000000-000000067F000080000009A00C0000071F6F__000000BCEF79BE91-000000BD263A5849", +"000000067F000032AC000040040000000000-000000067F000080000009C00C0000009749__000000BD263A5849-000000BDA607F261", +"000000067F000032AC000040040000000000-000000067F000080000009E00C0000004916__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000032AC000040040000000000-000000067F00008000000A000C0000008EF9__000000C19744E959-000000C217F3F379", +"000000067F000032AC000040040000000000-000000067F00008000000A200C0000009748__000000C430961E71-000000C4C05DDB29", 
+"000000067F000032AC000040040000000000-000000067F00008000000A400C0000009743__000000C6C87B6329-000000C74849FAE1", +"000000067F000032AC000040040000000000-000000067F00008000000A600C0000009746__000000C90726D0D9-000000C986F5F0D9", +"000000067F000032AC000040040000000000-000000067F00008000000A600C000007A149__000000CB40C16489-000000CB82C37859", +"000000067F000032AC000040040000000000-000000067F00008000000A800C0000009748__000000CB82C37859-000000CC11F5EDC9", +"000000067F000032AC000040040000000000-000000067F00008000000A800F0100000003__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000000000000000001-000000067F000080000005400C000004B479__0000006C98B77D29-0000006CF7781D19", +"000000067F00008000000000000000000001-000000067F000080000005400C0000104BE4__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000000000000000001-000000067F000080000005600C0000048643__0000006F3370DD59-0000006F95E72491", +"000000067F00008000000000000000000001-000000067F000080000005600C0000100001__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000000000000000001-000000067F000080000005800C000005CF06__00000071F21624D1-000000723877FF21", +"000000067F00008000000000000000000001-000000067F000080000005800C000009D78D__000000716A103FC9-00000071F21624D1", +"000000067F00008000000000000000000001-000000067F000080000005800C00000CDE2D__00000070E8761431-000000716A103FC9", +"000000067F00008000000000000000000001-000000067F000080000005E00C00000385D9__0000007318DDE691-0000007497B01FF9", +"000000067F00008000000000000000000001-000000067F000080000005E00C0000050175__000000751253A4C1-00000075687C3009", +"000000067F00008000000000000000000001-000000067F000080000005E00C00000AF576__0000007497B01FF9-000000751253A4C1", +"000000067F00008000000000000000000001-000000067F000080000006000C0000051A02__00000077B2AD0F91-0000007805801C41", +"000000067F00008000000000000000000001-000000067F000080000006000C00000C3C38__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000000000000000001-000000067F000080000006000C00000C56C1__00000076A8CDE8F9-00000077391A8001", +"000000067F00008000000000000000000001-000000067F000080000006200C000004811C__0000007A3F679FA1-0000007AA1DF6639", +"000000067F00008000000000000000000001-000000067F000080000006200C0000107883__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000000000000000001-000000067F000080000006400C000004B4C9__0000007B14D5C521-0000007C73B53FC9", +"000000067F00008000000000000000000001-000000067F000080000006400C000005258F__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000000000000000001-000000067F000080000006400C00000A887C__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000000000000000001-000000067F000080000006600C0000049742__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000000000000000001-000000067F000080000006600C00000BC29F__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F00008000000000000000000001-000000067F000080000006600C0000111C82__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000000000000000001-000000067F000080000006800C00000A8D4C__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F00008000000000000000000001-000000067F000080000006A00C0000051984__000000844F1A6789-00000084A325AA01", +"000000067F00008000000000000000000001-000000067F000080000006A00C00000703EC__00000082B573F579-00000083D5901FD9", +"000000067F00008000000000000000000001-000000067F000080000006A00C00000C4CC8__00000083D5901FD9-000000844F1A6789", +"000000067F00008000000000000000000001-000000067F000080000006C00C0000055EA3__00000086ED29E361-000000873C9A2551", 
+"000000067F00008000000000000000000001-000000067F000080000006C00C00000BC102__00000085D35BF439-0000008673817FC9", +"000000067F00008000000000000000000001-000000067F000080000006C00C00000BFB6E__0000008673817FC9-00000086ED29E361", +"000000067F00008000000000000000000001-000000067F000080000006E00C0000054244__0000008985FD3611-00000089D6B8EE99", +"000000067F00008000000000000000000001-000000067F000080000006E00C00000B6F42__000000890C5B6001-0000008985FD3611", +"000000067F00008000000000000000000001-000000067F000080000006E00C00000C5883__000000887C2DFE59-000000890C5B6001", +"000000067F00008000000000000000000001-000000067F000080000007000C0000053C20__0000008C2045B721-0000008C72843D41", +"000000067F00008000000000000000000001-000000067F000080000007000C00000B2B06__0000008AF67FEC19-0000008BA6803FC9", +"000000067F00008000000000000000000001-000000067F000080000007000C00000BF157__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000000000000000001-000000067F000080000007200C0000051312__0000008EBC4827C1-0000008F10E3E189", +"000000067F00008000000000000000000001-000000067F000080000007200C00000BA086__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000000000000000001-000000067F000080000007200C00000C58B0__0000008DB277FA49-0000008E42A19FD1", +"000000067F00008000000000000000000001-000000067F000080000007400C000004DF08__000000914B2393B1-00000091A6DD7A79", +"000000067F00008000000000000000000001-000000067F000080000007400C00000FCCA8__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000000000000000001-000000067F000080000007600C00000544BA__0000009228F7FA79-00000093786F8001", +"000000067F00008000000000000000000001-000000067F000080000007600C0000061028__0000009402435A49-0000009446B52FD1", +"000000067F00008000000000000000000001-000000067F000080000007600C000008C52F__00000093786F8001-0000009402435A49", +"000000067F00008000000000000000000001-000000067F000080000007800C000006D445__00000096AEF27399-00000096E85829C9", +"000000067F00008000000000000000000001-000000067F000080000007800C000007B8BC__00000096193A8001-00000096AEF27399", +"000000067F00008000000000000000000001-000000067F000080000007800C00000CD6B6__000000959635F2A9-00000096193A8001", +"000000067F00008000000000000000000001-000000067F000080000007A00C000004B9A5__0000009921E47AA1-000000997F5D23C9", +"000000067F00008000000000000000000001-000000067F000080000007A00C00000F720F__00000098A7ADFC91-0000009921E47AA1", +"000000067F00008000000000000000000001-000000067F000080000007C00C0000052A9D__0000009BCB4E4461-0000009C1E8CC879", +"000000067F00008000000000000000000001-000000067F000080000007C00C00000A9244__0000009A918DF181-0000009B51A8BBB9", +"000000067F00008000000000000000000001-000000067F000080000007C00C00000BA258__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F00008000000000000000000001-000000067F000080000007E00C0000061ADC__0000009E781A9731-0000009EBBC72771", +"000000067F00008000000000000000000001-000000067F000080000007E00C0000093E3A__0000009DEEE6BFF9-0000009E781A9731", +"000000067F00008000000000000000000001-000000067F000080000007E00C00000B2704__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F00008000000000000000000001-000000067F000080000008200C000005D8FE__000000A1E407F839-000000A323C9E001", +"000000067F00008000000000000000000001-000000067F000080000008600C000010ECC4__000000A539BDE561-000000A5A081B661", +"000000067F00008000000000000000000001-000000067F000080000008A00C0000104A0C__000000A91D97FD49-000000A98AB7EE49", +"000000067F00008000000000000000000001-000000067F000080000008C00C000005DA8C__000000AA2597E9A1-000000AB6533BFD9", 
+"000000067F00008000000000000000000001-000000067F000080000008E00C00000BC018__000000AC9601EA19-000000AD36393FE9", +"000000067F00008000000000000000000001-000000067F000080000008E0140000003E33__000000AD36393FE9-000000ADB047EAB9", +"000000067F00008000000000000000000001-000000067F000080000008E022000008E3D1__000000AE6FFFE799-000000AF5D587FE1", +"000000067F00008000000000000000000001-000000067F000080000009003800000C5213__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000000000000000001-000000067F000080000009200C000009567A__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F00008000000000000000000001-000000067F000080000009600C00000A93FD__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F00008000000000000000000001-000000067F000080000009600C020000000B__000000B79E68FFF9-000000B808718889", +"000000067F00008000000000000000000001-000000067F000080000009A00C00000794DC__000000BC596B5D59-000000BCEF79BE91", +"000000067F00008000000000000000000001-000000067F000080000009A00C00000D6C06__000000BBE607E8F1-000000BC596B5D59", +"000000067F00008000000000000000000001-000000067F000080000009C00C00000B2921__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F00008000000000000000000001-000000067F000080000009E00C0000050E55__000000C1426D92E1-000000C19744E959", +"000000067F00008000000000000000000001-000000067F000080000009E00C000009FB21__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F00008000000000000000000001-000000067F000080000009E00C00000C0C74__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F00008000000000000000000001-000000067F00008000000A000C000005635B__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000000000000000001-000000067F00008000000A000C00000B8B52__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000000000000000001-000000067F00008000000A000C00000BC072__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000677D8__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000933F0__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000000000000000001-000000067F00008000000A200C00000BBC1F__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000000000000000001-000000067F00008000000A400C00000C4AE6__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000000000000000001-000000067F00008000000A400C0000107F8F__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000000000000000001-000000067F00008000000A600C0000054BFB__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000000000000000001-000000067F00008000000A600C00001117CB__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000000000000000001-000000067F00008000000A800C00000BCB46__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000000000000000001-000000067F00008000000AA00C0000078E97__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000004E10100000002-000000067F000080000005400C000004BA9C__0000006ACEF98449-0000006C1E7C73C1", +"000000067F00008000000004E10100000002-000000067F000080000005800C0000071854__0000007048B1EC09-00000070E8761431", +"000000067F00008000000004E10200000000-000000067F000080000005600C000004BA9D__0000006D69B48989-0000006EB935F989", +"000000067F00008000000004EB0100000002-000000067F00008000000A400C00000551FC__000000C74849FAE1-000000C80801E859", +"000000067F000080000005200C000006C000-030000000000000000000000000000000002__000000687B67FC58", +"000000067F00008000000520140000028A69-030000000000000000000000000000000002__0000006981B5FDC9-00000069FBEEB099", 
+"000000067F0000800000052014000002C260-030000000000000000000000000000000002__00000069FBEEB099-0000006A5C770149", +"000000067F000080000005400C0000000000-000000067F000080000005400C0000004000__0000006CF69CD8B0", +"000000067F000080000005400C0000004000-000000067F000080000005400C0000008000__0000006CF69CD8B0", +"000000067F000080000005400C0000008000-000000067F000080000005400C000000C000__0000006CF69CD8B0", +"000000067F000080000005400C000000C000-000000067F000080000005400C0000010000__0000006CF69CD8B0", +"000000067F000080000005400C0000010000-000000067F000080000005400C0000014000__0000006CF69CD8B0", +"000000067F000080000005400C0000014000-000000067F000080000005400C0000018000__0000006CF69CD8B0", +"000000067F000080000005400C0000018000-000000067F000080000005400C000001C000__0000006CF69CD8B0", +"000000067F000080000005400C000001C000-000000067F000080000005400C0000020000__0000006CF69CD8B0", +"000000067F000080000005400C0000020000-000000067F000080000005400C0000024000__0000006CF69CD8B0", +"000000067F000080000005400C0000024000-000000067F000080000005400C0000028000__0000006CF69CD8B0", +"000000067F000080000005400C0000028000-000000067F000080000005400C000002C000__0000006CF69CD8B0", +"000000067F000080000005400C000002C000-000000067F000080000005400C0000030000__0000006CF69CD8B0", +"000000067F000080000005400C0000030000-000000067F000080000005400C0000034000__0000006CF69CD8B0", +"000000067F000080000005400C0000034000-000000067F000080000005400C0000038000__0000006CF69CD8B0", +"000000067F000080000005400C0000038000-000000067F000080000005400C000003C000__0000006CF69CD8B0", +"000000067F000080000005400C000003C000-000000067F000080000005400C0000040000__0000006CF69CD8B0", +"000000067F000080000005400C0000040000-000000067F000080000005400C0000044000__0000006CF69CD8B0", +"000000067F000080000005400C0000044000-000000067F000080000005400C0000048000__0000006CF69CD8B0", +"000000067F000080000005400C0000048000-000000067F000080000005400C000004C000__0000006CF69CD8B0", +"000000067F000080000005400C000004B483-000000067F000080000005400C00000967AD__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C000004C000-000000067F000080000005400C0000050000__0000006CF69CD8B0", +"000000067F000080000005400C0000050000-000000067F000080000005400C0000054000__0000006CF69CD8B0", +"000000067F000080000005400C0000054000-000000067F000080000005400C0000058000__0000006CF69CD8B0", +"000000067F000080000005400C0000054000-030000000000000000000000000000000002__0000006AEF261AF8", +"000000067F000080000005400C0000058000-000000067F000080000005400C000005C000__0000006CF69CD8B0", +"000000067F000080000005400C000005C000-000000067F000080000005400C0000060000__0000006CF69CD8B0", +"000000067F000080000005400C0000060000-000000067F000080000005400C0000064000__0000006CF69CD8B0", +"000000067F000080000005400C0000064000-000000067F000080000005400C0000068000__0000006CF69CD8B0", +"000000067F000080000005400C0000068000-000000067F000080000005400C000006C000__0000006CF69CD8B0", +"000000067F000080000005400C000006C000-000000067F000080000005400C0000070000__0000006CF69CD8B0", +"000000067F000080000005400C0000070000-000000067F000080000005400C0000074000__0000006CF69CD8B0", +"000000067F000080000005400C0000074000-000000067F000080000005400C0000078000__0000006CF69CD8B0", +"000000067F000080000005400C0000078000-000000067F000080000005400C000007C000__0000006CF69CD8B0", +"000000067F000080000005400C000007C000-000000067F000080000005400C0000080000__0000006CF69CD8B0", +"000000067F000080000005400C0000080000-000000067F000080000005400C0000084000__0000006CF69CD8B0", 
+"000000067F000080000005400C0000084000-000000067F000080000005400C0000088000__0000006CF69CD8B0", +"000000067F000080000005400C0000088000-000000067F000080000005400C000008C000__0000006CF69CD8B0", +"000000067F000080000005400C000008C000-000000067F000080000005400C0000090000__0000006CF69CD8B0", +"000000067F000080000005400C0000090000-000000067F000080000005400C0000094000__0000006CF69CD8B0", +"000000067F000080000005400C0000094000-000000067F000080000005400C0000098000__0000006CF69CD8B0", +"000000067F000080000005400C00000967BA-000000067F000080000005400C00000E2771__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C0000098000-000000067F000080000005400C000009C000__0000006CF69CD8B0", +"000000067F000080000005400C000009C000-000000067F000080000005400C00000A0000__0000006CF69CD8B0", +"000000067F000080000005400C00000A0000-000000067F000080000005400C00000A4000__0000006CF69CD8B0", +"000000067F000080000005400C00000A4000-000000067F000080000005400C00000A8000__0000006CF69CD8B0", +"000000067F000080000005400C00000A8000-000000067F000080000005400C00000AC000__0000006CF69CD8B0", +"000000067F000080000005400C00000AC000-000000067F000080000005400C00000B0000__0000006CF69CD8B0", +"000000067F000080000005400C00000B0000-000000067F000080000005400C00000B4000__0000006CF69CD8B0", +"000000067F000080000005400C00000B4000-000000067F000080000005400C00000B8000__0000006CF69CD8B0", +"000000067F000080000005400C00000B8000-000000067F000080000005400C00000BC000__0000006CF69CD8B0", +"000000067F000080000005400C00000BC000-000000067F000080000005400C00000C0000__0000006CF69CD8B0", +"000000067F000080000005400C00000C0000-000000067F000080000005400C00000C4000__0000006CF69CD8B0", +"000000067F000080000005400C00000C4000-000000067F000080000005400C00000C8000__0000006CF69CD8B0", +"000000067F000080000005400C00000C8000-000000067F000080000005400C00000CC000__0000006CF69CD8B0", +"000000067F000080000005400C00000CC000-000000067F000080000005400C00000D0000__0000006CF69CD8B0", +"000000067F000080000005400C00000D0000-000000067F000080000005400C00000D4000__0000006CF69CD8B0", +"000000067F000080000005400C00000D4000-000000067F000080000005400C00000D8000__0000006CF69CD8B0", +"000000067F000080000005400C00000D8000-000000067F000080000005400C00000DC000__0000006CF69CD8B0", +"000000067F000080000005400C00000DC000-000000067F000080000005400C00000E0000__0000006CF69CD8B0", +"000000067F000080000005400C00000E0000-000000067F000080000005400C00000E4000__0000006CF69CD8B0", +"000000067F000080000005400C00000E277B-000000067F00008000000540140000005B2E__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005400C00000E4000-000000067F000080000005400C00000E8000__0000006CF69CD8B0", +"000000067F000080000005400C00000E8000-000000067F000080000005400C00000EC000__0000006CF69CD8B0", +"000000067F000080000005400C00000EC000-000000067F000080000005400C00000F0000__0000006CF69CD8B0", +"000000067F000080000005400C00000F0000-000000067F000080000005400C00000F4000__0000006CF69CD8B0", +"000000067F000080000005400C00000F4000-000000067F000080000005400C00000F8000__0000006CF69CD8B0", +"000000067F000080000005400C00000F8000-000000067F000080000005400C00000FC000__0000006CF69CD8B0", +"000000067F000080000005400C00000FC000-000000067F000080000005400C0000100000__0000006CF69CD8B0", +"000000067F000080000005400C0000100000-000000067F000080000005400C0000104000__0000006CF69CD8B0", +"000000067F000080000005400C0000104000-000000067F000080000005400C0000108000__0000006CF69CD8B0", +"000000067F000080000005400C0000108000-000000067F000080000005400C000010C000__0000006CF69CD8B0", 
+"000000067F000080000005400C000010C000-000000067F000080000005400C0000110000__0000006CF69CD8B0", +"000000067F000080000005400C0000110000-000000067F00008000000540120100000000__0000006CF69CD8B0", +"000000067F000080000005400C0100000000-000000067F00008000000540140000004760__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000004760-000000067F0000800000054014000000BB51__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000005B2F-000000067F0000800000054014000001A04C__0000006C98B77D29-0000006CF7781D19", +"000000067F0000800000054014000000BB51-000000067F00008000000540140000012EFA__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000012EFA-000000067F0000800000054014000001A2E5__0000006C1E7C73C1-0000006C98B77D29", +"000000067F0000800000054014000001A04E-000000067F0000800000054016000000022B__0000006C98B77D29-0000006CF7781D19", +"000000067F0000800000054014000001A2E5-000000067F000080000005401400000216D5__0000006C1E7C73C1-0000006C98B77D29", +"000000067F000080000005401400000216D5-000000067F00008000000540140000028AD9__0000006C1E7C73C1-0000006C98B77D29", +"000000067F00008000000540140000028AD9-030000000000000000000000000000000002__0000006C1E7C73C1-0000006C98B77D29", +"000000067F0000800000054016000000022B-030000000000000000000000000000000002__0000006C98B77D29-0000006CF7781D19", +"000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006DA30DA180", +"000000067F000080000005600C0000000000-000000067F000080000005600C0000004000__0000006F949B7C08", +"000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006DA30DA180", +"000000067F000080000005600C0000004000-000000067F000080000005600C0000008000__0000006F949B7C08", +"000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006DA30DA180", +"000000067F000080000005600C0000008000-000000067F000080000005600C000000C000__0000006F949B7C08", +"000000067F000080000005600C0000008077-000000067F000080000005600C00000117CE__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006DA30DA180", +"000000067F000080000005600C000000C000-000000067F000080000005600C0000010000__0000006F949B7C08", +"000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006DA30DA180", +"000000067F000080000005600C0000010000-000000067F000080000005600C0000014000__0000006F949B7C08", +"000000067F000080000005600C00000117CE-000000067F000080000005600C000001AF0A__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006DA30DA180", +"000000067F000080000005600C0000014000-000000067F000080000005600C0000018000__0000006F949B7C08", +"000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006DA30DA180", +"000000067F000080000005600C0000018000-000000067F000080000005600C000001C000__0000006F949B7C08", +"000000067F000080000005600C000001AF0A-000000067F000080000005600C0000024670__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006DA30DA180", +"000000067F000080000005600C000001C000-000000067F000080000005600C0000020000__0000006F949B7C08", +"000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006DA30DA180", +"000000067F000080000005600C0000020000-000000067F000080000005600C0000024000__0000006F949B7C08", +"000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006DA30DA180", 
+"000000067F000080000005600C0000024000-000000067F000080000005600C0000028000__0000006F949B7C08", +"000000067F000080000005600C0000024670-000000067F000080000005600C000002DDD6__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006DA30DA180", +"000000067F000080000005600C0000028000-000000067F000080000005600C000002C000__0000006F949B7C08", +"000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006DA30DA180", +"000000067F000080000005600C000002C000-000000067F000080000005600C0000030000__0000006F949B7C08", +"000000067F000080000005600C000002DDD6-000000067F000080000005600C000003752A__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006DA30DA180", +"000000067F000080000005600C0000030000-000000067F000080000005600C0000034000__0000006F949B7C08", +"000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006DA30DA180", +"000000067F000080000005600C0000034000-000000067F000080000005600C0000038000__0000006F949B7C08", +"000000067F000080000005600C000003752A-000000067F000080000005600C0000040C90__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006DA30DA180", +"000000067F000080000005600C0000038000-000000067F000080000005600C000003C000__0000006F949B7C08", +"000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006DA30DA180", +"000000067F000080000005600C000003C000-000000067F000080000005600C0000040000__0000006F949B7C08", +"000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006DA30DA180", +"000000067F000080000005600C0000040000-000000067F000080000005600C0000044000__0000006F949B7C08", +"000000067F000080000005600C0000040C90-030000000000000000000000000000000002__0000006CF7781D19-0000006D69B48989", +"000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006DA30DA180", +"000000067F000080000005600C0000044000-000000067F000080000005600C0000048000__0000006F949B7C08", +"000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006DA30DA180", +"000000067F000080000005600C0000048000-000000067F000080000005600C000004C000__0000006F949B7C08", +"000000067F000080000005600C0000048643-000000067F000080000005600C00000907F3__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C000004BA9D-000000067F000080000005600C00000551D2__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006DA30DA180", +"000000067F000080000005600C000004C000-000000067F000080000005600C0000050000__0000006F949B7C08", +"000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006DA30DA180", +"000000067F000080000005600C0000050000-000000067F000080000005600C0000054000__0000006F949B7C08", +"000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006DA30DA180", +"000000067F000080000005600C0000054000-000000067F000080000005600C0000058000__0000006F949B7C08", +"000000067F000080000005600C00000551D2-000000067F000080000005600C000005E90B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006DA30DA180", +"000000067F000080000005600C0000058000-000000067F000080000005600C000005C000__0000006F949B7C08", +"000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006DA30DA180", 
+"000000067F000080000005600C000005C000-000000067F000080000005600C0000060000__0000006F949B7C08", +"000000067F000080000005600C000005E90B-000000067F000080000005600C000006802B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006DA30DA180", +"000000067F000080000005600C0000060000-000000067F000080000005600C0000064000__0000006F949B7C08", +"000000067F000080000005600C0000064000-000000067F000080000005600C0000068000__0000006F949B7C08", +"000000067F000080000005600C0000064000-030000000000000000000000000000000002__0000006DA30DA180", +"000000067F000080000005600C0000068000-000000067F000080000005600C000006C000__0000006F949B7C08", +"000000067F000080000005600C000006802B-000000067F000080000005600C0000071782__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000006C000-000000067F000080000005600C0000070000__0000006F949B7C08", +"000000067F000080000005600C0000070000-000000067F000080000005600C0000074000__0000006F949B7C08", +"000000067F000080000005600C0000071782-000000067F000080000005600C000007AEE8__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000074000-000000067F000080000005600C0000078000__0000006F949B7C08", +"000000067F000080000005600C0000078000-000000067F000080000005600C000007C000__0000006F949B7C08", +"000000067F000080000005600C000007AEE8-000000067F000080000005600C000008460B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000007C000-000000067F000080000005600C0000080000__0000006F949B7C08", +"000000067F000080000005600C0000080000-000000067F000080000005600C0000084000__0000006F949B7C08", +"000000067F000080000005600C0000084000-000000067F000080000005600C0000088000__0000006F949B7C08", +"000000067F000080000005600C000008460B-000000067F000080000005600C000008DD71__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000088000-000000067F000080000005600C000008C000__0000006F949B7C08", +"000000067F000080000005600C000008C000-000000067F000080000005600C0000090000__0000006F949B7C08", +"000000067F000080000005600C000008DD71-000000067F000080000005600C00000974D7__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000090000-000000067F000080000005600C0000094000__0000006F949B7C08", +"000000067F000080000005600C00000907F5-000000067F000080000005600C00000D90E0__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C0000094000-000000067F000080000005600C0000098000__0000006F949B7C08", +"000000067F000080000005600C00000974D7-000000067F000080000005600C00000A0C0B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000098000-000000067F000080000005600C000009C000__0000006F949B7C08", +"000000067F000080000005600C000009C000-000000067F000080000005600C00000A0000__0000006F949B7C08", +"000000067F000080000005600C00000A0000-000000067F000080000005600C00000A4000__0000006F949B7C08", +"000000067F000080000005600C00000A0C0B-000000067F000080000005600C00000AA371__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000A4000-000000067F000080000005600C00000A8000__0000006F949B7C08", +"000000067F000080000005600C00000A8000-000000067F000080000005600C00000AC000__0000006F949B7C08", +"000000067F000080000005600C00000AA371-000000067F000080000005600C00000B3AD7__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000AC000-000000067F000080000005600C00000B0000__0000006F949B7C08", +"000000067F000080000005600C00000B0000-000000067F000080000005600C00000B4000__0000006F949B7C08", +"000000067F000080000005600C00000B3AD7-000000067F000080000005600C00000BD20B__0000006D69B48989-0000006EB935F989", 
+"000000067F000080000005600C00000B4000-000000067F000080000005600C00000B8000__0000006F949B7C08", +"000000067F000080000005600C00000B8000-000000067F000080000005600C00000BC000__0000006F949B7C08", +"000000067F000080000005600C00000BC000-000000067F000080000005600C00000C0000__0000006F949B7C08", +"000000067F000080000005600C00000BD20B-000000067F000080000005600C00000C6932__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000C0000-000000067F000080000005600C00000C4000__0000006F949B7C08", +"000000067F000080000005600C00000C4000-000000067F000080000005600C00000C8000__0000006F949B7C08", +"000000067F000080000005600C00000C6932-000000067F000080000005600C00000D0098__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000C8000-000000067F000080000005600C00000CC000__0000006F949B7C08", +"000000067F000080000005600C00000CC000-000000067F000080000005600C00000D0000__0000006F949B7C08", +"000000067F000080000005600C00000D0000-000000067F000080000005600C00000D4000__0000006F949B7C08", +"000000067F000080000005600C00000D0098-000000067F000080000005600C00000D97FE__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000D4000-000000067F000080000005600C00000D8000__0000006F949B7C08", +"000000067F000080000005600C00000D8000-000000067F000080000005600C00000DC000__0000006F949B7C08", +"000000067F000080000005600C00000D90F8-000000067F00008000000560140000002A9A__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005600C00000D97FE-000000067F000080000005600C00000E2F0B__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000DC000-000000067F000080000005600C00000E0000__0000006F949B7C08", +"000000067F000080000005600C00000E0000-000000067F000080000005600C00000E4000__0000006F949B7C08", +"000000067F000080000005600C00000E2F0B-000000067F000080000005600C00000EC671__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000E4000-000000067F000080000005600C00000E8000__0000006F949B7C08", +"000000067F000080000005600C00000E8000-000000067F000080000005600C00000EC000__0000006F949B7C08", +"000000067F000080000005600C00000EC000-000000067F000080000005600C00000F0000__0000006F949B7C08", +"000000067F000080000005600C00000EC671-000000067F000080000005600C00000F5D9F__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000F0000-000000067F000080000005600C00000F4000__0000006F949B7C08", +"000000067F000080000005600C00000F4000-000000067F000080000005600C00000F8000__0000006F949B7C08", +"000000067F000080000005600C00000F5D9F-000000067F000080000005600C00000FF505__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C00000F8000-000000067F000080000005600C00000FC000__0000006F949B7C08", +"000000067F000080000005600C00000FC000-000000067F000080000005600C0000100000__0000006F949B7C08", +"000000067F000080000005600C00000FF505-000000067F000080000005600C0000108C10__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C0000100000-000000067F000080000005600C0000104000__0000006F949B7C08", +"000000067F000080000005600C0000100001-000000067F000080000005600C0000111BF7__0000006EB935F989-0000006F3370DD59", +"000000067F000080000005600C0000104000-000000067F000080000005600C0000108000__0000006F949B7C08", +"000000067F000080000005600C0000108000-000000067F000080000005600C000010C000__0000006F949B7C08", +"000000067F000080000005600C0000108C10-000000067F000080000005600C0100000000__0000006D69B48989-0000006EB935F989", +"000000067F000080000005600C000010C000-000000067F000080000005600C0000110000__0000006F949B7C08", +"000000067F000080000005600C0000110000-000000067F00008000000560120100000000__0000006F949B7C08", 
+"000000067F000080000005600C0000111BF7-000000067F0000800000056014000000451D__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000002A9A-000000067F00008000000560140000016143__0000006F3370DD59-0000006F95E72491", +"000000067F0000800000056014000000451D-000000067F0000800000056014000000B9A7__0000006EB935F989-0000006F3370DD59", +"000000067F0000800000056014000000B9A7-000000067F00008000000560140000012DE3__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000012DE3-000000067F0000800000056014000001A213__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000016143-000000067F00008000000560140000029CE0__0000006F3370DD59-0000006F95E72491", +"000000067F0000800000056014000001A213-000000067F00008000000560140000021666__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000021666-000000067F00008000000560140000028A7C__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000028A7C-030000000000000000000000000000000002__0000006EB935F989-0000006F3370DD59", +"000000067F00008000000560140000029CE2-030000000000000000000000000000000002__0000006F3370DD59-0000006F95E72491", +"000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__0000006FAFE25518", +"000000067F000080000005800C0000000000-000000067F000080000005800C0000004000__00000071F15CF6B0", +"000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__0000006FAFE25518", +"000000067F000080000005800C0000004000-000000067F000080000005800C0000008000__00000071F15CF6B0", +"000000067F000080000005800C0000007A49-030000000000000000000000000000000002__0000006F95E72491-0000006FA8EDF3B9", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000006FAFE25518", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__0000007168C9DFF8", +"000000067F000080000005800C0000008000-000000067F000080000005800C000000C000__00000072377CDB60", +"000000067F000080000005800C00000096DE-000000067F000080000005800C0000012E0C__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__0000007168C9DFF8", +"000000067F000080000005800C000000C000-000000067F000080000005800C0000010000__00000072377CDB60", +"000000067F000080000005800C000000C000-030000000000000000000000000000000002__0000006FAFE25518", +"000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__0000007168C9DFF8", +"000000067F000080000005800C0000010000-000000067F000080000005800C0000014000__00000072377CDB60", +"000000067F000080000005800C0000012E0C-000000067F000080000005800C000001C572__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__0000007168C9DFF8", +"000000067F000080000005800C0000014000-000000067F000080000005800C0000018000__00000072377CDB60", +"000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__0000007168C9DFF8", +"000000067F000080000005800C0000018000-000000067F000080000005800C000001C000__00000072377CDB60", +"000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__0000007168C9DFF8", +"000000067F000080000005800C000001C000-000000067F000080000005800C0000020000__00000072377CDB60", +"000000067F000080000005800C000001C572-000000067F000080000005800C0000025CD8__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__0000007168C9DFF8", +"000000067F000080000005800C0000020000-000000067F000080000005800C0000024000__00000072377CDB60", 
+"000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__0000007168C9DFF8", +"000000067F000080000005800C0000024000-000000067F000080000005800C0000028000__00000072377CDB60", +"000000067F000080000005800C0000025CD8-000000067F000080000005800C000002F40B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__0000007168C9DFF8", +"000000067F000080000005800C0000028000-000000067F000080000005800C000002C000__00000072377CDB60", +"000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__0000007168C9DFF8", +"000000067F000080000005800C000002C000-000000067F000080000005800C0000030000__00000072377CDB60", +"000000067F000080000005800C000002F40B-000000067F000080000005800C0000038B1E__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__0000007168C9DFF8", +"000000067F000080000005800C0000030000-000000067F000080000005800C0000034000__00000072377CDB60", +"000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__0000007168C9DFF8", +"000000067F000080000005800C0000034000-000000067F000080000005800C0000038000__00000072377CDB60", +"000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__0000007168C9DFF8", +"000000067F000080000005800C0000038000-000000067F000080000005800C000003C000__00000072377CDB60", +"000000067F000080000005800C0000038B1E-000000067F000080000005800C0000042284__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__0000007168C9DFF8", +"000000067F000080000005800C000003C000-000000067F000080000005800C0000040000__00000072377CDB60", +"000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__0000007168C9DFF8", +"000000067F000080000005800C0000040000-000000067F000080000005800C0000044000__00000072377CDB60", +"000000067F000080000005800C0000042284-000000067F000080000005800C000004B9EA__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__0000007168C9DFF8", +"000000067F000080000005800C0000044000-000000067F000080000005800C0000048000__00000072377CDB60", +"000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__0000007168C9DFF8", +"000000067F000080000005800C0000048000-000000067F000080000005800C000004C000__00000072377CDB60", +"000000067F000080000005800C000004B9EA-000000067F000080000005800C000005510B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__0000007168C9DFF8", +"000000067F000080000005800C000004C000-000000067F000080000005800C0000050000__00000072377CDB60", +"000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__0000007168C9DFF8", +"000000067F000080000005800C0000050000-000000067F000080000005800C0000054000__00000072377CDB60", +"000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__0000007168C9DFF8", +"000000067F000080000005800C0000054000-000000067F000080000005800C0000058000__00000072377CDB60", +"000000067F000080000005800C000005510B-000000067F000080000005800C000005E871__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__0000007168C9DFF8", +"000000067F000080000005800C0000058000-000000067F000080000005800C000005C000__00000072377CDB60", +"000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__0000007168C9DFF8", 
+"000000067F000080000005800C000005C000-000000067F000080000005800C0000060000__00000072377CDB60", +"000000067F000080000005800C000005CF08-000000067F000080000005800C00000BAF56__00000071F21624D1-000000723877FF21", +"000000067F000080000005800C000005E871-000000067F000080000005800C0000067F8B__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__0000007168C9DFF8", +"000000067F000080000005800C0000060000-000000067F000080000005800C0000064000__00000072377CDB60", +"000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__0000007168C9DFF8", +"000000067F000080000005800C0000064000-000000067F000080000005800C0000068000__00000072377CDB60", +"000000067F000080000005800C0000067F8B-000000067F000080000005800C0100000000__0000006FA8EDF3B9-0000007048B1EC09", +"000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__0000007168C9DFF8", +"000000067F000080000005800C0000068000-000000067F000080000005800C000006C000__00000072377CDB60", +"000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__0000007168C9DFF8", +"000000067F000080000005800C000006C000-000000067F000080000005800C0000070000__00000072377CDB60", +"000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__0000007168C9DFF8", +"000000067F000080000005800C0000070000-000000067F000080000005800C0000074000__00000072377CDB60", +"000000067F000080000005800C0000071854-000000067F000080000005800C000007AFBA__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__0000007168C9DFF8", +"000000067F000080000005800C0000074000-000000067F000080000005800C0000078000__00000072377CDB60", +"000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__0000007168C9DFF8", +"000000067F000080000005800C0000078000-000000067F000080000005800C000007C000__00000072377CDB60", +"000000067F000080000005800C000007AFBA-000000067F000080000005800C0000084720__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__0000007168C9DFF8", +"000000067F000080000005800C000007C000-000000067F000080000005800C0000080000__00000072377CDB60", +"000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__0000007168C9DFF8", +"000000067F000080000005800C0000080000-000000067F000080000005800C0000084000__00000072377CDB60", +"000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__0000007168C9DFF8", +"000000067F000080000005800C0000084000-000000067F000080000005800C0000088000__00000072377CDB60", +"000000067F000080000005800C0000084720-000000067F000080000005800C000008DE86__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__0000007168C9DFF8", +"000000067F000080000005800C0000088000-000000067F000080000005800C000008C000__00000072377CDB60", +"000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__0000007168C9DFF8", +"000000067F000080000005800C000008C000-000000067F000080000005800C0000090000__00000072377CDB60", +"000000067F000080000005800C000008DE86-000000067F000080000005800C00000975A6__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__0000007168C9DFF8", +"000000067F000080000005800C0000090000-000000067F000080000005800C0000094000__00000072377CDB60", +"000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__0000007168C9DFF8", 
+"000000067F000080000005800C0000094000-000000067F000080000005800C0000098000__00000072377CDB60", +"000000067F000080000005800C00000975A6-000000067F000080000005800C00000A0D0C__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__0000007168C9DFF8", +"000000067F000080000005800C0000098000-000000067F000080000005800C000009C000__00000072377CDB60", +"000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__0000007168C9DFF8", +"000000067F000080000005800C000009C000-000000067F000080000005800C00000A0000__00000072377CDB60", +"000000067F000080000005800C000009D78D-000000067F000080000005800C0200000018__000000716A103FC9-00000071F21624D1", +"000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__0000007168C9DFF8", +"000000067F000080000005800C00000A0000-000000067F000080000005800C00000A4000__00000072377CDB60", +"000000067F000080000005800C00000A0D0C-000000067F000080000005800C00000AA472__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__0000007168C9DFF8", +"000000067F000080000005800C00000A4000-000000067F000080000005800C00000A8000__00000072377CDB60", +"000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__0000007168C9DFF8", +"000000067F000080000005800C00000A8000-000000067F000080000005800C00000AC000__00000072377CDB60", +"000000067F000080000005800C00000AA472-000000067F000080000005800C00000B3BB4__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__0000007168C9DFF8", +"000000067F000080000005800C00000AC000-000000067F000080000005800C00000B0000__00000072377CDB60", +"000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__0000007168C9DFF8", +"000000067F000080000005800C00000B0000-000000067F000080000005800C00000B4000__00000072377CDB60", +"000000067F000080000005800C00000B3BB4-000000067F000080000005800C00000BD30B__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__0000007168C9DFF8", +"000000067F000080000005800C00000B4000-000000067F000080000005800C00000B8000__00000072377CDB60", +"000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__0000007168C9DFF8", +"000000067F000080000005800C00000B8000-000000067F000080000005800C00000BC000__00000072377CDB60", +"000000067F000080000005800C00000BAF5F-000000067F000080000005801400000007C1__00000071F21624D1-000000723877FF21", +"000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__0000007168C9DFF8", +"000000067F000080000005800C00000BC000-000000067F000080000005800C00000C0000__00000072377CDB60", +"000000067F000080000005800C00000BD30B-000000067F000080000005800C00000C6A32__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__0000007168C9DFF8", +"000000067F000080000005800C00000C0000-000000067F000080000005800C00000C4000__00000072377CDB60", +"000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__0000007168C9DFF8", +"000000067F000080000005800C00000C4000-000000067F000080000005800C00000C8000__00000072377CDB60", +"000000067F000080000005800C00000C6A32-000000067F000080000005800C0100000000__0000007048B1EC09-00000070E8761431", +"000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__0000007168C9DFF8", +"000000067F000080000005800C00000C8000-000000067F000080000005800C00000CC000__00000072377CDB60", 
+"000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__0000007168C9DFF8", +"000000067F000080000005800C00000CC000-000000067F000080000005800C00000D0000__00000072377CDB60", +"000000067F000080000005800C00000CDE2D-000000067F000080000005800C00000D754D__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__0000007168C9DFF8", +"000000067F000080000005800C00000D0000-000000067F000080000005800C00000D4000__00000072377CDB60", +"000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__0000007168C9DFF8", +"000000067F000080000005800C00000D4000-000000067F000080000005800C00000D8000__00000072377CDB60", +"000000067F000080000005800C00000D754D-000000067F000080000005800C00000E0CB3__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__0000007168C9DFF8", +"000000067F000080000005800C00000D8000-000000067F000080000005800C00000DC000__00000072377CDB60", +"000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__0000007168C9DFF8", +"000000067F000080000005800C00000DC000-000000067F000080000005800C00000E0000__00000072377CDB60", +"000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__0000007168C9DFF8", +"000000067F000080000005800C00000E0000-000000067F000080000005800C00000E4000__00000072377CDB60", +"000000067F000080000005800C00000E0CB3-000000067F000080000005800C00000EA409__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__0000007168C9DFF8", +"000000067F000080000005800C00000E4000-000000067F000080000005800C00000E8000__00000072377CDB60", +"000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__0000007168C9DFF8", +"000000067F000080000005800C00000E8000-000000067F000080000005800C00000EC000__00000072377CDB60", +"000000067F000080000005800C00000EA409-000000067F000080000005800C00000F3B4B__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__0000007168C9DFF8", +"000000067F000080000005800C00000EC000-000000067F000080000005800C00000F0000__00000072377CDB60", +"000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__0000007168C9DFF8", +"000000067F000080000005800C00000F0000-000000067F000080000005800C00000F4000__00000072377CDB60", +"000000067F000080000005800C00000F3B4B-000000067F000080000005800C00000FD2B1__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__0000007168C9DFF8", +"000000067F000080000005800C00000F4000-000000067F000080000005800C00000F8000__00000072377CDB60", +"000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__0000007168C9DFF8", +"000000067F000080000005800C00000F8000-000000067F000080000005800C00000FC000__00000072377CDB60", +"000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__0000007168C9DFF8", +"000000067F000080000005800C00000FC000-000000067F000080000005800C0000100000__00000072377CDB60", +"000000067F000080000005800C00000FD2B1-000000067F000080000005800C00001069D8__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__0000007168C9DFF8", +"000000067F000080000005800C0000100000-000000067F000080000005800C0000104000__00000072377CDB60", +"000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__0000007168C9DFF8", 
+"000000067F000080000005800C0000104000-000000067F000080000005800C0000108000__00000072377CDB60", +"000000067F000080000005800C00001069D8-000000067F000080000005800C000011010C__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__0000007168C9DFF8", +"000000067F000080000005800C0000108000-000000067F000080000005800C000010C000__00000072377CDB60", +"000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__0000007168C9DFF8", +"000000067F000080000005800C000010C000-000000067F000080000005800C0000110000__00000072377CDB60", +"000000067F000080000005800C0000110000-000000067F00008000000580120100000000__00000072377CDB60", +"000000067F000080000005800C0000110000-030000000000000000000000000000000002__0000007168C9DFF8", +"000000067F000080000005800C000011010C-01000000000000000100000002000000001E__00000070E8761431-000000716A103FC9", +"000000067F000080000005800C0200000018-000000067F000080000005801400000059BE__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000000000-000000067F00008000000580140000004000__00000072377CDB60", +"000000067F000080000005801400000007C3-000000067F00008000000580140000020462__00000071F21624D1-000000723877FF21", +"000000067F00008000000580140000004000-000000067F00008000000580140000008000__00000072377CDB60", +"000000067F000080000005801400000059BE-000000067F0000800000058014000000BF38__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000008000-000000067F0000800000058014000000C000__00000072377CDB60", +"000000067F0000800000058014000000BF38-000000067F00008000000580140000012530__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000000C000-000000067F00008000000580140000010000__00000072377CDB60", +"000000067F00008000000580140000010000-000000067F00008000000580140000014000__00000072377CDB60", +"000000067F00008000000580140000012530-000000067F00008000000580140000018B50__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000014000-000000067F00008000000580140000018000__00000072377CDB60", +"000000067F00008000000580140000018000-000000067F0000800000058014000001C000__00000072377CDB60", +"000000067F00008000000580140000018B50-000000067F0000800000058014000001F0D3__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000001C000-000000067F00008000000580140000020000__00000072377CDB60", +"000000067F0000800000058014000001F0D3-000000067F0000800000058014000002562B__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000020000-000000067F00008000000580140000024000__00000072377CDB60", +"000000067F00008000000580140000020464-030000000000000000000000000000000002__00000071F21624D1-000000723877FF21", +"000000067F00008000000580140000024000-000000067F00008000000580140000028000__00000072377CDB60", +"000000067F0000800000058014000002562B-000000067F0000800000058014000002BC37__000000716A103FC9-00000071F21624D1", +"000000067F00008000000580140000028000-000000067F0000800000058014000002C000__00000072377CDB60", +"000000067F0000800000058014000002BC37-030000000000000000000000000000000002__000000716A103FC9-00000071F21624D1", +"000000067F0000800000058014000002C000-030000000000000000000000000000000002__00000072377CDB60", +"000000067F000080000005A00C0000007614-000000067F000080000005A00C000000ED44__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005A00C000000ED44-000000067F000080000005A00C0000016337__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005A00C0000016337-000000067F000080000005A014000000148C__000000723877FF21-00000072A0D7CEA1", 
+"000000067F000080000005A014000000148C-000000067F000080000005C00C0000003207__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C0000003207-000000067F000080000005C00C000000C96D__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C000000C96D-030000000000000000000000000000000002__000000723877FF21-00000072A0D7CEA1", +"000000067F000080000005C00C0000016516-000000067F000080000005C0140000001694__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005C0140000001694-000000067F000080000005E00C000000360C__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__00000073AF75E930", +"000000067F000080000005E00C0000000000-000000067F000080000005E00C0000004000__000000756884A510", +"000000067F000080000005E00C000000360C-000000067F000080000005E00C000000CD72__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__00000073AF75E930", +"000000067F000080000005E00C0000004000-000000067F000080000005E00C0000008000__000000756884A510", +"000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__00000073AF75E930", +"000000067F000080000005E00C0000008000-000000067F000080000005E00C000000C000__000000756884A510", +"000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__00000073AF75E930", +"000000067F000080000005E00C000000C000-000000067F000080000005E00C0000010000__000000756884A510", +"000000067F000080000005E00C000000CD72-000000067F000080000005E00C00000164D8__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__00000073AF75E930", +"000000067F000080000005E00C0000010000-000000067F000080000005E00C0000014000__000000756884A510", +"000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__00000073AF75E930", +"000000067F000080000005E00C0000014000-000000067F000080000005E00C0000018000__000000756884A510", +"000000067F000080000005E00C00000164D8-000000067F000080000005E00C000001FC0B__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__00000073AF75E930", +"000000067F000080000005E00C0000018000-000000067F000080000005E00C000001C000__000000756884A510", +"000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__00000073AF75E930", +"000000067F000080000005E00C000001C000-000000067F000080000005E00C0000020000__000000756884A510", +"000000067F000080000005E00C000001FC0B-000000067F000080000005E00C0000029319__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__00000073AF75E930", +"000000067F000080000005E00C0000020000-000000067F000080000005E00C0000024000__000000756884A510", +"000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__00000073AF75E930", +"000000067F000080000005E00C0000024000-000000067F000080000005E00C0000028000__000000756884A510", +"000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__00000073AF75E930", +"000000067F000080000005E00C0000028000-000000067F000080000005E00C000002C000__000000756884A510", +"000000067F000080000005E00C0000029319-030000000000000000000000000000000002__00000072A0D7CEA1-0000007318DDE691", +"000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__00000073AF75E930", +"000000067F000080000005E00C000002C000-000000067F000080000005E00C0000030000__000000756884A510", +"000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__00000073AF75E930", 
+"000000067F000080000005E00C0000030000-000000067F000080000005E00C0000034000__000000756884A510", +"000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__00000073AF75E930", +"000000067F000080000005E00C0000034000-000000067F000080000005E00C0000038000__000000756884A510", +"000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__00000073AF75E930", +"000000067F000080000005E00C0000038000-000000067F000080000005E00C000003C000__000000756884A510", +"000000067F000080000005E00C00000385D9-000000067F000080000005E00C0000041D0A__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__00000073AF75E930", +"000000067F000080000005E00C000003C000-000000067F000080000005E00C0000040000__000000756884A510", +"000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__00000073AF75E930", +"000000067F000080000005E00C0000040000-000000067F000080000005E00C0000044000__000000756884A510", +"000000067F000080000005E00C0000041D0A-000000067F000080000005E00C000004B470__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__00000073AF75E930", +"000000067F000080000005E00C0000044000-000000067F000080000005E00C0000048000__000000756884A510", +"000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__00000073AF75E930", +"000000067F000080000005E00C0000048000-000000067F000080000005E00C000004C000__000000756884A510", +"000000067F000080000005E00C000004B470-000000067F000080000005E00C0000054BA9__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__00000073AF75E930", +"000000067F000080000005E00C000004C000-000000067F000080000005E00C0000050000__000000756884A510", +"000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__00000073AF75E930", +"000000067F000080000005E00C0000050000-000000067F000080000005E00C0000054000__000000756884A510", +"000000067F000080000005E00C000005017A-000000067F000080000005E00C000009FEAD__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__00000073AF75E930", +"000000067F000080000005E00C0000054000-000000067F000080000005E00C0000058000__000000756884A510", +"000000067F000080000005E00C0000054BA9-000000067F000080000005E00C000005E30B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__00000073AF75E930", +"000000067F000080000005E00C0000058000-000000067F000080000005E00C000005C000__000000756884A510", +"000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__00000073AF75E930", +"000000067F000080000005E00C000005C000-000000067F000080000005E00C0000060000__000000756884A510", +"000000067F000080000005E00C000005E30B-000000067F000080000005E00C0000067A2C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__00000073AF75E930", +"000000067F000080000005E00C0000060000-000000067F000080000005E00C0000064000__000000756884A510", +"000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__00000073AF75E930", +"000000067F000080000005E00C0000064000-000000067F000080000005E00C0000068000__000000756884A510", +"000000067F000080000005E00C0000067A2C-000000067F000080000005E00C0000071187__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__00000073AF75E930", 
+"000000067F000080000005E00C0000068000-000000067F000080000005E00C000006C000__000000756884A510", +"000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__00000073AF75E930", +"000000067F000080000005E00C000006C000-000000067F000080000005E00C0000070000__000000756884A510", +"000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__00000073AF75E930", +"000000067F000080000005E00C0000070000-000000067F000080000005E00C0000074000__000000756884A510", +"000000067F000080000005E00C0000071187-000000067F000080000005E00C000007A8ED__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__00000073AF75E930", +"000000067F000080000005E00C0000074000-000000067F000080000005E00C0000078000__000000756884A510", +"000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__00000073AF75E930", +"000000067F000080000005E00C0000078000-000000067F000080000005E00C000007C000__000000756884A510", +"000000067F000080000005E00C000007A8ED-000000067F000080000005E00C000008400B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__00000073AF75E930", +"000000067F000080000005E00C000007C000-000000067F000080000005E00C0000080000__000000756884A510", +"000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__00000073AF75E930", +"000000067F000080000005E00C0000080000-000000067F000080000005E00C0000084000__000000756884A510", +"000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__00000073AF75E930", +"000000067F000080000005E00C0000084000-000000067F000080000005E00C0000088000__000000756884A510", +"000000067F000080000005E00C000008400B-000000067F000080000005E00C000008D771__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000088000-000000067F000080000005E00C000008C000__000000756884A510", +"000000067F000080000005E00C0000088000-030000000000000000000000000000000002__00000073AF75E930", +"000000067F000080000005E00C000008C000-000000067F000080000005E00C0000090000__000000756884A510", +"000000067F000080000005E00C000008D771-000000067F000080000005E00C0000096ED7__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000090000-000000067F000080000005E00C0000094000__000000756884A510", +"000000067F000080000005E00C0000094000-000000067F000080000005E00C0000098000__000000756884A510", +"000000067F000080000005E00C0000096ED7-000000067F000080000005E00C00000A060B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000098000-000000067F000080000005E00C000009C000__000000756884A510", +"000000067F000080000005E00C000009C000-000000067F000080000005E00C00000A0000__000000756884A510", +"000000067F000080000005E00C000009FEB2-000000067F000080000005E00C00000EF4ED__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C00000A0000-000000067F000080000005E00C00000A4000__000000756884A510", +"000000067F000080000005E00C00000A060B-000000067F000080000005E00C00000A9D71__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000A4000-000000067F000080000005E00C00000A8000__000000756884A510", +"000000067F000080000005E00C00000A8000-000000067F000080000005E00C00000AC000__000000756884A510", +"000000067F000080000005E00C00000A9D71-000000067F000080000005E00C00000B34D7__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000AC000-000000067F000080000005E00C00000B0000__000000756884A510", +"000000067F000080000005E00C00000AF576-000000067F000080000005E00C0200000023__0000007497B01FF9-000000751253A4C1", 
+"000000067F000080000005E00C00000B0000-000000067F000080000005E00C00000B4000__000000756884A510", +"000000067F000080000005E00C00000B34D7-000000067F000080000005E00C00000BCC0C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000B4000-000000067F000080000005E00C00000B8000__000000756884A510", +"000000067F000080000005E00C00000B8000-000000067F000080000005E00C00000BC000__000000756884A510", +"000000067F000080000005E00C00000BC000-000000067F000080000005E00C00000C0000__000000756884A510", +"000000067F000080000005E00C00000BCC0C-000000067F000080000005E00C00000C6336__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000C0000-000000067F000080000005E00C00000C4000__000000756884A510", +"000000067F000080000005E00C00000C4000-000000067F000080000005E00C00000C8000__000000756884A510", +"000000067F000080000005E00C00000C6336-000000067F000080000005E00C00000CFA9C__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000C8000-000000067F000080000005E00C00000CC000__000000756884A510", +"000000067F000080000005E00C00000CC000-000000067F000080000005E00C00000D0000__000000756884A510", +"000000067F000080000005E00C00000CFA9C-000000067F000080000005E00C00000D91AB__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000D0000-000000067F000080000005E00C00000D4000__000000756884A510", +"000000067F000080000005E00C00000D4000-000000067F000080000005E00C00000D8000__000000756884A510", +"000000067F000080000005E00C00000D8000-000000067F000080000005E00C00000DC000__000000756884A510", +"000000067F000080000005E00C00000D91AB-000000067F000080000005E00C00000E2911__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000DC000-000000067F000080000005E00C00000E0000__000000756884A510", +"000000067F000080000005E00C00000E0000-000000067F000080000005E00C00000E4000__000000756884A510", +"000000067F000080000005E00C00000E2911-000000067F000080000005E00C00000EC077__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000E4000-000000067F000080000005E00C00000E8000__000000756884A510", +"000000067F000080000005E00C00000E8000-000000067F000080000005E00C00000EC000__000000756884A510", +"000000067F000080000005E00C00000EC000-000000067F000080000005E00C00000F0000__000000756884A510", +"000000067F000080000005E00C00000EC077-000000067F000080000005E00C00000F57A8__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000EF4F1-000000067F000080000005E014000000BDDE__000000751253A4C1-00000075687C3009", +"000000067F000080000005E00C00000F0000-000000067F000080000005E00C00000F4000__000000756884A510", +"000000067F000080000005E00C00000F4000-000000067F000080000005E00C00000F8000__000000756884A510", +"000000067F000080000005E00C00000F57A8-000000067F000080000005E00C00000FEF0A__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C00000F8000-000000067F000080000005E00C00000FC000__000000756884A510", +"000000067F000080000005E00C00000FC000-000000067F000080000005E00C0000100000__000000756884A510", +"000000067F000080000005E00C00000FEF0A-000000067F000080000005E00C000010862B__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C0000100000-000000067F000080000005E00C0000104000__000000756884A510", +"000000067F000080000005E00C0000104000-000000067F000080000005E00C0000108000__000000756884A510", +"000000067F000080000005E00C0000108000-000000067F000080000005E00C000010C000__000000756884A510", +"000000067F000080000005E00C000010862B-000000067F000080000005E00C0000111C20__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C000010C000-000000067F000080000005E00C0000110000__000000756884A510", 
+"000000067F000080000005E00C0000110000-000000067F000080000005E0120100000000__000000756884A510", +"000000067F000080000005E00C00FFFFFFFF-010000000000000001000000030000000002__0000007318DDE691-0000007497B01FF9", +"000000067F000080000005E00C02FFFFFFFF-000000067F000080000005E0140000006C41__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000000000-000000067F000080000005E0140000004000__000000756884A510", +"000000067F000080000005E0140000004000-000000067F000080000005E0140000008000__000000756884A510", +"000000067F000080000005E0140000006C41-000000067F000080000005E014000000D890__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000008000-000000067F000080000005E014000000C000__000000756884A510", +"000000067F000080000005E014000000BDDE-000000067F000080000005E0140000023A18__000000751253A4C1-00000075687C3009", +"000000067F000080000005E014000000C000-000000067F000080000005E0140000010000__000000756884A510", +"000000067F000080000005E014000000D890-000000067F000080000005E01400000144C8__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000010000-000000067F000080000005E0140000014000__000000756884A510", +"000000067F000080000005E0140000014000-000000067F000080000005E0140000018000__000000756884A510", +"000000067F000080000005E01400000144C8-000000067F000080000005E014000001B1AC__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000018000-000000067F000080000005E014000001C000__000000756884A510", +"000000067F000080000005E014000001B1AC-000000067F000080000005E0140000021E03__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E014000001C000-000000067F000080000005E0140000020000__000000756884A510", +"000000067F000080000005E0140000020000-000000067F000080000005E0140000024000__000000756884A510", +"000000067F000080000005E0140000021E03-000000067F000080000005E0140000028A36__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E0140000023A18-030000000000000000000000000000000002__000000751253A4C1-00000075687C3009", +"000000067F000080000005E0140000024000-000000067F000080000005E0140000028000__000000756884A510", +"000000067F000080000005E0140000028000-000000067F000080000005E014000002C000__000000756884A510", +"000000067F000080000005E0140000028A36-030000000000000000000000000000000002__0000007497B01FF9-000000751253A4C1", +"000000067F000080000005E014000002C000-030000000000000000000000000000000002__000000756884A510", +"000000067F000080000006000C0000000000-000000067F000080000006000C0000004000__00000077B1836CA0", +"000000067F000080000006000C0000004000-000000067F000080000006000C0000008000__00000077B1836CA0", +"000000067F000080000006000C0000008000-000000067F000080000006000C000000C000__00000077B1836CA0", +"000000067F000080000006000C0000008FB7-000000067F000080000006000C000001271D__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000000C000-000000067F000080000006000C0000010000__00000077B1836CA0", +"000000067F000080000006000C0000010000-000000067F000080000006000C0000014000__00000077B1836CA0", +"000000067F000080000006000C000001271D-000000067F000080000006000C000001BE83__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000014000-000000067F000080000006000C0000018000__00000077B1836CA0", +"000000067F000080000006000C0000018000-000000067F000080000006000C000001C000__00000077B1836CA0", +"000000067F000080000006000C000001BE83-000000067F000080000006000C00000255B6__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000001C000-000000067F000080000006000C0000020000__00000077B1836CA0", 
+"000000067F000080000006000C0000020000-000000067F000080000006000C0000024000__00000077B1836CA0", +"000000067F000080000006000C0000024000-000000067F000080000006000C0000028000__00000077B1836CA0", +"000000067F000080000006000C00000255B6-000000067F000080000006000C000002ED0B__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000028000-000000067F000080000006000C000002C000__00000077B1836CA0", +"000000067F000080000006000C000002C000-000000067F000080000006000C0000030000__00000077B1836CA0", +"000000067F000080000006000C000002ED0B-000000067F000080000006000C000003842B__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000030000-000000067F000080000006000C0000034000__00000077B1836CA0", +"000000067F000080000006000C0000034000-000000067F000080000006000C0000038000__00000077B1836CA0", +"000000067F000080000006000C0000038000-000000067F000080000006000C000003C000__00000077B1836CA0", +"000000067F000080000006000C000003842B-000000067F000080000006000C0000041B80__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000003C000-000000067F000080000006000C0000040000__00000077B1836CA0", +"000000067F000080000006000C0000040000-000000067F000080000006000C0000044000__00000077B1836CA0", +"000000067F000080000006000C0000041B80-000000067F000080000006000C000004B2E6__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C0000044000-000000067F000080000006000C0000048000__00000077B1836CA0", +"000000067F000080000006000C0000048000-000000067F000080000006000C000004C000__0000007739203FF0", +"000000067F000080000006000C000004B2E6-030000000000000000000000000000000002__00000075687C3009-00000075E915EBC9", +"000000067F000080000006000C000004BAC2-000000067F000080000006000C00000551F7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000004C000-000000067F000080000006000C0000050000__0000007739203FF0", +"000000067F000080000006000C0000050000-000000067F000080000006000C0000054000__0000007739203FF0", +"000000067F000080000006000C0000051A05-000000067F000080000006000C00000A4D93__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C0000054000-000000067F000080000006000C0000058000__0000007739203FF0", +"000000067F000080000006000C00000551F7-000000067F000080000006000C000005E90B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000058000-000000067F000080000006000C000005C000__0000007739203FF0", +"000000067F000080000006000C000005C000-000000067F000080000006000C0000060000__0000007739203FF0", +"000000067F000080000006000C000005E90B-000000067F000080000006000C000006802B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000060000-000000067F000080000006000C0000064000__0000007739203FF0", +"000000067F000080000006000C0000064000-000000067F000080000006000C0000068000__0000007739203FF0", +"000000067F000080000006000C0000068000-000000067F000080000006000C000006C000__0000007739203FF0", +"000000067F000080000006000C000006802B-000000067F000080000006000C0000071782__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000006C000-000000067F000080000006000C0000070000__0000007739203FF0", +"000000067F000080000006000C0000070000-000000067F000080000006000C0000074000__0000007739203FF0", +"000000067F000080000006000C0000071782-000000067F000080000006000C000007AEE8__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000074000-000000067F000080000006000C0000078000__0000007739203FF0", +"000000067F000080000006000C0000078000-000000067F000080000006000C000007C000__0000007739203FF0", 
+"000000067F000080000006000C000007AEE8-000000067F000080000006000C000008460B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C000007C000-000000067F000080000006000C0000080000__0000007739203FF0", +"000000067F000080000006000C0000080000-000000067F000080000006000C0000084000__0000007739203FF0", +"000000067F000080000006000C0000084000-000000067F000080000006000C0000088000__0000007739203FF0", +"000000067F000080000006000C000008460B-000000067F000080000006000C000008DD71__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000088000-000000067F000080000006000C000008C000__0000007739203FF0", +"000000067F000080000006000C000008C000-000000067F000080000006000C0000090000__0000007739203FF0", +"000000067F000080000006000C000008DD71-000000067F000080000006000C00000974D7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000090000-000000067F000080000006000C0000094000__0000007739203FF0", +"000000067F000080000006000C0000094000-000000067F000080000006000C0000098000__0000007739203FF0", +"000000067F000080000006000C00000974D7-000000067F000080000006000C00000A0C0B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C0000098000-000000067F000080000006000C000009C000__0000007739203FF0", +"000000067F000080000006000C000009C000-000000067F000080000006000C00000A0000__0000007739203FF0", +"000000067F000080000006000C00000A0000-000000067F000080000006000C00000A4000__0000007739203FF0", +"000000067F000080000006000C00000A0C0B-000000067F000080000006000C00000AA371__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000A4000-000000067F000080000006000C00000A8000__0000007739203FF0", +"000000067F000080000006000C00000A4D95-000000067F000080000006000C00000F7C7B__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C00000A8000-000000067F000080000006000C00000AC000__0000007739203FF0", +"000000067F000080000006000C00000AA371-000000067F000080000006000C00000B3AD7__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000AC000-000000067F000080000006000C00000B0000__0000007739203FF0", +"000000067F000080000006000C00000B0000-000000067F000080000006000C00000B4000__0000007739203FF0", +"000000067F000080000006000C00000B3AD7-000000067F000080000006000C00000BD20B__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000B4000-000000067F000080000006000C00000B8000__0000007739203FF0", +"000000067F000080000006000C00000B8000-000000067F000080000006000C00000BC000__0000007739203FF0", +"000000067F000080000006000C00000BC000-000000067F000080000006000C00000C0000__0000007739203FF0", +"000000067F000080000006000C00000BD20B-000000067F000080000006000C0100000000__00000075E915EBC9-00000076A8CDE8F9", +"000000067F000080000006000C00000C0000-000000067F000080000006000C00000C4000__0000007739203FF0", +"000000067F000080000006000C00000C3C38-000000067F00008000000600140000001B38__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006000C00000C4000-000000067F000080000006000C00000C8000__0000007739203FF0", +"000000067F000080000006000C00000C56C1-000000067F000080000006000C00000CEE0A__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000C8000-000000067F000080000006000C00000CC000__0000007739203FF0", +"000000067F000080000006000C00000CC000-000000067F000080000006000C00000D0000__0000007739203FF0", +"000000067F000080000006000C00000CEE0A-000000067F000080000006000C00000D8520__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000D0000-000000067F000080000006000C00000D4000__0000007739203FF0", 
+"000000067F000080000006000C00000D4000-000000067F000080000006000C00000D8000__0000007739203FF0", +"000000067F000080000006000C00000D8000-000000067F000080000006000C00000DC000__0000007739203FF0", +"000000067F000080000006000C00000D8520-000000067F000080000006000C00000E1C86__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000DC000-000000067F000080000006000C00000E0000__0000007739203FF0", +"000000067F000080000006000C00000E0000-000000067F000080000006000C00000E4000__0000007739203FF0", +"000000067F000080000006000C00000E1C86-000000067F000080000006000C00000EB3EC__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000E4000-000000067F000080000006000C00000E8000__0000007739203FF0", +"000000067F000080000006000C00000E8000-000000067F000080000006000C00000EC000__0000007739203FF0", +"000000067F000080000006000C00000EB3EC-000000067F000080000006000C00000F4B0C__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000EC000-000000067F000080000006000C00000F0000__0000007739203FF0", +"000000067F000080000006000C00000F0000-000000067F000080000006000C00000F4000__0000007739203FF0", +"000000067F000080000006000C00000F4000-000000067F000080000006000C00000F8000__0000007739203FF0", +"000000067F000080000006000C00000F4B0C-000000067F000080000006000C00000FE272__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C00000F7C96-000000067F0000800000060014000000F3A9__00000077B2AD0F91-0000007805801C41", +"000000067F000080000006000C00000F8000-000000067F000080000006000C00000FC000__0000007739203FF0", +"000000067F000080000006000C00000FC000-000000067F000080000006000C0000100000__0000007739203FF0", +"000000067F000080000006000C00000FE272-000000067F000080000006000C000010798F__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C0000100000-000000067F000080000006000C0000104000__0000007739203FF0", +"000000067F000080000006000C0000104000-000000067F000080000006000C0000108000__0000007739203FF0", +"000000067F000080000006000C000010798F-000000067F000080000006000C00001110F5__00000076A8CDE8F9-00000077391A8001", +"000000067F000080000006000C0000108000-000000067F000080000006000C000010C000__0000007739203FF0", +"000000067F000080000006000C000010C000-000000067F000080000006000C0000110000__0000007739203FF0", +"000000067F000080000006000C0000110000-030000000000000000000000000000000002__0000007739203FF0", +"000000067F000080000006000C00001110F5-010000000000000001000000030000000006__00000076A8CDE8F9-00000077391A8001", +"000000067F00008000000600140000001B38-000000067F00008000000600140000008758__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000600140000008758-000000067F0000800000060014000000F32F__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000000F32F-000000067F00008000000600140000015EDC__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000000F3A9-000000067F00008000000600140000028656__00000077B2AD0F91-0000007805801C41", +"000000067F00008000000600140000015EDC-000000067F0000800000060014000001CB12__00000077391A8001-00000077B2AD0F91", +"000000067F0000800000060014000001CB12-000000067F000080000006001400000236BC__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006001400000236BC-000000067F0000800000060014000002A294__00000077391A8001-00000077B2AD0F91", +"000000067F00008000000600140000028657-030000000000000000000000000000000002__00000077B2AD0F91-0000007805801C41", +"000000067F0000800000060014000002A294-030000000000000000000000000000000002__00000077391A8001-00000077B2AD0F91", +"000000067F000080000006200C0000000000-000000067F000080000006200C0000004000__00000078B2CB1C68", 
+"000000067F000080000006200C0000004000-000000067F000080000006200C0000008000__00000078B2CB1C68", +"000000067F000080000006200C0000008000-000000067F000080000006200C000000C000__00000078B2CB1C68", +"000000067F000080000006200C0000009441-000000067F000080000006200C0000012B8D__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000000C000-000000067F000080000006200C0000010000__00000078B2CB1C68", +"000000067F000080000006200C0000010000-000000067F000080000006200C0000014000__00000078B2CB1C68", +"000000067F000080000006200C0000012B8D-000000067F000080000006200C000001C2F3__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000014000-000000067F000080000006200C0000018000__00000078B2CB1C68", +"000000067F000080000006200C0000018000-000000067F000080000006200C000001C000__00000078B2CB1C68", +"000000067F000080000006200C000001C000-000000067F000080000006200C0000020000__00000078B2CB1C68", +"000000067F000080000006200C000001C2F3-000000067F000080000006200C0000025A0C__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000020000-000000067F000080000006200C0000024000__00000078B2CB1C68", +"000000067F000080000006200C0000024000-000000067F000080000006200C0000028000__00000078B2CB1C68", +"000000067F000080000006200C0000025A0C-000000067F000080000006200C000002F172__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000028000-000000067F000080000006200C000002C000__00000078B2CB1C68", +"000000067F000080000006200C000002C000-000000067F000080000006200C0000030000__00000078B2CB1C68", +"000000067F000080000006200C000002F172-000000067F000080000006200C00000388D8__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000030000-000000067F000080000006200C0000034000__00000078B2CB1C68", +"000000067F000080000006200C0000034000-000000067F000080000006200C0000038000__00000078B2CB1C68", +"000000067F000080000006200C0000038000-000000067F000080000006200C000003C000__00000078B2CB1C68", +"000000067F000080000006200C00000388D8-000000067F000080000006200C0000042009__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000003C000-000000067F000080000006200C0000040000__00000078B2CB1C68", +"000000067F000080000006200C0000040000-000000067F000080000006200C0000044000__00000078B2CB1C68", +"000000067F000080000006200C0000042009-000000067F000080000006200C000004B76F__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C0000044000-000000067F000080000006200C0000048000__00000078B2CB1C68", +"000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__00000078B2CB1C68", +"000000067F000080000006200C0000048000-000000067F000080000006200C000004C000__0000007AA0A6FB48", +"000000067F000080000006200C0000048121-000000067F000080000006200C0000090C08__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C000004B76F-030000000000000000000000000000000002__0000007805801C41-00000078859FEA11", +"000000067F000080000006200C000004BAC9-000000067F000080000006200C00000551FE__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__00000078B2CB1C68", +"000000067F000080000006200C000004C000-000000067F000080000006200C0000050000__0000007AA0A6FB48", +"000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__00000078B2CB1C68", +"000000067F000080000006200C0000050000-000000067F000080000006200C0000054000__0000007AA0A6FB48", +"000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__00000078B2CB1C68", +"000000067F000080000006200C0000054000-000000067F000080000006200C0000058000__0000007AA0A6FB48", 
+"000000067F000080000006200C00000551FE-000000067F000080000006200C000005E90C__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__00000078B2CB1C68", +"000000067F000080000006200C0000058000-000000067F000080000006200C000005C000__0000007AA0A6FB48", +"000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__00000078B2CB1C68", +"000000067F000080000006200C000005C000-000000067F000080000006200C0000060000__0000007AA0A6FB48", +"000000067F000080000006200C000005E90C-000000067F000080000006200C000006802C__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__00000078B2CB1C68", +"000000067F000080000006200C0000060000-000000067F000080000006200C0000064000__0000007AA0A6FB48", +"000000067F000080000006200C0000064000-000000067F000080000006200C0000068000__0000007AA0A6FB48", +"000000067F000080000006200C0000064000-030000000000000000000000000000000002__00000078B2CB1C68", +"000000067F000080000006200C0000068000-000000067F000080000006200C000006C000__0000007AA0A6FB48", +"000000067F000080000006200C000006802C-000000067F000080000006200C0000071783__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000006C000-000000067F000080000006200C0000070000__0000007AA0A6FB48", +"000000067F000080000006200C0000070000-000000067F000080000006200C0000074000__0000007AA0A6FB48", +"000000067F000080000006200C0000071783-000000067F000080000006200C000007AEE9__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000074000-000000067F000080000006200C0000078000__0000007AA0A6FB48", +"000000067F000080000006200C0000078000-000000067F000080000006200C000007C000__0000007AA0A6FB48", +"000000067F000080000006200C000007AEE9-000000067F000080000006200C000008460B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000007C000-000000067F000080000006200C0000080000__0000007AA0A6FB48", +"000000067F000080000006200C0000080000-000000067F000080000006200C0000084000__0000007AA0A6FB48", +"000000067F000080000006200C0000084000-000000067F000080000006200C0000088000__0000007AA0A6FB48", +"000000067F000080000006200C000008460B-000000067F000080000006200C000008DD71__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000088000-000000067F000080000006200C000008C000__0000007AA0A6FB48", +"000000067F000080000006200C000008C000-000000067F000080000006200C0000090000__0000007AA0A6FB48", +"000000067F000080000006200C000008DD71-000000067F000080000006200C00000974D7__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000090000-000000067F000080000006200C0000094000__0000007AA0A6FB48", +"000000067F000080000006200C0000090C11-000000067F000080000006200C00000DA35B__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C0000094000-000000067F000080000006200C0000098000__0000007AA0A6FB48", +"000000067F000080000006200C00000974D7-000000067F000080000006200C00000A0C0B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000098000-000000067F000080000006200C000009C000__0000007AA0A6FB48", +"000000067F000080000006200C000009C000-000000067F000080000006200C00000A0000__0000007AA0A6FB48", +"000000067F000080000006200C00000A0000-000000067F000080000006200C00000A4000__0000007AA0A6FB48", +"000000067F000080000006200C00000A0C0B-000000067F000080000006200C00000AA371__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000A4000-000000067F000080000006200C00000A8000__0000007AA0A6FB48", +"000000067F000080000006200C00000A8000-000000067F000080000006200C00000AC000__0000007AA0A6FB48", 
+"000000067F000080000006200C00000AA371-000000067F000080000006200C00000B3AD7__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000AC000-000000067F000080000006200C00000B0000__0000007AA0A6FB48", +"000000067F000080000006200C00000B0000-000000067F000080000006200C00000B4000__0000007AA0A6FB48", +"000000067F000080000006200C00000B3AD7-000000067F000080000006200C00000BD20B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000B4000-000000067F000080000006200C00000B8000__0000007AA0A6FB48", +"000000067F000080000006200C00000B8000-000000067F000080000006200C00000BC000__0000007AA0A6FB48", +"000000067F000080000006200C00000BC000-000000067F000080000006200C00000C0000__0000007AA0A6FB48", +"000000067F000080000006200C00000BD20B-000000067F000080000006200C00000C6932__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000C0000-000000067F000080000006200C00000C4000__0000007AA0A6FB48", +"000000067F000080000006200C00000C4000-000000067F000080000006200C00000C8000__0000007AA0A6FB48", +"000000067F000080000006200C00000C6932-000000067F000080000006200C00000D0098__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000C8000-000000067F000080000006200C00000CC000__0000007AA0A6FB48", +"000000067F000080000006200C00000CC000-000000067F000080000006200C00000D0000__0000007AA0A6FB48", +"000000067F000080000006200C00000D0000-000000067F000080000006200C00000D4000__0000007AA0A6FB48", +"000000067F000080000006200C00000D0098-000000067F000080000006200C00000D97FE__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000D4000-000000067F000080000006200C00000D8000__0000007AA0A6FB48", +"000000067F000080000006200C00000D8000-000000067F000080000006200C00000DC000__0000007AA0A6FB48", +"000000067F000080000006200C00000D97FE-000000067F000080000006200C00000E2F0B__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000DA36C-000000067F00008000000620140000002D07__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006200C00000DC000-000000067F000080000006200C00000E0000__0000007AA0A6FB48", +"000000067F000080000006200C00000E0000-000000067F000080000006200C00000E4000__0000007AA0A6FB48", +"000000067F000080000006200C00000E2F0B-000000067F000080000006200C00000EC671__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000E4000-000000067F000080000006200C00000E8000__0000007AA0A6FB48", +"000000067F000080000006200C00000E8000-000000067F000080000006200C00000EC000__0000007AA0A6FB48", +"000000067F000080000006200C00000EC000-000000067F000080000006200C00000F0000__0000007AA0A6FB48", +"000000067F000080000006200C00000EC671-000000067F000080000006200C00000F5D9F__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000F0000-000000067F000080000006200C00000F4000__0000007AA0A6FB48", +"000000067F000080000006200C00000F4000-000000067F000080000006200C00000F8000__0000007AA0A6FB48", +"000000067F000080000006200C00000F5D9F-000000067F000080000006200C00000FF505__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C00000F8000-000000067F000080000006200C00000FC000__0000007AA0A6FB48", +"000000067F000080000006200C00000FC000-000000067F000080000006200C0000100000__0000007AA0A6FB48", +"000000067F000080000006200C00000FF505-000000067F000080000006200C0000108C10__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C0000100000-000000067F000080000006200C0000104000__0000007AA0A6FB48", +"000000067F000080000006200C0000104000-000000067F000080000006200C0000108000__0000007AA0A6FB48", 
+"000000067F000080000006200C0000107883-000000067F000080000006200C01000000AF__00000079C527F0D9-0000007A3F679FA1", +"000000067F000080000006200C0000108000-000000067F000080000006200C000010C000__0000007AA0A6FB48", +"000000067F000080000006200C0000108C10-000000067F000080000006200C0100000000__00000078859FEA11-00000079C527F0D9", +"000000067F000080000006200C000010C000-000000067F000080000006200C0000110000__0000007AA0A6FB48", +"000000067F000080000006200C0000110000-000000067F00008000000620120100000000__0000007AA0A6FB48", +"000000067F000080000006200C01000000AF-000000067F00008000000620140000004888__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000002D0A-000000067F00008000000620140000016355__0000007A3F679FA1-0000007AA1DF6639", +"000000067F00008000000620140000004888-000000067F0000800000062014000000BC11__00000079C527F0D9-0000007A3F679FA1", +"000000067F0000800000062014000000BC11-000000067F00008000000620140000012FA7__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000012FA7-000000067F0000800000062014000001A33D__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000016357-000000067F00008000000620140000029C35__0000007A3F679FA1-0000007AA1DF6639", +"000000067F0000800000062014000001A33D-000000067F000080000006201400000216B4__00000079C527F0D9-0000007A3F679FA1", +"000000067F000080000006201400000216B4-000000067F00008000000620140000028A65__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000028A65-030000000000000000000000000000000002__00000079C527F0D9-0000007A3F679FA1", +"000000067F00008000000620140000029C38-030000000000000000000000000000000002__0000007A3F679FA1-0000007AA1DF6639", +"000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007B9877EF40", +"000000067F000080000006400C0000000000-000000067F000080000006400C0000004000__0000007D41715570", +"000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007B9877EF40", +"000000067F000080000006400C0000004000-000000067F000080000006400C0000008000__0000007D41715570", +"000000067F000080000006400C0000007987-000000067F000080000006400C00000110ED__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007B9877EF40", +"000000067F000080000006400C0000008000-000000067F000080000006400C000000C000__0000007D41715570", +"000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007B9877EF40", +"000000067F000080000006400C000000C000-000000067F000080000006400C0000010000__0000007D41715570", +"000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007B9877EF40", +"000000067F000080000006400C0000010000-000000067F000080000006400C0000014000__0000007D41715570", +"000000067F000080000006400C00000110ED-000000067F000080000006400C000001A80A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007B9877EF40", +"000000067F000080000006400C0000014000-000000067F000080000006400C0000018000__0000007D41715570", +"000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007B9877EF40", +"000000067F000080000006400C0000018000-000000067F000080000006400C000001C000__0000007D41715570", +"000000067F000080000006400C000001A80A-000000067F000080000006400C0000023F4A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007B9877EF40", +"000000067F000080000006400C000001C000-000000067F000080000006400C0000020000__0000007D41715570", 
+"000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007B9877EF40", +"000000067F000080000006400C0000020000-000000067F000080000006400C0000024000__0000007D41715570", +"000000067F000080000006400C0000023F4A-000000067F000080000006400C000002D6B0__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007B9877EF40", +"000000067F000080000006400C0000024000-000000067F000080000006400C0000028000__0000007D41715570", +"000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007B9877EF40", +"000000067F000080000006400C0000028000-000000067F000080000006400C000002C000__0000007D41715570", +"000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007B9877EF40", +"000000067F000080000006400C000002C000-000000067F000080000006400C0000030000__0000007D41715570", +"000000067F000080000006400C000002D6B0-000000067F000080000006400C0000036DD4__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007B9877EF40", +"000000067F000080000006400C0000030000-000000067F000080000006400C0000034000__0000007D41715570", +"000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007B9877EF40", +"000000067F000080000006400C0000034000-000000067F000080000006400C0000038000__0000007D41715570", +"000000067F000080000006400C0000036DD4-000000067F000080000006400C000004050A__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007B9877EF40", +"000000067F000080000006400C0000038000-000000067F000080000006400C000003C000__0000007D41715570", +"000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007B9877EF40", +"000000067F000080000006400C000003C000-000000067F000080000006400C0000040000__0000007D41715570", +"000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007B9877EF40", +"000000067F000080000006400C0000040000-000000067F000080000006400C0000044000__0000007D41715570", +"000000067F000080000006400C000004050A-030000000000000000000000000000000002__0000007AA1DF6639-0000007B14D5C521", +"000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007B9877EF40", +"000000067F000080000006400C0000044000-000000067F000080000006400C0000048000__0000007D41715570", +"000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007B9877EF40", +"000000067F000080000006400C0000048000-000000067F000080000006400C000004C000__0000007D41715570", +"000000067F000080000006400C000004B4C9-000000067F000080000006400C0000054C01__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007B9877EF40", +"000000067F000080000006400C000004C000-000000067F000080000006400C0000050000__0000007D41715570", +"000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007B9877EF40", +"000000067F000080000006400C0000050000-000000067F000080000006400C0000054000__0000007D41715570", +"000000067F000080000006400C00000525C4-000000067F000080000006400C00000A47A7__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007B9877EF40", +"000000067F000080000006400C0000054000-000000067F000080000006400C0000058000__0000007D41715570", +"000000067F000080000006400C0000054C01-000000067F000080000006400C000005E30C__0000007B14D5C521-0000007C73B53FC9", 
+"000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007B9877EF40", +"000000067F000080000006400C0000058000-000000067F000080000006400C000005C000__0000007D41715570", +"000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007B9877EF40", +"000000067F000080000006400C000005C000-000000067F000080000006400C0000060000__0000007D41715570", +"000000067F000080000006400C000005E30C-000000067F000080000006400C0000067A2C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007B9877EF40", +"000000067F000080000006400C0000060000-000000067F000080000006400C0000064000__0000007D41715570", +"000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007B9877EF40", +"000000067F000080000006400C0000064000-000000067F000080000006400C0000068000__0000007D41715570", +"000000067F000080000006400C0000067A2C-000000067F000080000006400C0000071187__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007B9877EF40", +"000000067F000080000006400C0000068000-000000067F000080000006400C000006C000__0000007D41715570", +"000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007B9877EF40", +"000000067F000080000006400C000006C000-000000067F000080000006400C0000070000__0000007D41715570", +"000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007B9877EF40", +"000000067F000080000006400C0000070000-000000067F000080000006400C0000074000__0000007D41715570", +"000000067F000080000006400C0000071187-000000067F000080000006400C000007A8ED__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007B9877EF40", +"000000067F000080000006400C0000074000-000000067F000080000006400C0000078000__0000007D41715570", +"000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007B9877EF40", +"000000067F000080000006400C0000078000-000000067F000080000006400C000007C000__0000007D41715570", +"000000067F000080000006400C000007A8ED-000000067F000080000006400C000008400B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007B9877EF40", +"000000067F000080000006400C000007C000-000000067F000080000006400C0000080000__0000007D41715570", +"000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007B9877EF40", +"000000067F000080000006400C0000080000-000000067F000080000006400C0000084000__0000007D41715570", +"000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007B9877EF40", +"000000067F000080000006400C0000084000-000000067F000080000006400C0000088000__0000007D41715570", +"000000067F000080000006400C000008400B-000000067F000080000006400C000008D771__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007B9877EF40", +"000000067F000080000006400C0000088000-000000067F000080000006400C000008C000__0000007D41715570", +"000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007B9877EF40", +"000000067F000080000006400C000008C000-000000067F000080000006400C0000090000__0000007D41715570", +"000000067F000080000006400C000008D771-000000067F000080000006400C0000096ED7__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000090000-000000067F000080000006400C0000094000__0000007D41715570", 
+"000000067F000080000006400C0000090000-030000000000000000000000000000000002__0000007B9877EF40", +"000000067F000080000006400C0000094000-000000067F000080000006400C0000098000__0000007D41715570", +"000000067F000080000006400C0000096ED7-000000067F000080000006400C00000A060B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000098000-000000067F000080000006400C000009C000__0000007D41715570", +"000000067F000080000006400C000009C000-000000067F000080000006400C00000A0000__0000007D41715570", +"000000067F000080000006400C00000A0000-000000067F000080000006400C00000A4000__0000007D41715570", +"000000067F000080000006400C00000A060B-000000067F000080000006400C00000A9D71__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000A4000-000000067F000080000006400C00000A8000__0000007D41715570", +"000000067F000080000006400C00000A47B1-000000067F000080000006400C00000F593E__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C00000A8000-000000067F000080000006400C00000AC000__0000007D41715570", +"000000067F000080000006400C00000A887C-000000067F000080000006400C020000001F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F000080000006400C00000A9D71-000000067F000080000006400C00000B34D7__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000AC000-000000067F000080000006400C00000B0000__0000007D41715570", +"000000067F000080000006400C00000B0000-000000067F000080000006400C00000B4000__0000007D41715570", +"000000067F000080000006400C00000B34D7-000000067F000080000006400C00000BCC0C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000B4000-000000067F000080000006400C00000B8000__0000007D41715570", +"000000067F000080000006400C00000B8000-000000067F000080000006400C00000BC000__0000007D41715570", +"000000067F000080000006400C00000BC000-000000067F000080000006400C00000C0000__0000007D41715570", +"000000067F000080000006400C00000BCC0C-000000067F000080000006400C00000C6336__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000C0000-000000067F000080000006400C00000C4000__0000007D41715570", +"000000067F000080000006400C00000C4000-000000067F000080000006400C00000C8000__0000007D41715570", +"000000067F000080000006400C00000C6336-000000067F000080000006400C00000CFA9C__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000C8000-000000067F000080000006400C00000CC000__0000007D41715570", +"000000067F000080000006400C00000CC000-000000067F000080000006400C00000D0000__0000007D41715570", +"000000067F000080000006400C00000CFA9C-000000067F000080000006400C00000D91AB__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000D0000-000000067F000080000006400C00000D4000__0000007D41715570", +"000000067F000080000006400C00000D4000-000000067F000080000006400C00000D8000__0000007D41715570", +"000000067F000080000006400C00000D8000-000000067F000080000006400C00000DC000__0000007D41715570", +"000000067F000080000006400C00000D91AB-000000067F000080000006400C00000E2911__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000DC000-000000067F000080000006400C00000E0000__0000007D41715570", +"000000067F000080000006400C00000E0000-000000067F000080000006400C00000E4000__0000007D41715570", +"000000067F000080000006400C00000E2911-000000067F000080000006400C00000EC077__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000E4000-000000067F000080000006400C00000E8000__0000007D41715570", +"000000067F000080000006400C00000E8000-000000067F000080000006400C00000EC000__0000007D41715570", +"000000067F000080000006400C00000EC000-000000067F000080000006400C00000F0000__0000007D41715570", 
+"000000067F000080000006400C00000EC077-000000067F000080000006400C00000F57A8__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000F0000-000000067F000080000006400C00000F4000__0000007D41715570", +"000000067F000080000006400C00000F4000-000000067F000080000006400C00000F8000__0000007D41715570", +"000000067F000080000006400C00000F57A8-000000067F000080000006400C00000FEF0A__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C00000F5940-000000067F0000800000064014000000E7FF__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F000080000006400C00000F8000-000000067F000080000006400C00000FC000__0000007D41715570", +"000000067F000080000006400C00000FC000-000000067F000080000006400C0000100000__0000007D41715570", +"000000067F000080000006400C00000FEF0A-000000067F000080000006400C000010862B__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C0000100000-000000067F000080000006400C0000104000__0000007D41715570", +"000000067F000080000006400C0000104000-000000067F000080000006400C0000108000__0000007D41715570", +"000000067F000080000006400C0000108000-000000067F000080000006400C000010C000__0000007D41715570", +"000000067F000080000006400C000010862B-000000067F000080000006400C0000111C20__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C000010C000-000000067F000080000006400C0000110000__0000007D41715570", +"000000067F000080000006400C0000110000-000000067F00008000000640120100000000__0000007D41715570", +"000000067F000080000006400C00FFFFFFFF-01000000000000000100000003000000000D__0000007B14D5C521-0000007C73B53FC9", +"000000067F000080000006400C020000001F-000000067F0000800000064014000000691F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000000000-000000067F00008000000640140000004000__0000007D41715570", +"000000067F00008000000640140000004000-000000067F00008000000640140000008000__0000007D41715570", +"000000067F0000800000064014000000691F-000000067F0000800000064014000000D68F__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000008000-000000067F0000800000064014000000C000__0000007D41715570", +"000000067F0000800000064014000000C000-000000067F00008000000640140000010000__0000007D41715570", +"000000067F0000800000064014000000D68F-000000067F00008000000640140000014406__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000000E803-000000067F000080000006401400000274BB__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000640140000010000-000000067F00008000000640140000014000__0000007D41715570", +"000000067F00008000000640140000014000-000000067F00008000000640140000018000__0000007D41715570", +"000000067F00008000000640140000014406-000000067F0000800000064014000001B192__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000018000-000000067F0000800000064014000001C000__0000007D41715570", +"000000067F0000800000064014000001B192-000000067F00008000000640140000021F03__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000001C000-000000067F00008000000640140000020000__0000007D41715570", +"000000067F00008000000640140000020000-000000067F00008000000640140000024000__0000007D41715570", +"000000067F00008000000640140000021F03-000000067F00008000000640140000028C6A__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F00008000000640140000024000-000000067F00008000000640140000028000__0000007D41715570", +"000000067F000080000006401400000274BF-030000000000000000000000000000000002__0000007CEE5A0B91-0000007D41EA8D51", +"000000067F00008000000640140000028000-000000067F0000800000064014000002C000__0000007D41715570", 
+"000000067F00008000000640140000028C6A-030000000000000000000000000000000002__0000007C73B53FC9-0000007CEE5A0B91", +"000000067F0000800000064014000002C000-030000000000000000000000000000000002__0000007D41715570", +"000000067F000080000006600C0000000000-000000067F000080000006600C0000004000__0000007F12B83FE8", +"000000067F000080000006600C0000004000-000000067F000080000006600C0000008000__0000007F12B83FE8", +"000000067F000080000006600C0000008000-000000067F000080000006600C000000C000__0000007F12B83FE8", +"000000067F000080000006600C0000009381-000000067F000080000006600C0000012AE7__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000000C000-000000067F000080000006600C0000010000__0000007F12B83FE8", +"000000067F000080000006600C0000010000-000000067F000080000006600C0000014000__0000007F12B83FE8", +"000000067F000080000006600C0000012AE7-000000067F000080000006600C000001C20B__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000014000-000000067F000080000006600C0000018000__0000007F12B83FE8", +"000000067F000080000006600C0000018000-000000067F000080000006600C000001C000__0000007F12B83FE8", +"000000067F000080000006600C000001C000-000000067F000080000006600C0000020000__0000007F12B83FE8", +"000000067F000080000006600C000001C20B-000000067F000080000006600C000002593B__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000020000-000000067F000080000006600C0000024000__0000007F12B83FE8", +"000000067F000080000006600C0000024000-000000067F000080000006600C0000028000__0000007F12B83FE8", +"000000067F000080000006600C000002593B-000000067F000080000006600C000002F0A1__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000028000-000000067F000080000006600C000002C000__0000007F12B83FE8", +"000000067F000080000006600C000002C000-000000067F000080000006600C0000030000__0000007F12B83FE8", +"000000067F000080000006600C000002F0A1-000000067F000080000006600C00000387B6__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000030000-000000067F000080000006600C0000034000__0000007F12B83FE8", +"000000067F000080000006600C0000034000-000000067F000080000006600C0000038000__0000007F12B83FE8", +"000000067F000080000006600C0000038000-000000067F000080000006600C000003C000__0000007F12B83FE8", +"000000067F000080000006600C00000387B6-000000067F000080000006600C0000041F1C__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000003C000-000000067F000080000006600C0000040000__0000007F12B83FE8", +"000000067F000080000006600C0000040000-000000067F000080000006600C0000044000__0000007F12B83FE8", +"000000067F000080000006600C0000041F1C-000000067F000080000006600C000004B682__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C0000044000-000000067F000080000006600C0000048000__0000007F12B83FE8", +"000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007F108C1FD8", +"000000067F000080000006600C0000048000-000000067F000080000006600C000004C000__0000007FDCA75700", +"000000067F000080000006600C0000049743-000000067F000080000006600C0000093532__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C000004B682-030000000000000000000000000000000002__0000007D41EA8D51-0000007DC21DE569", +"000000067F000080000006600C000004BAC3-000000067F000080000006600C00000551F8__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007F108C1FD8", +"000000067F000080000006600C000004C000-000000067F000080000006600C0000050000__0000007FDCA75700", +"000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007F108C1FD8", 
+"000000067F000080000006600C0000050000-000000067F000080000006600C0000054000__0000007FDCA75700", +"000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007F108C1FD8", +"000000067F000080000006600C0000054000-000000067F000080000006600C0000058000__0000007FDCA75700", +"000000067F000080000006600C00000551F8-000000067F000080000006600C000005E90C__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007F108C1FD8", +"000000067F000080000006600C0000058000-000000067F000080000006600C000005C000__0000007FDCA75700", +"000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007F108C1FD8", +"000000067F000080000006600C000005C000-000000067F000080000006600C0000060000__0000007FDCA75700", +"000000067F000080000006600C000005E90C-000000067F000080000006600C000006802C__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007F108C1FD8", +"000000067F000080000006600C0000060000-000000067F000080000006600C0000064000__0000007FDCA75700", +"000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007F108C1FD8", +"000000067F000080000006600C0000064000-000000067F000080000006600C0000068000__0000007FDCA75700", +"000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007F108C1FD8", +"000000067F000080000006600C0000068000-000000067F000080000006600C000006C000__0000007FDCA75700", +"000000067F000080000006600C000006802C-000000067F000080000006600C0000071783__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007F108C1FD8", +"000000067F000080000006600C000006C000-000000067F000080000006600C0000070000__0000007FDCA75700", +"000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007F108C1FD8", +"000000067F000080000006600C0000070000-000000067F000080000006600C0000074000__0000007FDCA75700", +"000000067F000080000006600C0000071783-000000067F000080000006600C000007AEE9__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007F108C1FD8", +"000000067F000080000006600C0000074000-000000067F000080000006600C0000078000__0000007FDCA75700", +"000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007F108C1FD8", +"000000067F000080000006600C0000078000-000000067F000080000006600C000007C000__0000007FDCA75700", +"000000067F000080000006600C000007AEE9-000000067F000080000006600C000008460B__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007F108C1FD8", +"000000067F000080000006600C000007C000-000000067F000080000006600C0000080000__0000007FDCA75700", +"000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007F108C1FD8", +"000000067F000080000006600C0000080000-000000067F000080000006600C0000084000__0000007FDCA75700", +"000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007F108C1FD8", +"000000067F000080000006600C0000084000-000000067F000080000006600C0000088000__0000007FDCA75700", +"000000067F000080000006600C000008460B-000000067F000080000006600C000008DD71__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007F108C1FD8", +"000000067F000080000006600C0000088000-000000067F000080000006600C000008C000__0000007FDCA75700", 
+"000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007F108C1FD8", +"000000067F000080000006600C000008C000-000000067F000080000006600C0000090000__0000007FDCA75700", +"000000067F000080000006600C000008DD71-000000067F000080000006600C00000974D7__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007F108C1FD8", +"000000067F000080000006600C0000090000-000000067F000080000006600C0000094000__0000007FDCA75700", +"000000067F000080000006600C0000093532-000000067F000080000006600C00000DD150__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007F108C1FD8", +"000000067F000080000006600C0000094000-000000067F000080000006600C0000098000__0000007FDCA75700", +"000000067F000080000006600C00000974D7-000000067F000080000006600C00000A0C0B__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007F108C1FD8", +"000000067F000080000006600C0000098000-000000067F000080000006600C000009C000__0000007FDCA75700", +"000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007F108C1FD8", +"000000067F000080000006600C000009C000-000000067F000080000006600C00000A0000__0000007FDCA75700", +"000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007F108C1FD8", +"000000067F000080000006600C00000A0000-000000067F000080000006600C00000A4000__0000007FDCA75700", +"000000067F000080000006600C00000A0C0B-000000067F000080000006600C00000AA371__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007F108C1FD8", +"000000067F000080000006600C00000A4000-000000067F000080000006600C00000A8000__0000007FDCA75700", +"000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007F108C1FD8", +"000000067F000080000006600C00000A8000-000000067F000080000006600C00000AC000__0000007FDCA75700", +"000000067F000080000006600C00000AA371-000000067F000080000006600C00000B3AD7__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007F108C1FD8", +"000000067F000080000006600C00000AC000-000000067F000080000006600C00000B0000__0000007FDCA75700", +"000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007F108C1FD8", +"000000067F000080000006600C00000B0000-000000067F000080000006600C00000B4000__0000007FDCA75700", +"000000067F000080000006600C00000B3AD7-000000067F000080000006600C0100000000__0000007DC21DE569-0000007E71DBF8F9", +"000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007F108C1FD8", +"000000067F000080000006600C00000B4000-000000067F000080000006600C00000B8000__0000007FDCA75700", +"000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007F108C1FD8", +"000000067F000080000006600C00000B8000-000000067F000080000006600C00000BC000__0000007FDCA75700", +"000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007F108C1FD8", +"000000067F000080000006600C00000BC000-000000067F000080000006600C00000C0000__0000007FDCA75700", +"000000067F000080000006600C00000BC29F-000000067F000080000006600C00000C59CF__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007F108C1FD8", +"000000067F000080000006600C00000C0000-000000067F000080000006600C00000C4000__0000007FDCA75700", 
+"000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007F108C1FD8", +"000000067F000080000006600C00000C4000-000000067F000080000006600C00000C8000__0000007FDCA75700", +"000000067F000080000006600C00000C59CF-000000067F000080000006600C00000CF10B__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007F108C1FD8", +"000000067F000080000006600C00000C8000-000000067F000080000006600C00000CC000__0000007FDCA75700", +"000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007F108C1FD8", +"000000067F000080000006600C00000CC000-000000067F000080000006600C00000D0000__0000007FDCA75700", +"000000067F000080000006600C00000CF10B-000000067F000080000006600C00000D882C__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007F108C1FD8", +"000000067F000080000006600C00000D0000-000000067F000080000006600C00000D4000__0000007FDCA75700", +"000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007F108C1FD8", +"000000067F000080000006600C00000D4000-000000067F000080000006600C00000D8000__0000007FDCA75700", +"000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007F108C1FD8", +"000000067F000080000006600C00000D8000-000000067F000080000006600C00000DC000__0000007FDCA75700", +"000000067F000080000006600C00000D882C-000000067F000080000006600C00000E1F7F__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007F108C1FD8", +"000000067F000080000006600C00000DC000-000000067F000080000006600C00000E0000__0000007FDCA75700", +"000000067F000080000006600C00000DD152-000000067F00008000000660140000003DA8__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007F108C1FD8", +"000000067F000080000006600C00000E0000-000000067F000080000006600C00000E4000__0000007FDCA75700", +"000000067F000080000006600C00000E1F7F-000000067F000080000006600C00000EB6E5__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007F108C1FD8", +"000000067F000080000006600C00000E4000-000000067F000080000006600C00000E8000__0000007FDCA75700", +"000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007F108C1FD8", +"000000067F000080000006600C00000E8000-000000067F000080000006600C00000EC000__0000007FDCA75700", +"000000067F000080000006600C00000EB6E5-000000067F000080000006600C00000F4E0C__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007F108C1FD8", +"000000067F000080000006600C00000EC000-000000067F000080000006600C00000F0000__0000007FDCA75700", +"000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007F108C1FD8", +"000000067F000080000006600C00000F0000-000000067F000080000006600C00000F4000__0000007FDCA75700", +"000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007F108C1FD8", +"000000067F000080000006600C00000F4000-000000067F000080000006600C00000F8000__0000007FDCA75700", +"000000067F000080000006600C00000F4E0C-000000067F000080000006600C00000FE572__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007F108C1FD8", +"000000067F000080000006600C00000F8000-000000067F000080000006600C00000FC000__0000007FDCA75700", 
+"000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007F108C1FD8", +"000000067F000080000006600C00000FC000-000000067F000080000006600C0000100000__0000007FDCA75700", +"000000067F000080000006600C00000FE572-000000067F000080000006600C0000107CD8__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007F108C1FD8", +"000000067F000080000006600C0000100000-000000067F000080000006600C0000104000__0000007FDCA75700", +"000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007F108C1FD8", +"000000067F000080000006600C0000104000-000000067F000080000006600C0000108000__0000007FDCA75700", +"000000067F000080000006600C0000107CD8-000000067F000080000006600C000011140B__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007F108C1FD8", +"000000067F000080000006600C0000108000-000000067F000080000006600C000010C000__0000007FDCA75700", +"000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007F108C1FD8", +"000000067F000080000006600C000010C000-000000067F000080000006600C0000110000__0000007FDCA75700", +"000000067F000080000006600C0000110000-000000067F00008000000660120100000000__0000007FDCA75700", +"000000067F000080000006600C0000110000-030000000000000000000000000000000002__0000007F108C1FD8", +"000000067F000080000006600C000011140B-010000000000000001000000030000000010__0000007E71DBF8F9-0000007F11E4BFE9", +"000000067F000080000006600C0000111C82-000000067F0000800000066014000000535B__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000000000-000000067F00008000000660140000004000__0000007FDCA75700", +"000000067F00008000000660140000003DAA-000000067F00008000000660140000017C4D__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000660140000004000-000000067F00008000000660140000008000__0000007FDCA75700", +"000000067F0000800000066014000000535B-000000067F0000800000066014000000C839__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000008000-000000067F0000800000066014000000C000__0000007FDCA75700", +"000000067F0000800000066014000000C000-000000067F00008000000660140000010000__0000007FDCA75700", +"000000067F0000800000066014000000C839-000000067F00008000000660140000013D42__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000010000-000000067F00008000000660140000014000__0000007FDCA75700", +"000000067F00008000000660140000013D42-000000067F0000800000066014000001B222__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000014000-000000067F00008000000660140000018000__0000007FDCA75700", +"000000067F00008000000660140000017C51-000000067F0000800000066014000002B9D0__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F00008000000660140000018000-000000067F0000800000066014000001C000__0000007FDCA75700", +"000000067F0000800000066014000001B222-000000067F00008000000660140000022704__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F0000800000066014000001C000-000000067F00008000000660140000020000__0000007FDCA75700", +"000000067F00008000000660140000020000-000000067F00008000000660140000024000__0000007FDCA75700", +"000000067F00008000000660140000022704-000000067F00008000000660140000029C2D__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F00008000000660140000024000-000000067F00008000000660140000028000__0000007FDCA75700", +"000000067F00008000000660140000028000-000000067F0000800000066014000002C000__0000007FDCA75700", 
+"000000067F00008000000660140000029C2D-030000000000000000000000000000000002__0000007F11E4BFE9-0000007F7BE4E6F1", +"000000067F0000800000066014000002B9D1-030000000000000000000000000000000002__0000007F7BE4E6F1-0000007FDCDCE659", +"000000067F0000800000066014000002C000-030000000000000000000000000000000002__0000007FDCA75700", +"000000067F000080000006800C0000000000-000000067F000080000006800C0000004000__00000081AFEDBFE0", +"000000067F000080000006800C0000004000-000000067F000080000006800C0000008000__00000081AFEDBFE0", +"000000067F000080000006800C0000007D6A-000000067F000080000006800C00000114D0__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000008000-000000067F000080000006800C000000C000__00000081AFEDBFE0", +"000000067F000080000006800C000000C000-000000067F000080000006800C0000010000__00000081AFEDBFE0", +"000000067F000080000006800C0000010000-000000067F000080000006800C0000014000__00000081AFEDBFE0", +"000000067F000080000006800C00000114D0-000000067F000080000006800C000001AC0B__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000014000-000000067F000080000006800C0000018000__00000081AFEDBFE0", +"000000067F000080000006800C0000018000-000000067F000080000006800C000001C000__00000081AFEDBFE0", +"000000067F000080000006800C000001AC0B-000000067F000080000006800C0000024348__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C000001C000-000000067F000080000006800C0000020000__00000081AFEDBFE0", +"000000067F000080000006800C0000020000-000000067F000080000006800C0000024000__00000081AFEDBFE0", +"000000067F000080000006800C0000024000-000000067F000080000006800C0000028000__00000081AFEDBFE0", +"000000067F000080000006800C0000024348-000000067F000080000006800C000002DAAE__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000028000-000000067F000080000006800C000002C000__00000081AFEDBFE0", +"000000067F000080000006800C000002C000-000000067F000080000006800C0000030000__00000081AFEDBFE0", +"000000067F000080000006800C000002DAAE-000000067F000080000006800C00000371D0__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000030000-000000067F000080000006800C0000034000__00000081AFEDBFE0", +"000000067F000080000006800C0000034000-000000067F000080000006800C0000038000__00000081AFEDBFE0", +"000000067F000080000006800C00000371D0-000000067F000080000006800C000004090B__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000038000-000000067F000080000006800C000003C000__00000081AFEDBFE0", +"000000067F000080000006800C000003C000-000000067F000080000006800C0000040000__00000081AFEDBFE0", +"000000067F000080000006800C0000040000-000000067F000080000006800C0000044000__00000081A164D628", +"000000067F000080000006800C000004090B-030000000000000000000000000000000002__0000007FDCDCE659-000000804F6BFFC1", +"000000067F000080000006800C0000042368-000000067F000080000006800C000004BACE__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000044000-000000067F000080000006800C0000048000__00000081A164D628", +"000000067F000080000006800C0000048000-000000067F000080000006800C000004C000__00000081A164D628", +"000000067F000080000006800C000004BACE-000000067F000080000006800C0000055202__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000004C000-000000067F000080000006800C0000050000__00000081A164D628", +"000000067F000080000006800C0000050000-000000067F000080000006800C0000054000__00000081A164D628", +"000000067F000080000006800C0000054000-000000067F000080000006800C0000058000__00000081A164D628", 
+"000000067F000080000006800C0000055202-000000067F000080000006800C000005E90D__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000058000-000000067F000080000006800C000005C000__00000081A164D628", +"000000067F000080000006800C000005C000-000000067F000080000006800C0000060000__00000081A164D628", +"000000067F000080000006800C000005E90D-000000067F000080000006800C000006802B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000060000-000000067F000080000006800C0000064000__00000081A164D628", +"000000067F000080000006800C0000064000-000000067F000080000006800C0000068000__00000081A164D628", +"000000067F000080000006800C0000068000-000000067F000080000006800C000006C000__00000081A164D628", +"000000067F000080000006800C000006802B-000000067F000080000006800C0000071782__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000006C000-000000067F000080000006800C0000070000__00000081A164D628", +"000000067F000080000006800C0000070000-000000067F000080000006800C0000074000__00000081A164D628", +"000000067F000080000006800C0000071782-000000067F000080000006800C000007AEE8__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000074000-000000067F000080000006800C0000078000__00000081A164D628", +"000000067F000080000006800C0000078000-000000067F000080000006800C000007C000__00000081A164D628", +"000000067F000080000006800C000007AEE8-000000067F000080000006800C000008460B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C000007C000-000000067F000080000006800C0000080000__00000081A164D628", +"000000067F000080000006800C0000080000-000000067F000080000006800C0000084000__00000081A164D628", +"000000067F000080000006800C0000084000-000000067F000080000006800C0000088000__00000081A164D628", +"000000067F000080000006800C000008460B-000000067F000080000006800C000008DD71__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000088000-000000067F000080000006800C000008C000__00000081A164D628", +"000000067F000080000006800C000008C000-000000067F000080000006800C0000090000__00000081A164D628", +"000000067F000080000006800C000008DD71-000000067F000080000006800C00000974D7__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000090000-000000067F000080000006800C0000094000__00000081A164D628", +"000000067F000080000006800C0000094000-000000067F000080000006800C0000098000__00000081A164D628", +"000000067F000080000006800C00000974D7-000000067F000080000006800C00000A0C0B__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C0000098000-000000067F000080000006800C000009C000__00000081A164D628", +"000000067F000080000006800C000009C000-000000067F000080000006800C00000A0000__00000081A164D628", +"000000067F000080000006800C00000A0000-000000067F000080000006800C00000A4000__00000081A164D628", +"000000067F000080000006800C00000A0C0B-000000067F000080000006800C0100000000__000000804F6BFFC1-00000080EF2FF5B9", +"000000067F000080000006800C00000A4000-000000067F000080000006800C00000A8000__00000081A164D628", +"000000067F000080000006800C00000A8000-000000067F000080000006800C00000AC000__00000081A164D628", +"000000067F000080000006800C00000A8D4C-000000067F000080000006800C00000B24B2__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000AC000-000000067F000080000006800C00000B0000__00000081A164D628", +"000000067F000080000006800C00000B0000-000000067F000080000006800C00000B4000__00000081A164D628", +"000000067F000080000006800C00000B24B2-000000067F000080000006800C00000BBC0B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000B4000-000000067F000080000006800C00000B8000__00000081A164D628", 
+"000000067F000080000006800C00000B8000-000000067F000080000006800C00000BC000__00000081A164D628", +"000000067F000080000006800C00000BBC0B-000000067F000080000006800C00000C533F__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000BC000-000000067F000080000006800C00000C0000__00000081A164D628", +"000000067F000080000006800C00000C0000-000000067F000080000006800C00000C4000__00000081A164D628", +"000000067F000080000006800C00000C4000-000000067F000080000006800C00000C8000__00000081A164D628", +"000000067F000080000006800C00000C533F-000000067F000080000006800C00000CEAA5__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000C8000-000000067F000080000006800C00000CC000__00000081A164D628", +"000000067F000080000006800C00000CC000-000000067F000080000006800C00000D0000__00000081A164D628", +"000000067F000080000006800C00000CEAA5-000000067F000080000006800C00000D81BE__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000D0000-000000067F000080000006800C00000D4000__00000081A164D628", +"000000067F000080000006800C00000D4000-000000067F000080000006800C00000D8000__00000081A164D628", +"000000067F000080000006800C00000D8000-000000067F000080000006800C00000DC000__00000081A164D628", +"000000067F000080000006800C00000D81BE-000000067F000080000006800C00000E190B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000DC000-000000067F000080000006800C00000E0000__00000081A164D628", +"000000067F000080000006800C00000E0000-000000067F000080000006800C00000E4000__00000081A164D628", +"000000067F000080000006800C00000E190B-000000067F000080000006800C00000EB071__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000E4000-000000067F000080000006800C00000E8000__00000081A164D628", +"000000067F000080000006800C00000E8000-000000067F000080000006800C00000EC000__00000081A164D628", +"000000067F000080000006800C00000EB071-000000067F000080000006800C00000F47AC__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000EC000-000000067F000080000006800C00000F0000__00000081A164D628", +"000000067F000080000006800C00000F0000-000000067F000080000006800C00000F4000__00000081A164D628", +"000000067F000080000006800C00000F4000-000000067F000080000006800C00000F8000__00000081A164D628", +"000000067F000080000006800C00000F47AC-000000067F000080000006800C00000FDF0A__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C00000F8000-000000067F000080000006800C00000FC000__00000081A164D628", +"000000067F000080000006800C00000FC000-000000067F000080000006800C0000100000__00000081A164D628", +"000000067F000080000006800C00000FDF0A-000000067F000080000006800C000010762B__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C0000100000-000000067F000080000006800C0000104000__00000081A164D628", +"000000067F000080000006800C0000104000-000000067F000080000006800C0000108000__00000081A164D628", +"000000067F000080000006800C000010762B-000000067F000080000006800C0000110D88__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006800C0000108000-030000000000000000000000000000000002__00000081A164D628", +"000000067F000080000006800C0000110D88-010000000000000001000000030000000014__00000080EF2FF5B9-00000081AFAF5FD1", +"000000067F000080000006801400000044E4-000000067F0000800000068014000000C3F5__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F0000800000068014000000C3F5-000000067F00008000000680140000014303__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F00008000000680140000014303-000000067F0000800000068014000001C214__00000081AFAF5FD1-0000008215AFE5A9", 
+"000000067F0000800000068014000001C214-000000067F00008000000680140000024125__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F00008000000680140000024125-000000067F0000800000068014000002C035__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F0000800000068014000002C035-000000067F000080000006A00C00000072CA__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F000080000006A00C0000000000-000000067F000080000006A00C0000004000__00000083D5DE3FD0", +"000000067F000080000006A00C0000004000-000000067F000080000006A00C0000008000__00000083D5DE3FD0", +"000000067F000080000006A00C00000072CA-030000000000000000000000000000000002__00000081AFAF5FD1-0000008215AFE5A9", +"000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000083865C64B8", +"000000067F000080000006A00C0000008000-000000067F000080000006A00C000000C000__00000084A1F03030", +"000000067F000080000006A00C00000096E3-000000067F000080000006A00C0000012E0B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000083865C64B8", +"000000067F000080000006A00C000000C000-000000067F000080000006A00C0000010000__00000084A1F03030", +"000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000083865C64B8", +"000000067F000080000006A00C0000010000-000000067F000080000006A00C0000014000__00000084A1F03030", +"000000067F000080000006A00C0000012E0B-000000067F000080000006A00C000001C571__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000083865C64B8", +"000000067F000080000006A00C0000014000-000000067F000080000006A00C0000018000__00000084A1F03030", +"000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000083865C64B8", +"000000067F000080000006A00C0000018000-000000067F000080000006A00C000001C000__00000084A1F03030", +"000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000083865C64B8", +"000000067F000080000006A00C000001C000-000000067F000080000006A00C0000020000__00000084A1F03030", +"000000067F000080000006A00C000001C571-000000067F000080000006A00C0000025CD7__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000083865C64B8", +"000000067F000080000006A00C0000020000-000000067F000080000006A00C0000024000__00000084A1F03030", +"000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000083865C64B8", +"000000067F000080000006A00C0000024000-000000067F000080000006A00C0000028000__00000084A1F03030", +"000000067F000080000006A00C0000025CD7-000000067F000080000006A00C000002F40B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000083865C64B8", +"000000067F000080000006A00C0000028000-000000067F000080000006A00C000002C000__00000084A1F03030", +"000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000083865C64B8", +"000000067F000080000006A00C000002C000-000000067F000080000006A00C0000030000__00000084A1F03030", +"000000067F000080000006A00C000002F40B-000000067F000080000006A00C0000038B1E__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000083865C64B8", +"000000067F000080000006A00C0000030000-000000067F000080000006A00C0000034000__00000084A1F03030", +"000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000083865C64B8", +"000000067F000080000006A00C0000034000-000000067F000080000006A00C0000038000__00000084A1F03030", 
+"000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000083865C64B8", +"000000067F000080000006A00C0000038000-000000067F000080000006A00C000003C000__00000084A1F03030", +"000000067F000080000006A00C0000038B1E-000000067F000080000006A00C0000042284__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000083865C64B8", +"000000067F000080000006A00C000003C000-000000067F000080000006A00C0000040000__00000084A1F03030", +"000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000083865C64B8", +"000000067F000080000006A00C0000040000-000000067F000080000006A00C0000044000__00000084A1F03030", +"000000067F000080000006A00C0000042284-000000067F000080000006A00C000004B9EA__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000083865C64B8", +"000000067F000080000006A00C0000044000-000000067F000080000006A00C0000048000__00000084A1F03030", +"000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000083865C64B8", +"000000067F000080000006A00C0000048000-000000067F000080000006A00C000004C000__00000084A1F03030", +"000000067F000080000006A00C000004B9EA-000000067F000080000006A00C000005510B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000083865C64B8", +"000000067F000080000006A00C000004C000-000000067F000080000006A00C0000050000__00000084A1F03030", +"000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000083865C64B8", +"000000067F000080000006A00C0000050000-000000067F000080000006A00C0000054000__00000084A1F03030", +"000000067F000080000006A00C000005198B-000000067F000080000006A00C00000A31A6__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000083865C64B8", +"000000067F000080000006A00C0000054000-000000067F000080000006A00C0000058000__00000084A1F03030", +"000000067F000080000006A00C000005510B-000000067F000080000006A00C000005E871__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000083865C64B8", +"000000067F000080000006A00C0000058000-000000067F000080000006A00C000005C000__00000084A1F03030", +"000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000083865C64B8", +"000000067F000080000006A00C000005C000-000000067F000080000006A00C0000060000__00000084A1F03030", +"000000067F000080000006A00C000005E871-000000067F000080000006A00C0000067F8B__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000083865C64B8", +"000000067F000080000006A00C0000060000-000000067F000080000006A00C0000064000__00000084A1F03030", +"000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000083865C64B8", +"000000067F000080000006A00C0000064000-000000067F000080000006A00C0000068000__00000084A1F03030", +"000000067F000080000006A00C0000067F8B-000000067F000080000006A00C0100000000__0000008215AFE5A9-00000082B573F579", +"000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000083865C64B8", +"000000067F000080000006A00C0000068000-000000067F000080000006A00C000006C000__00000084A1F03030", +"000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000083865C64B8", +"000000067F000080000006A00C000006C000-000000067F000080000006A00C0000070000__00000084A1F03030", 
+"000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000083865C64B8", +"000000067F000080000006A00C0000070000-000000067F000080000006A00C0000074000__00000084A1F03030", +"000000067F000080000006A00C00000703EC-000000067F000080000006A00C0000079B0C__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000083865C64B8", +"000000067F000080000006A00C0000074000-000000067F000080000006A00C0000078000__00000084A1F03030", +"000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000083865C64B8", +"000000067F000080000006A00C0000078000-000000067F000080000006A00C000007C000__00000084A1F03030", +"000000067F000080000006A00C0000079B0C-000000067F000080000006A00C0000083272__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000083865C64B8", +"000000067F000080000006A00C000007C000-000000067F000080000006A00C0000080000__00000084A1F03030", +"000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000083865C64B8", +"000000067F000080000006A00C0000080000-000000067F000080000006A00C0000084000__00000084A1F03030", +"000000067F000080000006A00C0000083272-000000067F000080000006A00C000008C9D8__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000083865C64B8", +"000000067F000080000006A00C0000084000-000000067F000080000006A00C0000088000__00000084A1F03030", +"000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000083865C64B8", +"000000067F000080000006A00C0000088000-000000067F000080000006A00C000008C000__00000084A1F03030", +"000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000083865C64B8", +"000000067F000080000006A00C000008C000-000000067F000080000006A00C0000090000__00000084A1F03030", +"000000067F000080000006A00C000008C9D8-000000067F000080000006A00C0000096129__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000083865C64B8", +"000000067F000080000006A00C0000090000-000000067F000080000006A00C0000094000__00000084A1F03030", +"000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000083865C64B8", +"000000067F000080000006A00C0000094000-000000067F000080000006A00C0000098000__00000084A1F03030", +"000000067F000080000006A00C0000096129-000000067F000080000006A00C000009F88F__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000083865C64B8", +"000000067F000080000006A00C0000098000-000000067F000080000006A00C000009C000__00000084A1F03030", +"000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000083865C64B8", +"000000067F000080000006A00C000009C000-000000067F000080000006A00C00000A0000__00000084A1F03030", +"000000067F000080000006A00C000009F88F-000000067F000080000006A00C00000A8F9F__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000083865C64B8", +"000000067F000080000006A00C00000A0000-000000067F000080000006A00C00000A4000__00000084A1F03030", +"000000067F000080000006A00C00000A31B0-000000067F000080000006A00C00000F4C19__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000083865C64B8", +"000000067F000080000006A00C00000A4000-000000067F000080000006A00C00000A8000__00000084A1F03030", 
+"000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000083865C64B8", +"000000067F000080000006A00C00000A8000-000000067F000080000006A00C00000AC000__00000084A1F03030", +"000000067F000080000006A00C00000A8F9F-000000067F000080000006A00C00000B2705__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000083865C64B8", +"000000067F000080000006A00C00000AC000-000000067F000080000006A00C00000B0000__00000084A1F03030", +"000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000083865C64B8", +"000000067F000080000006A00C00000B0000-000000067F000080000006A00C00000B4000__00000084A1F03030", +"000000067F000080000006A00C00000B2705-000000067F000080000006A00C00000BBE10__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000083865C64B8", +"000000067F000080000006A00C00000B4000-000000067F000080000006A00C00000B8000__00000084A1F03030", +"000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000083865C64B8", +"000000067F000080000006A00C00000B8000-000000067F000080000006A00C00000BC000__00000084A1F03030", +"000000067F000080000006A00C00000BBE10-000000067F000080000006A00C00000C5543__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000083865C64B8", +"000000067F000080000006A00C00000BC000-000000067F000080000006A00C00000C0000__00000084A1F03030", +"000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000083865C64B8", +"000000067F000080000006A00C00000C0000-000000067F000080000006A00C00000C4000__00000084A1F03030", +"000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000083865C64B8", +"000000067F000080000006A00C00000C4000-000000067F000080000006A00C00000C8000__00000084A1F03030", +"000000067F000080000006A00C00000C4CC8-000000067F000080000006A0140000001CBC__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A00C00000C5543-000000067F000080000006A00C00000CECA9__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000083865C64B8", +"000000067F000080000006A00C00000C8000-000000067F000080000006A00C00000CC000__00000084A1F03030", +"000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000083865C64B8", +"000000067F000080000006A00C00000CC000-000000067F000080000006A00C00000D0000__00000084A1F03030", +"000000067F000080000006A00C00000CECA9-000000067F000080000006A00C00000D83C0__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000083865C64B8", +"000000067F000080000006A00C00000D0000-000000067F000080000006A00C00000D4000__00000084A1F03030", +"000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000083865C64B8", +"000000067F000080000006A00C00000D4000-000000067F000080000006A00C00000D8000__00000084A1F03030", +"000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000083865C64B8", +"000000067F000080000006A00C00000D8000-000000067F000080000006A00C00000DC000__00000084A1F03030", +"000000067F000080000006A00C00000D83C0-000000067F000080000006A00C00000E1B0A__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000083865C64B8", +"000000067F000080000006A00C00000DC000-000000067F000080000006A00C00000E0000__00000084A1F03030", 
+"000000067F000080000006A00C00000E0000-000000067F000080000006A00C00000E4000__00000084A1F03030", +"000000067F000080000006A00C00000E0000-030000000000000000000000000000000002__00000083865C64B8", +"000000067F000080000006A00C00000E1B0A-000000067F000080000006A00C00000EB270__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000E4000-000000067F000080000006A00C00000E8000__00000084A1F03030", +"000000067F000080000006A00C00000E8000-000000067F000080000006A00C00000EC000__00000084A1F03030", +"000000067F000080000006A00C00000EB270-000000067F000080000006A00C00000F49AA__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000EC000-000000067F000080000006A00C00000F0000__00000084A1F03030", +"000000067F000080000006A00C00000F0000-000000067F000080000006A00C00000F4000__00000084A1F03030", +"000000067F000080000006A00C00000F4000-000000067F000080000006A00C00000F8000__00000084A1F03030", +"000000067F000080000006A00C00000F49AA-000000067F000080000006A00C00000FE10A__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C00000F4C23-000000067F000080000006A014000000E1C2__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A00C00000F8000-000000067F000080000006A00C00000FC000__00000084A1F03030", +"000000067F000080000006A00C00000FC000-000000067F000080000006A00C0000100000__00000084A1F03030", +"000000067F000080000006A00C00000FE10A-000000067F000080000006A00C000010782C__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000100000-000000067F000080000006A00C0000104000__00000084A1F03030", +"000000067F000080000006A00C0000104000-000000067F000080000006A00C0000108000__00000084A1F03030", +"000000067F000080000006A00C000010782C-000000067F000080000006A00C0000110F88__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A00C0000108000-000000067F000080000006A00C000010C000__00000084A1F03030", +"000000067F000080000006A00C000010C000-000000067F000080000006A00C0000110000__00000084A1F03030", +"000000067F000080000006A00C0000110000-000000067F000080000006A0120100000000__00000084A1F03030", +"000000067F000080000006A00C0000110F88-010000000000000001000000030000000014__00000082B573F579-00000083D5901FD9", +"000000067F000080000006A0140000000000-000000067F000080000006A0140000004000__00000084A1F03030", +"000000067F000080000006A0140000001CBC-000000067F000080000006A01400000088E1__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000004000-000000067F000080000006A0140000008000__00000084A1F03030", +"000000067F000080000006A0140000008000-000000067F000080000006A014000000C000__00000084A1F03030", +"000000067F000080000006A01400000088E1-000000067F000080000006A014000000F459__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A014000000C000-000000067F000080000006A0140000010000__00000084A1F03030", +"000000067F000080000006A014000000E1C2-000000067F000080000006A014000002682C__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A014000000F459-000000067F000080000006A0140000016068__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000010000-000000067F000080000006A0140000014000__00000084A1F03030", +"000000067F000080000006A0140000014000-000000067F000080000006A0140000018000__00000084A1F03030", +"000000067F000080000006A0140000016068-000000067F000080000006A014000001CC14__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000018000-000000067F000080000006A014000001C000__00000084A1F03030", +"000000067F000080000006A014000001C000-000000067F000080000006A0140000020000__00000084A1F03030", 
+"000000067F000080000006A014000001CC14-000000067F000080000006A014000002384E__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000020000-000000067F000080000006A0140000024000__00000084A1F03030", +"000000067F000080000006A014000002384E-000000067F000080000006A014000002A467__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A0140000024000-000000067F000080000006A0140000028000__00000084A1F03030", +"000000067F000080000006A0140000026831-030000000000000000000000000000000002__000000844F1A6789-00000084A325AA01", +"000000067F000080000006A0140000028000-000000067F000080000006A014000002C000__00000084A1F03030", +"000000067F000080000006A014000002A467-030000000000000000000000000000000002__00000083D5901FD9-000000844F1A6789", +"000000067F000080000006A014000002C000-030000000000000000000000000000000002__00000084A1F03030", +"000000067F000080000006C00C0000000000-000000067F000080000006C00C0000004000__00000086746BDFE0", +"000000067F000080000006C00C0000004000-000000067F000080000006C00C0000008000__00000086746BDFE0", +"000000067F000080000006C00C0000008000-000000067F000080000006C00C000000C000__00000086746BDFE0", +"000000067F000080000006C00C00000090F5-000000067F000080000006C00C000001280C__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000000C000-000000067F000080000006C00C0000010000__00000086746BDFE0", +"000000067F000080000006C00C0000010000-000000067F000080000006C00C0000014000__00000086746BDFE0", +"000000067F000080000006C00C000001280C-000000067F000080000006C00C000001BF72__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000014000-000000067F000080000006C00C0000018000__00000086746BDFE0", +"000000067F000080000006C00C0000018000-000000067F000080000006C00C000001C000__00000086746BDFE0", +"000000067F000080000006C00C000001BF72-000000067F000080000006C00C00000256D8__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000001C000-000000067F000080000006C00C0000020000__00000086746BDFE0", +"000000067F000080000006C00C0000020000-000000067F000080000006C00C0000024000__00000086746BDFE0", +"000000067F000080000006C00C0000024000-000000067F000080000006C00C0000028000__00000086746BDFE0", +"000000067F000080000006C00C00000256D8-000000067F000080000006C00C000002EE0B__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000028000-000000067F000080000006C00C000002C000__00000086746BDFE0", +"000000067F000080000006C00C000002C000-000000067F000080000006C00C0000030000__00000086746BDFE0", +"000000067F000080000006C00C000002EE0B-000000067F000080000006C00C0000038521__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000030000-000000067F000080000006C00C0000034000__00000086746BDFE0", +"000000067F000080000006C00C0000034000-000000067F000080000006C00C0000038000__00000086746BDFE0", +"000000067F000080000006C00C0000038000-000000067F000080000006C00C000003C000__00000086746BDFE0", +"000000067F000080000006C00C0000038521-000000067F000080000006C00C0000041C87__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000003C000-000000067F000080000006C00C0000040000__00000086746BDFE0", +"000000067F000080000006C00C0000040000-000000067F000080000006C00C0000044000__00000086746BDFE0", +"000000067F000080000006C00C0000041C87-000000067F000080000006C00C000004B3ED__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C0000044000-000000067F000080000006C00C0000048000__00000086746BDFE0", +"000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__00000086720CFFF0", +"000000067F000080000006C00C0000048000-000000067F000080000006C00C000004C000__000000873B520940", 
+"000000067F000080000006C00C000004B3ED-030000000000000000000000000000000002__00000084A325AA01-00000085239DFB81", +"000000067F000080000006C00C000004BAC4-000000067F000080000006C00C00000551F9__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__00000086720CFFF0", +"000000067F000080000006C00C000004C000-000000067F000080000006C00C0000050000__000000873B520940", +"000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__00000086720CFFF0", +"000000067F000080000006C00C0000050000-000000067F000080000006C00C0000054000__000000873B520940", +"000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__00000086720CFFF0", +"000000067F000080000006C00C0000054000-000000067F000080000006C00C0000058000__000000873B520940", +"000000067F000080000006C00C00000551F9-000000067F000080000006C00C000005E90C__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000055EB3-000000067F000080000006C00C00000AB316__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__00000086720CFFF0", +"000000067F000080000006C00C0000058000-000000067F000080000006C00C000005C000__000000873B520940", +"000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__00000086720CFFF0", +"000000067F000080000006C00C000005C000-000000067F000080000006C00C0000060000__000000873B520940", +"000000067F000080000006C00C000005E90C-000000067F000080000006C00C000006802C__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__00000086720CFFF0", +"000000067F000080000006C00C0000060000-000000067F000080000006C00C0000064000__000000873B520940", +"000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__00000086720CFFF0", +"000000067F000080000006C00C0000064000-000000067F000080000006C00C0000068000__000000873B520940", +"000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__00000086720CFFF0", +"000000067F000080000006C00C0000068000-000000067F000080000006C00C000006C000__000000873B520940", +"000000067F000080000006C00C000006802C-000000067F000080000006C00C0000071783__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__00000086720CFFF0", +"000000067F000080000006C00C000006C000-000000067F000080000006C00C0000070000__000000873B520940", +"000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__00000086720CFFF0", +"000000067F000080000006C00C0000070000-000000067F000080000006C00C0000074000__000000873B520940", +"000000067F000080000006C00C0000071783-000000067F000080000006C00C000007AEE9__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__00000086720CFFF0", +"000000067F000080000006C00C0000074000-000000067F000080000006C00C0000078000__000000873B520940", +"000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__00000086720CFFF0", +"000000067F000080000006C00C0000078000-000000067F000080000006C00C000007C000__000000873B520940", +"000000067F000080000006C00C000007AEE9-000000067F000080000006C00C000008460B__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__00000086720CFFF0", +"000000067F000080000006C00C000007C000-000000067F000080000006C00C0000080000__000000873B520940", +"000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__00000086720CFFF0", 
+"000000067F000080000006C00C0000080000-000000067F000080000006C00C0000084000__000000873B520940", +"000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__00000086720CFFF0", +"000000067F000080000006C00C0000084000-000000067F000080000006C00C0000088000__000000873B520940", +"000000067F000080000006C00C000008460B-000000067F000080000006C00C000008DD71__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__00000086720CFFF0", +"000000067F000080000006C00C0000088000-000000067F000080000006C00C000008C000__000000873B520940", +"000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__00000086720CFFF0", +"000000067F000080000006C00C000008C000-000000067F000080000006C00C0000090000__000000873B520940", +"000000067F000080000006C00C000008DD71-000000067F000080000006C00C00000974D7__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__00000086720CFFF0", +"000000067F000080000006C00C0000090000-000000067F000080000006C00C0000094000__000000873B520940", +"000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__00000086720CFFF0", +"000000067F000080000006C00C0000094000-000000067F000080000006C00C0000098000__000000873B520940", +"000000067F000080000006C00C00000974D7-000000067F000080000006C00C00000A0C0B__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__00000086720CFFF0", +"000000067F000080000006C00C0000098000-000000067F000080000006C00C000009C000__000000873B520940", +"000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__00000086720CFFF0", +"000000067F000080000006C00C000009C000-000000067F000080000006C00C00000A0000__000000873B520940", +"000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__00000086720CFFF0", +"000000067F000080000006C00C00000A0000-000000067F000080000006C00C00000A4000__000000873B520940", +"000000067F000080000006C00C00000A0C0B-000000067F000080000006C00C00000AA371__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__00000086720CFFF0", +"000000067F000080000006C00C00000A4000-000000067F000080000006C00C00000A8000__000000873B520940", +"000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__00000086720CFFF0", +"000000067F000080000006C00C00000A8000-000000067F000080000006C00C00000AC000__000000873B520940", +"000000067F000080000006C00C00000AA371-000000067F000080000006C00C00000B3AD7__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000AB316-000000067F000080000006C00C00001015F1__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__00000086720CFFF0", +"000000067F000080000006C00C00000AC000-000000067F000080000006C00C00000B0000__000000873B520940", +"000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__00000086720CFFF0", +"000000067F000080000006C00C00000B0000-000000067F000080000006C00C00000B4000__000000873B520940", +"000000067F000080000006C00C00000B3AD7-000000067F000080000006C00C0100000000__00000085239DFB81-00000085D35BF439", +"000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__00000086720CFFF0", +"000000067F000080000006C00C00000B4000-000000067F000080000006C00C00000B8000__000000873B520940", +"000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__00000086720CFFF0", 
+"000000067F000080000006C00C00000B8000-000000067F000080000006C00C00000BC000__000000873B520940", +"000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__00000086720CFFF0", +"000000067F000080000006C00C00000BC000-000000067F000080000006C00C00000C0000__000000873B520940", +"000000067F000080000006C00C00000BC102-000000067F000080000006C00C00000C580D__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000BFB6E-000000067F000080000006C01400000016BC__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__00000086720CFFF0", +"000000067F000080000006C00C00000C0000-000000067F000080000006C00C00000C4000__000000873B520940", +"000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__00000086720CFFF0", +"000000067F000080000006C00C00000C4000-000000067F000080000006C00C00000C8000__000000873B520940", +"000000067F000080000006C00C00000C580D-000000067F000080000006C00C00000CEF73__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__00000086720CFFF0", +"000000067F000080000006C00C00000C8000-000000067F000080000006C00C00000CC000__000000873B520940", +"000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__00000086720CFFF0", +"000000067F000080000006C00C00000CC000-000000067F000080000006C00C00000D0000__000000873B520940", +"000000067F000080000006C00C00000CEF73-000000067F000080000006C00C00000D86D9__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__00000086720CFFF0", +"000000067F000080000006C00C00000D0000-000000067F000080000006C00C00000D4000__000000873B520940", +"000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__00000086720CFFF0", +"000000067F000080000006C00C00000D4000-000000067F000080000006C00C00000D8000__000000873B520940", +"000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__00000086720CFFF0", +"000000067F000080000006C00C00000D8000-000000067F000080000006C00C00000DC000__000000873B520940", +"000000067F000080000006C00C00000D86D9-000000067F000080000006C00C00000E1E0C__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__00000086720CFFF0", +"000000067F000080000006C00C00000DC000-000000067F000080000006C00C00000E0000__000000873B520940", +"000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__00000086720CFFF0", +"000000067F000080000006C00C00000E0000-000000067F000080000006C00C00000E4000__000000873B520940", +"000000067F000080000006C00C00000E1E0C-000000067F000080000006C00C00000EB572__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__00000086720CFFF0", +"000000067F000080000006C00C00000E4000-000000067F000080000006C00C00000E8000__000000873B520940", +"000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__00000086720CFFF0", +"000000067F000080000006C00C00000E8000-000000067F000080000006C00C00000EC000__000000873B520940", +"000000067F000080000006C00C00000EB572-000000067F000080000006C00C00000F4CD8__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__00000086720CFFF0", +"000000067F000080000006C00C00000EC000-000000067F000080000006C00C00000F0000__000000873B520940", +"000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__00000086720CFFF0", 
+"000000067F000080000006C00C00000F0000-000000067F000080000006C00C00000F4000__000000873B520940", +"000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__00000086720CFFF0", +"000000067F000080000006C00C00000F4000-000000067F000080000006C00C00000F8000__000000873B520940", +"000000067F000080000006C00C00000F4CD8-000000067F000080000006C00C00000FE40B__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__00000086720CFFF0", +"000000067F000080000006C00C00000F8000-000000067F000080000006C00C00000FC000__000000873B520940", +"000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__00000086720CFFF0", +"000000067F000080000006C00C00000FC000-000000067F000080000006C00C0000100000__000000873B520940", +"000000067F000080000006C00C00000FE40B-000000067F000080000006C00C0000107B27__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__00000086720CFFF0", +"000000067F000080000006C00C0000100000-000000067F000080000006C00C0000104000__000000873B520940", +"000000067F000080000006C00C00001015F3-000000067F000080000006C0140000013635__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__00000086720CFFF0", +"000000067F000080000006C00C0000104000-000000067F000080000006C00C0000108000__000000873B520940", +"000000067F000080000006C00C0000107B27-000000067F000080000006C00C000011128D__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__00000086720CFFF0", +"000000067F000080000006C00C0000108000-000000067F000080000006C00C000010C000__000000873B520940", +"000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__00000086720CFFF0", +"000000067F000080000006C00C000010C000-000000067F000080000006C00C0000110000__000000873B520940", +"000000067F000080000006C00C0000110000-000000067F000080000006C0120100000000__000000873B520940", +"000000067F000080000006C00C0000110000-030000000000000000000000000000000002__00000086720CFFF0", +"000000067F000080000006C00C000011128D-010000000000000001000000030000000017__00000085D35BF439-0000008673817FC9", +"000000067F000080000006C0140000000000-000000067F000080000006C0140000004000__000000873B520940", +"000000067F000080000006C01400000016BC-000000067F000080000006C014000000830F__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000004000-000000067F000080000006C0140000008000__000000873B520940", +"000000067F000080000006C0140000008000-000000067F000080000006C014000000C000__000000873B520940", +"000000067F000080000006C014000000830F-000000067F000080000006C014000000EF5B__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C014000000C000-000000067F000080000006C0140000010000__000000873B520940", +"000000067F000080000006C014000000EF5B-000000067F000080000006C0140000015BA7__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000010000-000000067F000080000006C0140000014000__000000873B520940", +"000000067F000080000006C0140000013636-000000067F000080000006C014000002DB5F__00000086ED29E361-000000873C9A2551", +"000000067F000080000006C0140000014000-000000067F000080000006C0140000018000__000000873B520940", +"000000067F000080000006C0140000015BA7-000000067F000080000006C014000001C7F0__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000018000-000000067F000080000006C014000001C000__000000873B520940", +"000000067F000080000006C014000001C000-000000067F000080000006C0140000020000__000000873B520940", 
+"000000067F000080000006C014000001C7F0-000000067F000080000006C0140000023430__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000020000-000000067F000080000006C0140000024000__000000873B520940", +"000000067F000080000006C0140000023430-000000067F000080000006C014000002A049__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C0140000024000-000000067F000080000006C0140000028000__000000873B520940", +"000000067F000080000006C0140000028000-000000067F000080000006C014000002C000__000000873B520940", +"000000067F000080000006C014000002A049-030000000000000000000000000000000002__0000008673817FC9-00000086ED29E361", +"000000067F000080000006C014000002C000-030000000000000000000000000000000002__000000873B520940", +"000000067F000080000006C014000002DB60-030000000000000000000000000000000002__00000086ED29E361-000000873C9A2551", +"000000067F000080000006E00C0000000000-000000067F000080000006E00C0000004000__000000890CF51FE0", +"000000067F000080000006E00C0000004000-000000067F000080000006E00C0000008000__000000890CF51FE0", +"000000067F000080000006E00C0000008000-000000067F000080000006E00C000000C000__000000890CF51FE0", +"000000067F000080000006E00C00000096C8-000000067F000080000006E00C0000012E0A__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000000C000-000000067F000080000006E00C0000010000__000000890CF51FE0", +"000000067F000080000006E00C0000010000-000000067F000080000006E00C0000014000__000000890CF51FE0", +"000000067F000080000006E00C0000012E0A-000000067F000080000006E00C000001C570__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000014000-000000067F000080000006E00C0000018000__000000890CF51FE0", +"000000067F000080000006E00C0000018000-000000067F000080000006E00C000001C000__000000890CF51FE0", +"000000067F000080000006E00C000001C000-000000067F000080000006E00C0000020000__000000890CF51FE0", +"000000067F000080000006E00C000001C570-000000067F000080000006E00C0000025CD6__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000020000-000000067F000080000006E00C0000024000__000000890CF51FE0", +"000000067F000080000006E00C0000024000-000000067F000080000006E00C0000028000__000000890CF51FE0", +"000000067F000080000006E00C0000025CD6-000000067F000080000006E00C000002F40A__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000028000-000000067F000080000006E00C000002C000__000000890CF51FE0", +"000000067F000080000006E00C000002C000-000000067F000080000006E00C0000030000__000000890CF51FE0", +"000000067F000080000006E00C000002F40A-000000067F000080000006E00C0000038B1D__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000030000-000000067F000080000006E00C0000034000__000000890CF51FE0", +"000000067F000080000006E00C0000034000-000000067F000080000006E00C0000038000__000000890CF51FE0", +"000000067F000080000006E00C0000038000-000000067F000080000006E00C000003C000__000000890CF51FE0", +"000000067F000080000006E00C0000038B1D-000000067F000080000006E00C0000042283__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000003C000-000000067F000080000006E00C0000040000__000000890CF51FE0", +"000000067F000080000006E00C0000040000-000000067F000080000006E00C0000044000__000000890CF51FE0", +"000000067F000080000006E00C0000042283-000000067F000080000006E00C000004B9E9__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C0000044000-000000067F000080000006E00C0000048000__000000890CF51FE0", +"000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__000000890AE2DFC8", +"000000067F000080000006E00C0000048000-000000067F000080000006E00C000004C000__00000089D5AEF6E8", 
+"000000067F000080000006E00C000004B9E9-030000000000000000000000000000000002__000000873C9A2551-00000087BC75E5B1", +"000000067F000080000006E00C000004BACB-000000067F000080000006E00C0000055200__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__000000890AE2DFC8", +"000000067F000080000006E00C000004C000-000000067F000080000006E00C0000050000__00000089D5AEF6E8", +"000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__000000890AE2DFC8", +"000000067F000080000006E00C0000050000-000000067F000080000006E00C0000054000__00000089D5AEF6E8", +"000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__000000890AE2DFC8", +"000000067F000080000006E00C0000054000-000000067F000080000006E00C0000058000__00000089D5AEF6E8", +"000000067F000080000006E00C0000054246-000000067F000080000006E00C00000A83ED__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C0000055200-000000067F000080000006E00C000005E90B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__000000890AE2DFC8", +"000000067F000080000006E00C0000058000-000000067F000080000006E00C000005C000__00000089D5AEF6E8", +"000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__000000890AE2DFC8", +"000000067F000080000006E00C000005C000-000000067F000080000006E00C0000060000__00000089D5AEF6E8", +"000000067F000080000006E00C000005E90B-000000067F000080000006E00C000006802B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__000000890AE2DFC8", +"000000067F000080000006E00C0000060000-000000067F000080000006E00C0000064000__00000089D5AEF6E8", +"000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__000000890AE2DFC8", +"000000067F000080000006E00C0000064000-000000067F000080000006E00C0000068000__00000089D5AEF6E8", +"000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__000000890AE2DFC8", +"000000067F000080000006E00C0000068000-000000067F000080000006E00C000006C000__00000089D5AEF6E8", +"000000067F000080000006E00C000006802B-000000067F000080000006E00C0000071782__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__000000890AE2DFC8", +"000000067F000080000006E00C000006C000-000000067F000080000006E00C0000070000__00000089D5AEF6E8", +"000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__000000890AE2DFC8", +"000000067F000080000006E00C0000070000-000000067F000080000006E00C0000074000__00000089D5AEF6E8", +"000000067F000080000006E00C0000071782-000000067F000080000006E00C000007AEE8__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__000000890AE2DFC8", +"000000067F000080000006E00C0000074000-000000067F000080000006E00C0000078000__00000089D5AEF6E8", +"000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__000000890AE2DFC8", +"000000067F000080000006E00C0000078000-000000067F000080000006E00C000007C000__00000089D5AEF6E8", +"000000067F000080000006E00C000007AEE8-000000067F000080000006E00C000008460B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__000000890AE2DFC8", +"000000067F000080000006E00C000007C000-000000067F000080000006E00C0000080000__00000089D5AEF6E8", +"000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__000000890AE2DFC8", 
+"000000067F000080000006E00C0000080000-000000067F000080000006E00C0000084000__00000089D5AEF6E8", +"000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__000000890AE2DFC8", +"000000067F000080000006E00C0000084000-000000067F000080000006E00C0000088000__00000089D5AEF6E8", +"000000067F000080000006E00C000008460B-000000067F000080000006E00C000008DD71__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__000000890AE2DFC8", +"000000067F000080000006E00C0000088000-000000067F000080000006E00C000008C000__00000089D5AEF6E8", +"000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__000000890AE2DFC8", +"000000067F000080000006E00C000008C000-000000067F000080000006E00C0000090000__00000089D5AEF6E8", +"000000067F000080000006E00C000008DD71-000000067F000080000006E00C00000974D7__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__000000890AE2DFC8", +"000000067F000080000006E00C0000090000-000000067F000080000006E00C0000094000__00000089D5AEF6E8", +"000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__000000890AE2DFC8", +"000000067F000080000006E00C0000094000-000000067F000080000006E00C0000098000__00000089D5AEF6E8", +"000000067F000080000006E00C00000974D7-000000067F000080000006E00C00000A0C0B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__000000890AE2DFC8", +"000000067F000080000006E00C0000098000-000000067F000080000006E00C000009C000__00000089D5AEF6E8", +"000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__000000890AE2DFC8", +"000000067F000080000006E00C000009C000-000000067F000080000006E00C00000A0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__000000890AE2DFC8", +"000000067F000080000006E00C00000A0000-000000067F000080000006E00C00000A4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A0C0B-000000067F000080000006E00C00000AA371__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__000000890AE2DFC8", +"000000067F000080000006E00C00000A4000-000000067F000080000006E00C00000A8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__000000890AE2DFC8", +"000000067F000080000006E00C00000A8000-000000067F000080000006E00C00000AC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000A8407-000000067F000080000006E00C00000FD787__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C00000AA371-000000067F000080000006E00C00000B3AD7__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__000000890AE2DFC8", +"000000067F000080000006E00C00000AC000-000000067F000080000006E00C00000B0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__000000890AE2DFC8", +"000000067F000080000006E00C00000B0000-000000067F000080000006E00C00000B4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B3AD7-000000067F000080000006E00C00000BD20B__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__000000890AE2DFC8", +"000000067F000080000006E00C00000B4000-000000067F000080000006E00C00000B8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000B6F42-000000067F000080000006E0140000000EEF__000000890C5B6001-0000008985FD3611", 
+"000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__000000890AE2DFC8", +"000000067F000080000006E00C00000B8000-000000067F000080000006E00C00000BC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__000000890AE2DFC8", +"000000067F000080000006E00C00000BC000-000000067F000080000006E00C00000C0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000BD20B-000000067F000080000006E00C0100000000__00000087BC75E5B1-000000887C2DFE59", +"000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__000000890AE2DFC8", +"000000067F000080000006E00C00000C0000-000000067F000080000006E00C00000C4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__000000890AE2DFC8", +"000000067F000080000006E00C00000C4000-000000067F000080000006E00C00000C8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000C5883-000000067F000080000006E00C00000CEFE9__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__000000890AE2DFC8", +"000000067F000080000006E00C00000C8000-000000067F000080000006E00C00000CC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__000000890AE2DFC8", +"000000067F000080000006E00C00000CC000-000000067F000080000006E00C00000D0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000CEFE9-000000067F000080000006E00C00000D872B__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__000000890AE2DFC8", +"000000067F000080000006E00C00000D0000-000000067F000080000006E00C00000D4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__000000890AE2DFC8", +"000000067F000080000006E00C00000D4000-000000067F000080000006E00C00000D8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__000000890AE2DFC8", +"000000067F000080000006E00C00000D8000-000000067F000080000006E00C00000DC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000D872B-000000067F000080000006E00C00000E1E91__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__000000890AE2DFC8", +"000000067F000080000006E00C00000DC000-000000067F000080000006E00C00000E0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__000000890AE2DFC8", +"000000067F000080000006E00C00000E0000-000000067F000080000006E00C00000E4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E1E91-000000067F000080000006E00C00000EB5F7__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__000000890AE2DFC8", +"000000067F000080000006E00C00000E4000-000000067F000080000006E00C00000E8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__000000890AE2DFC8", +"000000067F000080000006E00C00000E8000-000000067F000080000006E00C00000EC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000EB5F7-000000067F000080000006E00C00000F4D0C__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__000000890AE2DFC8", +"000000067F000080000006E00C00000EC000-000000067F000080000006E00C00000F0000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__000000890AE2DFC8", 
+"000000067F000080000006E00C00000F0000-000000067F000080000006E00C00000F4000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__000000890AE2DFC8", +"000000067F000080000006E00C00000F4000-000000067F000080000006E00C00000F8000__00000089D5AEF6E8", +"000000067F000080000006E00C00000F4D0C-000000067F000080000006E00C00000FE472__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__000000890AE2DFC8", +"000000067F000080000006E00C00000F8000-000000067F000080000006E00C00000FC000__00000089D5AEF6E8", +"000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__000000890AE2DFC8", +"000000067F000080000006E00C00000FC000-000000067F000080000006E00C0000100000__00000089D5AEF6E8", +"000000067F000080000006E00C00000FD78D-000000067F000080000006E0140000011DB5__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E00C00000FE472-000000067F000080000006E00C0000107B8E__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__000000890AE2DFC8", +"000000067F000080000006E00C0000100000-000000067F000080000006E00C0000104000__00000089D5AEF6E8", +"000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__000000890AE2DFC8", +"000000067F000080000006E00C0000104000-000000067F000080000006E00C0000108000__00000089D5AEF6E8", +"000000067F000080000006E00C0000107B8E-000000067F000080000006E00C00001112F4__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__000000890AE2DFC8", +"000000067F000080000006E00C0000108000-000000067F000080000006E00C000010C000__00000089D5AEF6E8", +"000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__000000890AE2DFC8", +"000000067F000080000006E00C000010C000-000000067F000080000006E00C0000110000__00000089D5AEF6E8", +"000000067F000080000006E00C0000110000-000000067F000080000006E0120100000000__00000089D5AEF6E8", +"000000067F000080000006E00C0000110000-030000000000000000000000000000000002__000000890AE2DFC8", +"000000067F000080000006E00C00001112F4-01000000000000000100000003000000001A__000000887C2DFE59-000000890C5B6001", +"000000067F000080000006E0140000000000-000000067F000080000006E0140000004000__00000089D5AEF6E8", +"000000067F000080000006E0140000000EEF-000000067F000080000006E0140000007C4F__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000004000-000000067F000080000006E0140000008000__00000089D5AEF6E8", +"000000067F000080000006E0140000007C4F-000000067F000080000006E014000000E97E__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000008000-000000067F000080000006E014000000C000__00000089D5AEF6E8", +"000000067F000080000006E014000000C000-000000067F000080000006E0140000010000__00000089D5AEF6E8", +"000000067F000080000006E014000000E97E-000000067F000080000006E01400000156DC__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000010000-000000067F000080000006E0140000014000__00000089D5AEF6E8", +"000000067F000080000006E0140000011DB5-000000067F000080000006E014000002B9CE__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E0140000014000-000000067F000080000006E0140000018000__00000089D5AEF6E8", +"000000067F000080000006E01400000156DC-000000067F000080000006E014000001C468__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000018000-000000067F000080000006E014000001C000__00000089D5AEF6E8", +"000000067F000080000006E014000001C000-000000067F000080000006E0140000020000__00000089D5AEF6E8", 
+"000000067F000080000006E014000001C468-000000067F000080000006E01400000231D5__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000020000-000000067F000080000006E0140000024000__00000089D5AEF6E8", +"000000067F000080000006E01400000231D5-000000067F000080000006E0140000029F96__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E0140000024000-000000067F000080000006E0140000028000__00000089D5AEF6E8", +"000000067F000080000006E0140000028000-000000067F000080000006E014000002C000__00000089D5AEF6E8", +"000000067F000080000006E0140000029F96-030000000000000000000000000000000002__000000890C5B6001-0000008985FD3611", +"000000067F000080000006E014000002B9D0-030000000000000000000000000000000002__0000008985FD3611-00000089D6B8EE99", +"000000067F000080000006E014000002C000-030000000000000000000000000000000002__00000089D5AEF6E8", +"000000067F000080000007000C0000000000-000000067F000080000007000C0000004000__0000008BA730BFE8", +"000000067F000080000007000C0000004000-000000067F000080000007000C0000008000__0000008BA730BFE8", +"000000067F000080000007000C0000008000-000000067F000080000007000C000000C000__0000008BA730BFE8", +"000000067F000080000007000C000000955C-000000067F000080000007000C0000012CC2__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000000C000-000000067F000080000007000C0000010000__0000008BA730BFE8", +"000000067F000080000007000C0000010000-000000067F000080000007000C0000014000__0000008BA730BFE8", +"000000067F000080000007000C0000012CC2-000000067F000080000007000C000001C40A__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000014000-000000067F000080000007000C0000018000__0000008BA730BFE8", +"000000067F000080000007000C0000018000-000000067F000080000007000C000001C000__0000008BA730BFE8", +"000000067F000080000007000C000001C000-000000067F000080000007000C0000020000__0000008BA730BFE8", +"000000067F000080000007000C000001C40A-000000067F000080000007000C0000025B39__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000020000-000000067F000080000007000C0000024000__0000008BA730BFE8", +"000000067F000080000007000C0000024000-000000067F000080000007000C0000028000__0000008BA730BFE8", +"000000067F000080000007000C0000025B39-000000067F000080000007000C000002F29F__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000028000-000000067F000080000007000C000002C000__0000008BA730BFE8", +"000000067F000080000007000C000002C000-000000067F000080000007000C0000030000__0000008BA730BFE8", +"000000067F000080000007000C000002F29F-000000067F000080000007000C00000389B3__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000030000-000000067F000080000007000C0000034000__0000008BA730BFE8", +"000000067F000080000007000C0000034000-000000067F000080000007000C0000038000__0000008BA730BFE8", +"000000067F000080000007000C0000038000-000000067F000080000007000C000003C000__0000008BA730BFE8", +"000000067F000080000007000C00000389B3-000000067F000080000007000C0000042119__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000003C000-000000067F000080000007000C0000040000__0000008BA730BFE8", +"000000067F000080000007000C0000040000-000000067F000080000007000C0000044000__0000008BA730BFE8", +"000000067F000080000007000C0000042119-000000067F000080000007000C000004B87F__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C0000044000-000000067F000080000007000C0000048000__0000008BA730BFE8", +"000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008B9669EDB0", +"000000067F000080000007000C0000048000-000000067F000080000007000C000004C000__0000008C71903720", 
+"000000067F000080000007000C000004B87F-030000000000000000000000000000000002__00000089D6B8EE99-0000008A56BBF739", +"000000067F000080000007000C000004BAD3-000000067F000080000007000C0000055207__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008B9669EDB0", +"000000067F000080000007000C000004C000-000000067F000080000007000C0000050000__0000008C71903720", +"000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008B9669EDB0", +"000000067F000080000007000C0000050000-000000067F000080000007000C0000054000__0000008C71903720", +"000000067F000080000007000C0000053C23-000000067F000080000007000C00000A6F76__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008B9669EDB0", +"000000067F000080000007000C0000054000-000000067F000080000007000C0000058000__0000008C71903720", +"000000067F000080000007000C0000055207-000000067F000080000007000C000005E912__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008B9669EDB0", +"000000067F000080000007000C0000058000-000000067F000080000007000C000005C000__0000008C71903720", +"000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008B9669EDB0", +"000000067F000080000007000C000005C000-000000067F000080000007000C0000060000__0000008C71903720", +"000000067F000080000007000C000005E912-000000067F000080000007000C000006802C__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008B9669EDB0", +"000000067F000080000007000C0000060000-000000067F000080000007000C0000064000__0000008C71903720", +"000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008B9669EDB0", +"000000067F000080000007000C0000064000-000000067F000080000007000C0000068000__0000008C71903720", +"000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008B9669EDB0", +"000000067F000080000007000C0000068000-000000067F000080000007000C000006C000__0000008C71903720", +"000000067F000080000007000C000006802C-000000067F000080000007000C0000071783__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008B9669EDB0", +"000000067F000080000007000C000006C000-000000067F000080000007000C0000070000__0000008C71903720", +"000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008B9669EDB0", +"000000067F000080000007000C0000070000-000000067F000080000007000C0000074000__0000008C71903720", +"000000067F000080000007000C0000071783-000000067F000080000007000C000007AEE9__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008B9669EDB0", +"000000067F000080000007000C0000074000-000000067F000080000007000C0000078000__0000008C71903720", +"000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008B9669EDB0", +"000000067F000080000007000C0000078000-000000067F000080000007000C000007C000__0000008C71903720", +"000000067F000080000007000C000007AEE9-000000067F000080000007000C000008460B__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008B9669EDB0", +"000000067F000080000007000C000007C000-000000067F000080000007000C0000080000__0000008C71903720", +"000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008B9669EDB0", 
+"000000067F000080000007000C0000080000-000000067F000080000007000C0000084000__0000008C71903720", +"000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008B9669EDB0", +"000000067F000080000007000C0000084000-000000067F000080000007000C0000088000__0000008C71903720", +"000000067F000080000007000C000008460B-000000067F000080000007000C000008DD71__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008B9669EDB0", +"000000067F000080000007000C0000088000-000000067F000080000007000C000008C000__0000008C71903720", +"000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008B9669EDB0", +"000000067F000080000007000C000008C000-000000067F000080000007000C0000090000__0000008C71903720", +"000000067F000080000007000C000008DD71-000000067F000080000007000C00000974D7__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008B9669EDB0", +"000000067F000080000007000C0000090000-000000067F000080000007000C0000094000__0000008C71903720", +"000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008B9669EDB0", +"000000067F000080000007000C0000094000-000000067F000080000007000C0000098000__0000008C71903720", +"000000067F000080000007000C00000974D7-000000067F000080000007000C00000A0C0B__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008B9669EDB0", +"000000067F000080000007000C0000098000-000000067F000080000007000C000009C000__0000008C71903720", +"000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008B9669EDB0", +"000000067F000080000007000C000009C000-000000067F000080000007000C00000A0000__0000008C71903720", +"000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008B9669EDB0", +"000000067F000080000007000C00000A0000-000000067F000080000007000C00000A4000__0000008C71903720", +"000000067F000080000007000C00000A0C0B-000000067F000080000007000C00000AA371__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008B9669EDB0", +"000000067F000080000007000C00000A4000-000000067F000080000007000C00000A8000__0000008C71903720", +"000000067F000080000007000C00000A6F77-000000067F000080000007000C00000FA170__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008B9669EDB0", +"000000067F000080000007000C00000A8000-000000067F000080000007000C00000AC000__0000008C71903720", +"000000067F000080000007000C00000AA371-000000067F000080000007000C0100000000__0000008A56BBF739-0000008AF67FEC19", +"000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008B9669EDB0", +"000000067F000080000007000C00000AC000-000000067F000080000007000C00000B0000__0000008C71903720", +"000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008B9669EDB0", +"000000067F000080000007000C00000B0000-000000067F000080000007000C00000B4000__0000008C71903720", +"000000067F000080000007000C00000B2B06-000000067F000080000007000C00000BC211__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008B9669EDB0", +"000000067F000080000007000C00000B4000-000000067F000080000007000C00000B8000__0000008C71903720", +"000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008B9669EDB0", 
+"000000067F000080000007000C00000B8000-000000067F000080000007000C00000BC000__0000008C71903720", +"000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008B9669EDB0", +"000000067F000080000007000C00000BC000-000000067F000080000007000C00000C0000__0000008C71903720", +"000000067F000080000007000C00000BC211-000000067F000080000007000C00000C5941__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000BF157-000000067F000080000007001400000016B2__0000008BA6803FC9-0000008C2045B721", +"000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008B9669EDB0", +"000000067F000080000007000C00000C0000-000000067F000080000007000C00000C4000__0000008C71903720", +"000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008B9669EDB0", +"000000067F000080000007000C00000C4000-000000067F000080000007000C00000C8000__0000008C71903720", +"000000067F000080000007000C00000C5941-000000067F000080000007000C00000CF0A7__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008B9669EDB0", +"000000067F000080000007000C00000C8000-000000067F000080000007000C00000CC000__0000008C71903720", +"000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008B9669EDB0", +"000000067F000080000007000C00000CC000-000000067F000080000007000C00000D0000__0000008C71903720", +"000000067F000080000007000C00000CF0A7-000000067F000080000007000C00000D87BC__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008B9669EDB0", +"000000067F000080000007000C00000D0000-000000067F000080000007000C00000D4000__0000008C71903720", +"000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008B9669EDB0", +"000000067F000080000007000C00000D4000-000000067F000080000007000C00000D8000__0000008C71903720", +"000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008B9669EDB0", +"000000067F000080000007000C00000D8000-000000067F000080000007000C00000DC000__0000008C71903720", +"000000067F000080000007000C00000D87BC-000000067F000080000007000C00000E1F0A__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008B9669EDB0", +"000000067F000080000007000C00000DC000-000000067F000080000007000C00000E0000__0000008C71903720", +"000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008B9669EDB0", +"000000067F000080000007000C00000E0000-000000067F000080000007000C00000E4000__0000008C71903720", +"000000067F000080000007000C00000E1F0A-000000067F000080000007000C00000EB670__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008B9669EDB0", +"000000067F000080000007000C00000E4000-000000067F000080000007000C00000E8000__0000008C71903720", +"000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008B9669EDB0", +"000000067F000080000007000C00000E8000-000000067F000080000007000C00000EC000__0000008C71903720", +"000000067F000080000007000C00000EB670-000000067F000080000007000C00000F4DA7__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008B9669EDB0", +"000000067F000080000007000C00000EC000-000000067F000080000007000C00000F0000__0000008C71903720", +"000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008B9669EDB0", 
+"000000067F000080000007000C00000F0000-000000067F000080000007000C00000F4000__0000008C71903720", +"000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008B9669EDB0", +"000000067F000080000007000C00000F4000-000000067F000080000007000C00000F8000__0000008C71903720", +"000000067F000080000007000C00000F4DA7-000000067F000080000007000C00000FE509__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008B9669EDB0", +"000000067F000080000007000C00000F8000-000000067F000080000007000C00000FC000__0000008C71903720", +"000000067F000080000007000C00000FA175-000000067F00008000000700140000010412__0000008C2045B721-0000008C72843D41", +"000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008B9669EDB0", +"000000067F000080000007000C00000FC000-000000067F000080000007000C0000100000__0000008C71903720", +"000000067F000080000007000C00000FE509-000000067F000080000007000C0000107C2B__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008B9669EDB0", +"000000067F000080000007000C0000100000-000000067F000080000007000C0000104000__0000008C71903720", +"000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008B9669EDB0", +"000000067F000080000007000C0000104000-000000067F000080000007000C0000108000__0000008C71903720", +"000000067F000080000007000C0000107C2B-000000067F000080000007000C0000111385__0000008AF67FEC19-0000008BA6803FC9", +"000000067F000080000007000C0000108000-000000067F000080000007000C000010C000__0000008C71903720", +"000000067F000080000007000C0000108000-030000000000000000000000000000000002__0000008B9669EDB0", +"000000067F000080000007000C000010C000-000000067F000080000007000C0000110000__0000008C71903720", +"000000067F000080000007000C0000110000-000000067F00008000000700120100000000__0000008C71903720", +"000000067F000080000007000C0000111385-01000000000000000100000003000000001E__0000008AF67FEC19-0000008BA6803FC9", +"000000067F00008000000700140000000000-000000067F00008000000700140000004000__0000008C71903720", +"000000067F000080000007001400000016B2-000000067F000080000007001400000082A6__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000004000-000000067F00008000000700140000008000__0000008C71903720", +"000000067F00008000000700140000008000-000000067F0000800000070014000000C000__0000008C71903720", +"000000067F000080000007001400000082A6-000000067F0000800000070014000000EED0__0000008BA6803FC9-0000008C2045B721", +"000000067F0000800000070014000000C000-000000067F00008000000700140000010000__0000008C71903720", +"000000067F0000800000070014000000EED0-000000067F00008000000700140000015ADC__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000010000-000000067F00008000000700140000014000__0000008C71903720", +"000000067F0000800000070014000001041E-000000067F000080000007001400000294B8__0000008C2045B721-0000008C72843D41", +"000000067F00008000000700140000014000-000000067F00008000000700140000018000__0000008C71903720", +"000000067F00008000000700140000015ADC-000000067F0000800000070014000001C6D6__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000018000-000000067F0000800000070014000001C000__0000008C71903720", +"000000067F0000800000070014000001C000-000000067F00008000000700140000020000__0000008C71903720", +"000000067F0000800000070014000001C6D6-000000067F000080000007001400000232FD__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000020000-000000067F00008000000700140000024000__0000008C71903720", 
+"000000067F000080000007001400000232FD-000000067F00008000000700140000029F07__0000008BA6803FC9-0000008C2045B721", +"000000067F00008000000700140000024000-000000067F00008000000700140000028000__0000008C71903720", +"000000067F00008000000700140000028000-000000067F0000800000070014000002C000__0000008C71903720", +"000000067F000080000007001400000294BA-030000000000000000000000000000000002__0000008C2045B721-0000008C72843D41", +"000000067F00008000000700140000029F07-030000000000000000000000000000000002__0000008BA6803FC9-0000008C2045B721", +"000000067F0000800000070014000002C000-030000000000000000000000000000000002__0000008C71903720", +"000000067F000080000007200C0000000000-000000067F000080000007200C0000004000__0000008E43487FF0", +"000000067F000080000007200C0000004000-000000067F000080000007200C0000008000__0000008E43487FF0", +"000000067F000080000007200C0000008000-000000067F000080000007200C000000C000__0000008E43487FF0", +"000000067F000080000007200C000000933D-000000067F000080000007200C0000012AA3__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C000000C000-000000067F000080000007200C0000010000__0000008E43487FF0", +"000000067F000080000007200C0000010000-000000067F000080000007200C0000014000__0000008E43487FF0", +"000000067F000080000007200C0000012AA3-000000067F000080000007200C000001C209__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000014000-000000067F000080000007200C0000018000__0000008E43487FF0", +"000000067F000080000007200C0000018000-000000067F000080000007200C000001C000__0000008E43487FF0", +"000000067F000080000007200C000001C000-000000067F000080000007200C0000020000__0000008E43487FF0", +"000000067F000080000007200C000001C209-000000067F000080000007200C0000025939__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000020000-000000067F000080000007200C0000024000__0000008E43487FF0", +"000000067F000080000007200C0000024000-000000067F000080000007200C0000028000__0000008E43487FF0", +"000000067F000080000007200C0000025939-000000067F000080000007200C000002F09F__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000028000-000000067F000080000007200C000002C000__0000008E43487FF0", +"000000067F000080000007200C000002C000-000000067F000080000007200C0000030000__0000008E43487FF0", +"000000067F000080000007200C000002F09F-000000067F000080000007200C00000387B4__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000030000-000000067F000080000007200C0000034000__0000008E43487FF0", +"000000067F000080000007200C0000034000-000000067F000080000007200C0000038000__0000008E43487FF0", +"000000067F000080000007200C0000038000-000000067F000080000007200C000003C000__0000008E43487FF0", +"000000067F000080000007200C00000387B4-000000067F000080000007200C0000041F1A__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C000003C000-000000067F000080000007200C0000040000__0000008E43487FF0", +"000000067F000080000007200C0000040000-000000067F000080000007200C0000044000__0000008E43487FF0", +"000000067F000080000007200C0000041F1A-000000067F000080000007200C000004B680__0000008C72843D41-0000008CF2BFFC89", +"000000067F000080000007200C0000044000-000000067F000080000007200C0000048000__0000008E43487FF0", +"000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008E3CDF59C0", +"000000067F000080000007200C0000048000-000000067F000080000007200C000004C000__0000008F10EA21C8", +"000000067F000080000007200C000004B680-030000000000000000000000000000000002__0000008C72843D41-0000008CF2BFFC89", 
+"000000067F000080000007200C000004BACE-000000067F000080000007200C0000055202__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008E3CDF59C0", +"000000067F000080000007200C000004C000-000000067F000080000007200C0000050000__0000008F10EA21C8", +"000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008E3CDF59C0", +"000000067F000080000007200C0000050000-000000067F000080000007200C0000054000__0000008F10EA21C8", +"000000067F000080000007200C000005131D-000000067F000080000007200C00000A2138__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008E3CDF59C0", +"000000067F000080000007200C0000054000-000000067F000080000007200C0000058000__0000008F10EA21C8", +"000000067F000080000007200C0000055202-000000067F000080000007200C000005E90D__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008E3CDF59C0", +"000000067F000080000007200C0000058000-000000067F000080000007200C000005C000__0000008F10EA21C8", +"000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008E3CDF59C0", +"000000067F000080000007200C000005C000-000000067F000080000007200C0000060000__0000008F10EA21C8", +"000000067F000080000007200C000005E90D-000000067F000080000007200C000006802B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008E3CDF59C0", +"000000067F000080000007200C0000060000-000000067F000080000007200C0000064000__0000008F10EA21C8", +"000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008E3CDF59C0", +"000000067F000080000007200C0000064000-000000067F000080000007200C0000068000__0000008F10EA21C8", +"000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008E3CDF59C0", +"000000067F000080000007200C0000068000-000000067F000080000007200C000006C000__0000008F10EA21C8", +"000000067F000080000007200C000006802B-000000067F000080000007200C0000071782__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008E3CDF59C0", +"000000067F000080000007200C000006C000-000000067F000080000007200C0000070000__0000008F10EA21C8", +"000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008E3CDF59C0", +"000000067F000080000007200C0000070000-000000067F000080000007200C0000074000__0000008F10EA21C8", +"000000067F000080000007200C0000071782-000000067F000080000007200C000007AEE8__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008E3CDF59C0", +"000000067F000080000007200C0000074000-000000067F000080000007200C0000078000__0000008F10EA21C8", +"000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008E3CDF59C0", +"000000067F000080000007200C0000078000-000000067F000080000007200C000007C000__0000008F10EA21C8", +"000000067F000080000007200C000007AEE8-000000067F000080000007200C000008460B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008E3CDF59C0", +"000000067F000080000007200C000007C000-000000067F000080000007200C0000080000__0000008F10EA21C8", +"000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008E3CDF59C0", +"000000067F000080000007200C0000080000-000000067F000080000007200C0000084000__0000008F10EA21C8", 
+"000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008E3CDF59C0", +"000000067F000080000007200C0000084000-000000067F000080000007200C0000088000__0000008F10EA21C8", +"000000067F000080000007200C000008460B-000000067F000080000007200C000008DD71__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008E3CDF59C0", +"000000067F000080000007200C0000088000-000000067F000080000007200C000008C000__0000008F10EA21C8", +"000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008E3CDF59C0", +"000000067F000080000007200C000008C000-000000067F000080000007200C0000090000__0000008F10EA21C8", +"000000067F000080000007200C000008DD71-000000067F000080000007200C00000974D7__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008E3CDF59C0", +"000000067F000080000007200C0000090000-000000067F000080000007200C0000094000__0000008F10EA21C8", +"000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008E3CDF59C0", +"000000067F000080000007200C0000094000-000000067F000080000007200C0000098000__0000008F10EA21C8", +"000000067F000080000007200C00000974D7-000000067F000080000007200C00000A0C0B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008E3CDF59C0", +"000000067F000080000007200C0000098000-000000067F000080000007200C000009C000__0000008F10EA21C8", +"000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008E3CDF59C0", +"000000067F000080000007200C000009C000-000000067F000080000007200C00000A0000__0000008F10EA21C8", +"000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008E3CDF59C0", +"000000067F000080000007200C00000A0000-000000067F000080000007200C00000A4000__0000008F10EA21C8", +"000000067F000080000007200C00000A0C0B-000000067F000080000007200C00000AA371__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000A2138-000000067F000080000007200C00000F342E__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008E3CDF59C0", +"000000067F000080000007200C00000A4000-000000067F000080000007200C00000A8000__0000008F10EA21C8", +"000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008E3CDF59C0", +"000000067F000080000007200C00000A8000-000000067F000080000007200C00000AC000__0000008F10EA21C8", +"000000067F000080000007200C00000AA371-000000067F000080000007200C00000B3AD7__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008E3CDF59C0", +"000000067F000080000007200C00000AC000-000000067F000080000007200C00000B0000__0000008F10EA21C8", +"000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008E3CDF59C0", +"000000067F000080000007200C00000B0000-000000067F000080000007200C00000B4000__0000008F10EA21C8", +"000000067F000080000007200C00000B3AD7-000000067F000080000007200C00000BD20B__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008E3CDF59C0", +"000000067F000080000007200C00000B4000-000000067F000080000007200C00000B8000__0000008F10EA21C8", +"000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008E3CDF59C0", +"000000067F000080000007200C00000B8000-000000067F000080000007200C00000BC000__0000008F10EA21C8", 
+"000000067F000080000007200C00000BA086-000000067F00008000000720140000001101__0000008E42A19FD1-0000008EBC4827C1", +"000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008E3CDF59C0", +"000000067F000080000007200C00000BC000-000000067F000080000007200C00000C0000__0000008F10EA21C8", +"000000067F000080000007200C00000BD20B-000000067F000080000007200C0100000000__0000008CF2BFFC89-0000008DB277FA49", +"000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008E3CDF59C0", +"000000067F000080000007200C00000C0000-000000067F000080000007200C00000C4000__0000008F10EA21C8", +"000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008E3CDF59C0", +"000000067F000080000007200C00000C4000-000000067F000080000007200C00000C8000__0000008F10EA21C8", +"000000067F000080000007200C00000C58B0-000000067F000080000007200C00000CF00A__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008E3CDF59C0", +"000000067F000080000007200C00000C8000-000000067F000080000007200C00000CC000__0000008F10EA21C8", +"000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008E3CDF59C0", +"000000067F000080000007200C00000CC000-000000067F000080000007200C00000D0000__0000008F10EA21C8", +"000000067F000080000007200C00000CF00A-000000067F000080000007200C00000D871F__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008E3CDF59C0", +"000000067F000080000007200C00000D0000-000000067F000080000007200C00000D4000__0000008F10EA21C8", +"000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008E3CDF59C0", +"000000067F000080000007200C00000D4000-000000067F000080000007200C00000D8000__0000008F10EA21C8", +"000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008E3CDF59C0", +"000000067F000080000007200C00000D8000-000000067F000080000007200C00000DC000__0000008F10EA21C8", +"000000067F000080000007200C00000D871F-000000067F000080000007200C00000E1E85__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008E3CDF59C0", +"000000067F000080000007200C00000DC000-000000067F000080000007200C00000E0000__0000008F10EA21C8", +"000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008E3CDF59C0", +"000000067F000080000007200C00000E0000-000000067F000080000007200C00000E4000__0000008F10EA21C8", +"000000067F000080000007200C00000E1E85-000000067F000080000007200C00000EB5EB__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008E3CDF59C0", +"000000067F000080000007200C00000E4000-000000067F000080000007200C00000E8000__0000008F10EA21C8", +"000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008E3CDF59C0", +"000000067F000080000007200C00000E8000-000000067F000080000007200C00000EC000__0000008F10EA21C8", +"000000067F000080000007200C00000EB5EB-000000067F000080000007200C00000F4D0C__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008E3CDF59C0", +"000000067F000080000007200C00000EC000-000000067F000080000007200C00000F0000__0000008F10EA21C8", +"000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008E3CDF59C0", +"000000067F000080000007200C00000F0000-000000067F000080000007200C00000F4000__0000008F10EA21C8", 
+"000000067F000080000007200C00000F342F-000000067F0000800000072014000000D54C__0000008EBC4827C1-0000008F10E3E189", +"000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008E3CDF59C0", +"000000067F000080000007200C00000F4000-000000067F000080000007200C00000F8000__0000008F10EA21C8", +"000000067F000080000007200C00000F4D0C-000000067F000080000007200C00000FE472__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008E3CDF59C0", +"000000067F000080000007200C00000F8000-000000067F000080000007200C00000FC000__0000008F10EA21C8", +"000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008E3CDF59C0", +"000000067F000080000007200C00000FC000-000000067F000080000007200C0000100000__0000008F10EA21C8", +"000000067F000080000007200C00000FE472-000000067F000080000007200C0000107B8E__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008E3CDF59C0", +"000000067F000080000007200C0000100000-000000067F000080000007200C0000104000__0000008F10EA21C8", +"000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008E3CDF59C0", +"000000067F000080000007200C0000104000-000000067F000080000007200C0000108000__0000008F10EA21C8", +"000000067F000080000007200C0000107B8E-000000067F000080000007200C00001112F4__0000008DB277FA49-0000008E42A19FD1", +"000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008E3CDF59C0", +"000000067F000080000007200C0000108000-000000067F000080000007200C000010C000__0000008F10EA21C8", +"000000067F000080000007200C000010C000-000000067F000080000007200C0000110000__0000008F10EA21C8", +"000000067F000080000007200C000010C000-030000000000000000000000000000000002__0000008E3CDF59C0", +"000000067F000080000007200C0000110000-000000067F00008000000720120100000000__0000008F10EA21C8", +"000000067F000080000007200C00001112F4-010000000000000001000000040000000001__0000008DB277FA49-0000008E42A19FD1", +"000000067F00008000000720140000000000-000000067F00008000000720140000004000__0000008F10EA21C8", +"000000067F00008000000720140000001101-000000067F00008000000720140000007E82__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000004000-000000067F00008000000720140000008000__0000008F10EA21C8", +"000000067F00008000000720140000007E82-000000067F0000800000072014000000EB9D__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000008000-000000067F0000800000072014000000C000__0000008F10EA21C8", +"000000067F0000800000072014000000C000-000000067F00008000000720140000010000__0000008F10EA21C8", +"000000067F0000800000072014000000D54D-000000067F00008000000720140000025E6D__0000008EBC4827C1-0000008F10E3E189", +"000000067F0000800000072014000000EB9D-000000067F00008000000720140000015866__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000010000-000000067F00008000000720140000014000__0000008F10EA21C8", +"000000067F00008000000720140000014000-000000067F00008000000720140000018000__0000008F10EA21C8", +"000000067F00008000000720140000015866-000000067F0000800000072014000001C591__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000018000-000000067F0000800000072014000001C000__0000008F10EA21C8", +"000000067F0000800000072014000001C000-000000067F00008000000720140000020000__0000008F10EA21C8", +"000000067F0000800000072014000001C591-000000067F0000800000072014000002326E__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000020000-000000067F00008000000720140000024000__0000008F10EA21C8", 
+"000000067F0000800000072014000002326E-000000067F00008000000720140000029F59__0000008E42A19FD1-0000008EBC4827C1", +"000000067F00008000000720140000024000-000000067F00008000000720140000028000__0000008F10EA21C8", +"000000067F00008000000720140000025E75-030000000000000000000000000000000002__0000008EBC4827C1-0000008F10E3E189", +"000000067F00008000000720140000028000-000000067F0000800000072014000002C000__0000008F10EA21C8", +"000000067F00008000000720140000029F59-030000000000000000000000000000000002__0000008E42A19FD1-0000008EBC4827C1", +"000000067F0000800000072014000002C000-030000000000000000000000000000000002__0000008F10EA21C8", +"000000067F000080000007400C0000000000-000000067F000080000007400C0000004000__00000091A67E3E18", +"000000067F000080000007400C0000004000-000000067F000080000007400C0000008000__00000091A67E3E18", +"000000067F000080000007400C0000008000-000000067F000080000007400C000000C000__00000091A67E3E18", +"000000067F000080000007400C00000090E9-000000067F000080000007400C000001280C__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000000C000-000000067F000080000007400C0000010000__00000091A67E3E18", +"000000067F000080000007400C0000010000-000000067F000080000007400C0000014000__00000091A67E3E18", +"000000067F000080000007400C000001280C-000000067F000080000007400C000001BF72__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000014000-000000067F000080000007400C0000018000__00000091A67E3E18", +"000000067F000080000007400C0000018000-000000067F000080000007400C000001C000__00000091A67E3E18", +"000000067F000080000007400C000001BF72-000000067F000080000007400C00000256D8__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000001C000-000000067F000080000007400C0000020000__00000091A67E3E18", +"000000067F000080000007400C0000020000-000000067F000080000007400C0000024000__00000091A67E3E18", +"000000067F000080000007400C0000024000-000000067F000080000007400C0000028000__00000091A67E3E18", +"000000067F000080000007400C00000256D8-000000067F000080000007400C000002EE0B__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000028000-000000067F000080000007400C000002C000__00000091A67E3E18", +"000000067F000080000007400C000002C000-000000067F000080000007400C0000030000__00000091A67E3E18", +"000000067F000080000007400C000002EE0B-000000067F000080000007400C0000038521__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000030000-000000067F000080000007400C0000034000__00000091A67E3E18", +"000000067F000080000007400C0000034000-000000067F000080000007400C0000038000__00000091A67E3E18", +"000000067F000080000007400C0000038000-000000067F000080000007400C000003C000__00000091A67E3E18", +"000000067F000080000007400C0000038521-000000067F000080000007400C0000041C87__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000003C000-000000067F000080000007400C0000040000__00000091A67E3E18", +"000000067F000080000007400C0000040000-000000067F000080000007400C0000044000__00000091A67E3E18", +"000000067F000080000007400C0000041C87-000000067F000080000007400C000004B3ED__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C0000044000-000000067F000080000007400C0000048000__00000091A67E3E18", +"000000067F000080000007400C0000048000-000000067F000080000007400C000004C000__000000914B20A810", +"000000067F000080000007400C000004B3ED-030000000000000000000000000000000002__0000008F10E3E189-0000008F915DE591", +"000000067F000080000007400C000004BAC9-000000067F000080000007400C00000551FE__0000008F915DE591-000000903121F569", 
+"000000067F000080000007400C000004C000-000000067F000080000007400C0000050000__000000914B20A810", +"000000067F000080000007400C000004DF0B-000000067F000080000007400C000009B41F__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C0000050000-000000067F000080000007400C0000054000__000000914B20A810", +"000000067F000080000007400C0000054000-000000067F000080000007400C0000058000__000000914B20A810", +"000000067F000080000007400C00000551FE-000000067F000080000007400C000005E90C__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000058000-000000067F000080000007400C000005C000__000000914B20A810", +"000000067F000080000007400C000005C000-000000067F000080000007400C0000060000__000000914B20A810", +"000000067F000080000007400C000005E90C-000000067F000080000007400C000006802C__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000060000-000000067F000080000007400C0000064000__000000914B20A810", +"000000067F000080000007400C0000064000-000000067F000080000007400C0000068000__000000914B20A810", +"000000067F000080000007400C0000068000-000000067F000080000007400C000006C000__000000914B20A810", +"000000067F000080000007400C000006802C-000000067F000080000007400C0000071783__0000008F915DE591-000000903121F569", +"000000067F000080000007400C000006C000-000000067F000080000007400C0000070000__000000914B20A810", +"000000067F000080000007400C0000070000-000000067F000080000007400C0000074000__000000914B20A810", +"000000067F000080000007400C0000071783-000000067F000080000007400C000007AEE9__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000074000-000000067F000080000007400C0000078000__000000914B20A810", +"000000067F000080000007400C0000078000-000000067F000080000007400C000007C000__000000914B20A810", +"000000067F000080000007400C000007AEE9-000000067F000080000007400C000008460B__0000008F915DE591-000000903121F569", +"000000067F000080000007400C000007C000-000000067F000080000007400C0000080000__000000914B20A810", +"000000067F000080000007400C0000080000-000000067F000080000007400C0000084000__000000914B20A810", +"000000067F000080000007400C0000084000-000000067F000080000007400C0000088000__000000914B20A810", +"000000067F000080000007400C000008460B-000000067F000080000007400C000008DD71__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000088000-000000067F000080000007400C000008C000__000000914B20A810", +"000000067F000080000007400C000008C000-000000067F000080000007400C0000090000__000000914B20A810", +"000000067F000080000007400C000008DD71-000000067F000080000007400C00000974D7__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000090000-000000067F000080000007400C0000094000__000000914B20A810", +"000000067F000080000007400C0000094000-000000067F000080000007400C0000098000__000000914B20A810", +"000000067F000080000007400C00000974D7-000000067F000080000007400C00000A0C0B__0000008F915DE591-000000903121F569", +"000000067F000080000007400C0000098000-000000067F000080000007400C000009C000__000000914B20A810", +"000000067F000080000007400C000009B420-000000067F000080000007400C00000E830A__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C000009C000-000000067F000080000007400C00000A0000__000000914B20A810", +"000000067F000080000007400C00000A0000-000000067F000080000007400C00000A4000__000000914B20A810", +"000000067F000080000007400C00000A0C0B-000000067F000080000007400C00000AA371__0000008F915DE591-000000903121F569", +"000000067F000080000007400C00000A4000-000000067F000080000007400C00000A8000__000000914B20A810", +"000000067F000080000007400C00000A8000-000000067F000080000007400C00000AC000__00000090DFD64240", 
+"000000067F000080000007400C00000AA371-000000067F000080000007400C0100000000__0000008F915DE591-000000903121F569", +"000000067F000080000007400C00000AA4EC-000000067F000080000007400C00000B3C0C__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000AC000-000000067F000080000007400C00000B0000__00000090DFD64240", +"000000067F000080000007400C00000B0000-000000067F000080000007400C00000B4000__00000090DFD64240", +"000000067F000080000007400C00000B3C0C-000000067F000080000007400C00000BD372__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000B4000-000000067F000080000007400C00000B8000__00000090DFD64240", +"000000067F000080000007400C00000B8000-000000067F000080000007400C00000BC000__00000090DFD64240", +"000000067F000080000007400C00000BC000-000000067F000080000007400C00000C0000__00000090DFD64240", +"000000067F000080000007400C00000BD372-000000067F000080000007400C00000C6AD8__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000C0000-000000067F000080000007400C00000C4000__00000090DFD64240", +"000000067F000080000007400C00000C4000-000000067F000080000007400C00000C8000__00000090DFD64240", +"000000067F000080000007400C00000C6AD8-000000067F000080000007400C00000D020B__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000C8000-000000067F000080000007400C00000CC000__00000090DFD64240", +"000000067F000080000007400C00000CC000-000000067F000080000007400C00000D0000__00000090DFD64240", +"000000067F000080000007400C00000D0000-000000067F000080000007400C00000D4000__00000090DFD64240", +"000000067F000080000007400C00000D020B-000000067F000080000007400C00000D9971__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000D4000-000000067F000080000007400C00000D8000__00000090DFD64240", +"000000067F000080000007400C00000D8000-000000067F000080000007400C00000DC000__00000090DFD64240", +"000000067F000080000007400C00000D9971-000000067F000080000007400C00000E30D7__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000DC000-000000067F000080000007400C00000E0000__00000090DFD64240", +"000000067F000080000007400C00000E0000-000000067F000080000007400C00000E4000__00000090DFD64240", +"000000067F000080000007400C00000E30D7-000000067F000080000007400C00000EC80B__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000E4000-000000067F000080000007400C00000E8000__00000090DFD64240", +"000000067F000080000007400C00000E8000-000000067F000080000007400C00000EC000__00000090DFD64240", +"000000067F000080000007400C00000E8314-000000067F00008000000740140000008178__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007400C00000EC000-000000067F000080000007400C00000F0000__00000090DFD64240", +"000000067F000080000007400C00000EC80B-000000067F000080000007400C00000F5F38__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000F0000-000000067F000080000007400C00000F4000__00000090DFD64240", +"000000067F000080000007400C00000F4000-000000067F000080000007400C00000F8000__00000090DFD64240", +"000000067F000080000007400C00000F5F38-000000067F000080000007400C00000FF69E__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C00000F8000-000000067F000080000007400C00000FC000__00000090DFD64240", +"000000067F000080000007400C00000FC000-000000067F000080000007400C0000100000__00000090DFD64240", +"000000067F000080000007400C00000FCCA8-000000067F000080000007400C00001119BA__00000090D0E5EA29-000000914B2393B1", +"000000067F000080000007400C00000FF69E-000000067F000080000007400C0000108DAF__000000903121F569-00000090D0E5EA29", 
+"000000067F000080000007400C0000100000-000000067F000080000007400C0000104000__00000090DFD64240", +"000000067F000080000007400C0000104000-000000067F000080000007400C0000108000__00000090DFD64240", +"000000067F000080000007400C0000108000-000000067F000080000007400C000010C000__00000090DFD64240", +"000000067F000080000007400C0000108DAF-000000067F000080000007400C0100000000__000000903121F569-00000090D0E5EA29", +"000000067F000080000007400C000010C000-000000067F000080000007400C0000110000__00000090DFD64240", +"000000067F000080000007400C0000110000-030000000000000000000000000000000002__00000090DFD64240", +"000000067F000080000007400C00001119BA-000000067F00008000000740140000004326__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000004326-000000067F0000800000074014000000B7EE__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000008179-000000067F0000800000074014000001D4B7__000000914B2393B1-00000091A6DD7A79", +"000000067F0000800000074014000000B7EE-000000067F00008000000740140000012CCD__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000012CCD-000000067F0000800000074014000001A16B__00000090D0E5EA29-000000914B2393B1", +"000000067F0000800000074014000001A16B-000000067F000080000007401400000215C9__00000090D0E5EA29-000000914B2393B1", +"000000067F0000800000074014000001D4BA-030000000000000000000000000000000002__000000914B2393B1-00000091A6DD7A79", +"000000067F000080000007401400000215C9-000000067F00008000000740140000028A4A__00000090D0E5EA29-000000914B2393B1", +"000000067F00008000000740140000028A4A-030000000000000000000000000000000002__00000090D0E5EA29-000000914B2393B1", +"000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__00000092CA5E4EA8", +"000000067F000080000007600C0000000000-000000067F000080000007600C0000004000__0000009445A06DC8", +"000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__00000092CA5E4EA8", +"000000067F000080000007600C0000004000-000000067F000080000007600C0000008000__0000009445A06DC8", +"000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__00000092CA5E4EA8", +"000000067F000080000007600C0000008000-000000067F000080000007600C000000C000__0000009445A06DC8", +"000000067F000080000007600C0000008180-000000067F000080000007600C00000118E6__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__00000092CA5E4EA8", +"000000067F000080000007600C000000C000-000000067F000080000007600C0000010000__0000009445A06DC8", +"000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__00000092CA5E4EA8", +"000000067F000080000007600C0000010000-000000067F000080000007600C0000014000__0000009445A06DC8", +"000000067F000080000007600C00000118E6-000000067F000080000007600C000001B00A__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__00000092CA5E4EA8", +"000000067F000080000007600C0000014000-000000067F000080000007600C0000018000__0000009445A06DC8", +"000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__00000092CA5E4EA8", +"000000067F000080000007600C0000018000-000000067F000080000007600C000001C000__0000009445A06DC8", +"000000067F000080000007600C000001B00A-000000067F000080000007600C0000024745__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__00000092CA5E4EA8", +"000000067F000080000007600C000001C000-000000067F000080000007600C0000020000__0000009445A06DC8", 
+"000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__00000092CA5E4EA8", +"000000067F000080000007600C0000020000-000000067F000080000007600C0000024000__0000009445A06DC8", +"000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__00000092CA5E4EA8", +"000000067F000080000007600C0000024000-000000067F000080000007600C0000028000__0000009445A06DC8", +"000000067F000080000007600C0000024745-000000067F000080000007600C000002DEAB__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__00000092CA5E4EA8", +"000000067F000080000007600C0000028000-000000067F000080000007600C000002C000__0000009445A06DC8", +"000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__00000092CA5E4EA8", +"000000067F000080000007600C000002C000-000000067F000080000007600C0000030000__0000009445A06DC8", +"000000067F000080000007600C000002DEAB-000000067F000080000007600C00000375CB__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__00000092CA5E4EA8", +"000000067F000080000007600C0000030000-000000067F000080000007600C0000034000__0000009445A06DC8", +"000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__00000092CA5E4EA8", +"000000067F000080000007600C0000034000-000000067F000080000007600C0000038000__0000009445A06DC8", +"000000067F000080000007600C00000375CB-000000067F000080000007600C0000040D0B__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__00000092CA5E4EA8", +"000000067F000080000007600C0000038000-000000067F000080000007600C000003C000__0000009445A06DC8", +"000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__00000092CA5E4EA8", +"000000067F000080000007600C000003C000-000000067F000080000007600C0000040000__0000009445A06DC8", +"000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__00000092CA5E4EA8", +"000000067F000080000007600C0000040000-000000067F000080000007600C0000044000__0000009445A06DC8", +"000000067F000080000007600C0000040D0B-000000067F000080000007600C000004A471__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__00000092CA5E4EA8", +"000000067F000080000007600C0000044000-000000067F000080000007600C0000048000__0000009445A06DC8", +"000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__00000092CA5E4EA8", +"000000067F000080000007600C0000048000-000000067F000080000007600C000004C000__0000009445A06DC8", +"000000067F000080000007600C000004A471-030000000000000000000000000000000002__00000091A6DD7A79-0000009228F7FA79", +"000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__00000092CA5E4EA8", +"000000067F000080000007600C000004C000-000000067F000080000007600C0000050000__0000009445A06DC8", +"000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__00000092CA5E4EA8", +"000000067F000080000007600C0000050000-000000067F000080000007600C0000054000__0000009445A06DC8", +"000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__00000092CA5E4EA8", +"000000067F000080000007600C0000054000-000000067F000080000007600C0000058000__0000009445A06DC8", +"000000067F000080000007600C00000544BA-000000067F000080000007600C000005DC0A__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__00000092CA5E4EA8", 
+"000000067F000080000007600C0000058000-000000067F000080000007600C000005C000__0000009445A06DC8", +"000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__00000092CA5E4EA8", +"000000067F000080000007600C000005C000-000000067F000080000007600C0000060000__0000009445A06DC8", +"000000067F000080000007600C000005DC0A-000000067F000080000007600C000006732B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__00000092CA5E4EA8", +"000000067F000080000007600C0000060000-000000067F000080000007600C0000064000__0000009445A06DC8", +"000000067F000080000007600C0000061031-000000067F000080000007600C00000C1159__0000009402435A49-0000009446B52FD1", +"000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__00000092CA5E4EA8", +"000000067F000080000007600C0000064000-000000067F000080000007600C0000068000__0000009445A06DC8", +"000000067F000080000007600C000006732B-000000067F000080000007600C0000070A91__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__00000092CA5E4EA8", +"000000067F000080000007600C0000068000-000000067F000080000007600C000006C000__0000009445A06DC8", +"000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__00000092CA5E4EA8", +"000000067F000080000007600C000006C000-000000067F000080000007600C0000070000__0000009445A06DC8", +"000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__00000092CA5E4EA8", +"000000067F000080000007600C0000070000-000000067F000080000007600C0000074000__0000009445A06DC8", +"000000067F000080000007600C0000070A91-000000067F000080000007600C000007A1F7__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__00000092CA5E4EA8", +"000000067F000080000007600C0000074000-000000067F000080000007600C0000078000__0000009445A06DC8", +"000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__00000092CA5E4EA8", +"000000067F000080000007600C0000078000-000000067F000080000007600C000007C000__0000009445A06DC8", +"000000067F000080000007600C000007A1F7-000000067F000080000007600C000008390C__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__00000092CA5E4EA8", +"000000067F000080000007600C000007C000-000000067F000080000007600C0000080000__0000009445A06DC8", +"000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__00000092CA5E4EA8", +"000000067F000080000007600C0000080000-000000067F000080000007600C0000084000__0000009445A06DC8", +"000000067F000080000007600C000008390C-000000067F000080000007600C000008D072__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__00000092CA5E4EA8", +"000000067F000080000007600C0000084000-000000067F000080000007600C0000088000__0000009445A06DC8", +"000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__00000092CA5E4EA8", +"000000067F000080000007600C0000088000-000000067F000080000007600C000008C000__0000009445A06DC8", +"000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__00000092CA5E4EA8", +"000000067F000080000007600C000008C000-000000067F000080000007600C0000090000__0000009445A06DC8", +"000000067F000080000007600C000008C52F-000000067F000080000007600C000010B57A__00000093786F8001-0000009402435A49", +"000000067F000080000007600C000008D072-000000067F000080000007600C000009679A__0000009228F7FA79-00000093786F8001", 
+"000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__00000092CA5E4EA8", +"000000067F000080000007600C0000090000-000000067F000080000007600C0000094000__0000009445A06DC8", +"000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__00000092CA5E4EA8", +"000000067F000080000007600C0000094000-000000067F000080000007600C0000098000__0000009445A06DC8", +"000000067F000080000007600C000009679A-000000067F000080000007600C000009FF00__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__00000092CA5E4EA8", +"000000067F000080000007600C0000098000-000000067F000080000007600C000009C000__0000009445A06DC8", +"000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__00000092CA5E4EA8", +"000000067F000080000007600C000009C000-000000067F000080000007600C00000A0000__0000009445A06DC8", +"000000067F000080000007600C000009FF00-000000067F000080000007600C00000A960B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__00000092CA5E4EA8", +"000000067F000080000007600C00000A0000-000000067F000080000007600C00000A4000__0000009445A06DC8", +"000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__00000092CA5E4EA8", +"000000067F000080000007600C00000A4000-000000067F000080000007600C00000A8000__0000009445A06DC8", +"000000067F000080000007600C00000A8000-000000067F000080000007600C00000AC000__0000009445A06DC8", +"000000067F000080000007600C00000A8000-030000000000000000000000000000000002__00000092CA5E4EA8", +"000000067F000080000007600C00000A960B-000000067F000080000007600C00000B2D55__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000AC000-000000067F000080000007600C00000B0000__0000009445A06DC8", +"000000067F000080000007600C00000B0000-000000067F000080000007600C00000B4000__0000009445A06DC8", +"000000067F000080000007600C00000B2D55-000000067F000080000007600C00000BC4BB__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000B4000-000000067F000080000007600C00000B8000__0000009445A06DC8", +"000000067F000080000007600C00000B8000-000000067F000080000007600C00000BC000__0000009445A06DC8", +"000000067F000080000007600C00000BC000-000000067F000080000007600C00000C0000__0000009445A06DC8", +"000000067F000080000007600C00000BC4BB-000000067F000080000007600C00000C5BEA__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000C0000-000000067F000080000007600C00000C4000__0000009445A06DC8", +"000000067F000080000007600C00000C115D-000000067F0000800000076014000000333A__0000009402435A49-0000009446B52FD1", +"000000067F000080000007600C00000C4000-000000067F000080000007600C00000C8000__0000009445A06DC8", +"000000067F000080000007600C00000C5BEA-000000067F000080000007600C00000CF30B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000C8000-000000067F000080000007600C00000CC000__0000009445A06DC8", +"000000067F000080000007600C00000CC000-000000067F000080000007600C00000D0000__0000009445A06DC8", +"000000067F000080000007600C00000CF30B-000000067F000080000007600C00000D8A2B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000D0000-000000067F000080000007600C00000D4000__0000009445A06DC8", +"000000067F000080000007600C00000D4000-000000067F000080000007600C00000D8000__0000009445A06DC8", +"000000067F000080000007600C00000D8000-000000067F000080000007600C00000DC000__0000009445A06DC8", +"000000067F000080000007600C00000D8A2B-000000067F000080000007600C00000E217C__0000009228F7FA79-00000093786F8001", 
+"000000067F000080000007600C00000DC000-000000067F000080000007600C00000E0000__0000009445A06DC8", +"000000067F000080000007600C00000E0000-000000067F000080000007600C00000E4000__0000009445A06DC8", +"000000067F000080000007600C00000E217C-000000067F000080000007600C00000EB8E2__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000E4000-000000067F000080000007600C00000E8000__0000009445A06DC8", +"000000067F000080000007600C00000E8000-000000067F000080000007600C00000EC000__0000009445A06DC8", +"000000067F000080000007600C00000EB8E2-000000067F000080000007600C00000F500B__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000EC000-000000067F000080000007600C00000F0000__0000009445A06DC8", +"000000067F000080000007600C00000F0000-000000067F000080000007600C00000F4000__0000009445A06DC8", +"000000067F000080000007600C00000F4000-000000067F000080000007600C00000F8000__0000009445A06DC8", +"000000067F000080000007600C00000F500B-000000067F000080000007600C00000FE771__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C00000F8000-000000067F000080000007600C00000FC000__0000009445A06DC8", +"000000067F000080000007600C00000FC000-000000067F000080000007600C0000100000__0000009445A06DC8", +"000000067F000080000007600C00000FE771-000000067F000080000007600C0000107ED7__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000100000-000000067F000080000007600C0000104000__0000009445A06DC8", +"000000067F000080000007600C0000104000-000000067F000080000007600C0000108000__0000009445A06DC8", +"000000067F000080000007600C0000107ED7-000000067F000080000007600C000011160C__0000009228F7FA79-00000093786F8001", +"000000067F000080000007600C0000108000-000000067F000080000007600C000010C000__0000009445A06DC8", +"000000067F000080000007600C000010B57A-000000067F00008000000760140000003D14__00000093786F8001-0000009402435A49", +"000000067F000080000007600C000010C000-000000067F000080000007600C0000110000__0000009445A06DC8", +"000000067F000080000007600C0000110000-000000067F00008000000760120100000000__0000009445A06DC8", +"000000067F000080000007600C000011160C-010000000000000001000000040000000008__0000009228F7FA79-00000093786F8001", +"000000067F00008000000760140000000000-000000067F00008000000760140000004000__0000009445A06DC8", +"000000067F00008000000760140000003354-000000067F00008000000760140000023CAB__0000009402435A49-0000009446B52FD1", +"000000067F00008000000760140000003D14-000000067F0000800000076014000000A251__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000004000-000000067F00008000000760140000008000__0000009445A06DC8", +"000000067F00008000000760140000008000-000000067F0000800000076014000000C000__0000009445A06DC8", +"000000067F0000800000076014000000A251-000000067F000080000007601400000107AC__00000093786F8001-0000009402435A49", +"000000067F0000800000076014000000C000-000000067F00008000000760140000010000__0000009445A06DC8", +"000000067F00008000000760140000010000-000000067F00008000000760140000014000__0000009445A06DC8", +"000000067F000080000007601400000107AC-000000067F00008000000760140000016CC4__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000014000-000000067F00008000000760140000018000__0000009445A06DC8", +"000000067F00008000000760140000016CC4-000000067F0000800000076014000001D272__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000018000-000000067F0000800000076014000001C000__0000009445A06DC8", +"000000067F0000800000076014000001C000-000000067F00008000000760140000020000__0000009445A06DC8", 
+"000000067F0000800000076014000001D272-000000067F000080000007601400000237C3__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000020000-000000067F00008000000760140000024000__0000009445A06DC8", +"000000067F000080000007601400000237C3-000000067F00008000000760140000029CC5__00000093786F8001-0000009402435A49", +"000000067F00008000000760140000023CB3-030000000000000000000000000000000002__0000009402435A49-0000009446B52FD1", +"000000067F00008000000760140000024000-000000067F00008000000760140000028000__0000009445A06DC8", +"000000067F00008000000760140000028000-000000067F0000800000076014000002C000__0000009445A06DC8", +"000000067F00008000000760140000029CC5-030000000000000000000000000000000002__00000093786F8001-0000009402435A49", +"000000067F0000800000076014000002C000-030000000000000000000000000000000002__0000009445A06DC8", +"000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096187D1FC8", +"000000067F000080000007800C0000000000-000000067F000080000007800C0000004000__00000096E85806C0", +"000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096187D1FC8", +"000000067F000080000007800C0000004000-000000067F000080000007800C0000008000__00000096E85806C0", +"000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096187D1FC8", +"000000067F000080000007800C0000008000-000000067F000080000007800C000000C000__00000096E85806C0", +"000000067F000080000007800C000000974C-000000067F000080000007800C0000012EB2__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096187D1FC8", +"000000067F000080000007800C000000C000-000000067F000080000007800C0000010000__00000096E85806C0", +"000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096187D1FC8", +"000000067F000080000007800C0000010000-000000067F000080000007800C0000014000__00000096E85806C0", +"000000067F000080000007800C0000012EB2-000000067F000080000007800C000001C60B__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096187D1FC8", +"000000067F000080000007800C0000014000-000000067F000080000007800C0000018000__00000096E85806C0", +"000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096187D1FC8", +"000000067F000080000007800C0000018000-000000067F000080000007800C000001C000__00000096E85806C0", +"000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096187D1FC8", +"000000067F000080000007800C000001C000-000000067F000080000007800C0000020000__00000096E85806C0", +"000000067F000080000007800C000001C60B-000000067F000080000007800C0000025D39__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096187D1FC8", +"000000067F000080000007800C0000020000-000000067F000080000007800C0000024000__00000096E85806C0", +"000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096187D1FC8", +"000000067F000080000007800C0000024000-000000067F000080000007800C0000028000__00000096E85806C0", +"000000067F000080000007800C0000025D39-000000067F000080000007800C000002F49F__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096187D1FC8", +"000000067F000080000007800C0000028000-000000067F000080000007800C000002C000__00000096E85806C0", +"000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096187D1FC8", 
+"000000067F000080000007800C000002C000-000000067F000080000007800C0000030000__00000096E85806C0", +"000000067F000080000007800C000002F49F-000000067F000080000007800C0000038BB2__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096187D1FC8", +"000000067F000080000007800C0000030000-000000067F000080000007800C0000034000__00000096E85806C0", +"000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096187D1FC8", +"000000067F000080000007800C0000034000-000000067F000080000007800C0000038000__00000096E85806C0", +"000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096187D1FC8", +"000000067F000080000007800C0000038000-000000067F000080000007800C000003C000__00000096E85806C0", +"000000067F000080000007800C0000038BB2-000000067F000080000007800C0000042318__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096187D1FC8", +"000000067F000080000007800C000003C000-000000067F000080000007800C0000040000__00000096E85806C0", +"000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096187D1FC8", +"000000067F000080000007800C0000040000-000000067F000080000007800C0000044000__00000096E85806C0", +"000000067F000080000007800C0000042318-000000067F000080000007800C000004BA7E__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096187D1FC8", +"000000067F000080000007800C0000044000-000000067F000080000007800C0000048000__00000096E85806C0", +"000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096187D1FC8", +"000000067F000080000007800C0000048000-000000067F000080000007800C000004C000__00000096E85806C0", +"000000067F000080000007800C000004BA7E-000000067F000080000007800C00000551B3__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096187D1FC8", +"000000067F000080000007800C000004C000-000000067F000080000007800C0000050000__00000096E85806C0", +"000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096187D1FC8", +"000000067F000080000007800C0000050000-000000067F000080000007800C0000054000__00000096E85806C0", +"000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__0000009614F1FFE8", +"000000067F000080000007800C0000054000-000000067F000080000007800C0000058000__00000096E85806C0", +"000000067F000080000007800C00000551B3-030000000000000000000000000000000002__0000009446B52FD1-00000094D67DF4F9", +"000000067F000080000007800C000005523E-000000067F000080000007800C000005E9A4__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__0000009614F1FFE8", +"000000067F000080000007800C0000058000-000000067F000080000007800C000005C000__00000096E85806C0", +"000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__0000009614F1FFE8", +"000000067F000080000007800C000005C000-000000067F000080000007800C0000060000__00000096E85806C0", +"000000067F000080000007800C000005E9A4-000000067F000080000007800C000006810A__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__0000009614F1FFE8", +"000000067F000080000007800C0000060000-000000067F000080000007800C0000064000__00000096E85806C0", +"000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__0000009614F1FFE8", 
+"000000067F000080000007800C0000064000-000000067F000080000007800C0000068000__00000096E85806C0", +"000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__0000009614F1FFE8", +"000000067F000080000007800C0000068000-000000067F000080000007800C000006C000__00000096E85806C0", +"000000067F000080000007800C000006810A-000000067F000080000007800C0000071870__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__0000009614F1FFE8", +"000000067F000080000007800C000006C000-000000067F000080000007800C0000070000__00000096E85806C0", +"000000067F000080000007800C000006D446-000000067F000080000007800C00000D9B82__00000096AEF27399-00000096E85829C9", +"000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__0000009614F1FFE8", +"000000067F000080000007800C0000070000-000000067F000080000007800C0000074000__00000096E85806C0", +"000000067F000080000007800C0000071870-000000067F000080000007800C000007AFD6__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__0000009614F1FFE8", +"000000067F000080000007800C0000074000-000000067F000080000007800C0000078000__00000096E85806C0", +"000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__0000009614F1FFE8", +"000000067F000080000007800C0000078000-000000067F000080000007800C000007C000__00000096E85806C0", +"000000067F000080000007800C000007AFD6-000000067F000080000007800C000008470B__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C000007B8DE-000000067F000080000007800C00000F73DA__00000096193A8001-00000096AEF27399", +"000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__0000009614F1FFE8", +"000000067F000080000007800C000007C000-000000067F000080000007800C0000080000__00000096E85806C0", +"000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__0000009614F1FFE8", +"000000067F000080000007800C0000080000-000000067F000080000007800C0000084000__00000096E85806C0", +"000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__0000009614F1FFE8", +"000000067F000080000007800C0000084000-000000067F000080000007800C0000088000__00000096E85806C0", +"000000067F000080000007800C000008470B-000000067F000080000007800C000008DE71__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__0000009614F1FFE8", +"000000067F000080000007800C0000088000-000000067F000080000007800C000008C000__00000096E85806C0", +"000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__0000009614F1FFE8", +"000000067F000080000007800C000008C000-000000067F000080000007800C0000090000__00000096E85806C0", +"000000067F000080000007800C000008DE71-000000067F000080000007800C0000097591__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__0000009614F1FFE8", +"000000067F000080000007800C0000090000-000000067F000080000007800C0000094000__00000096E85806C0", +"000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__0000009614F1FFE8", +"000000067F000080000007800C0000094000-000000067F000080000007800C0000098000__00000096E85806C0", +"000000067F000080000007800C0000097591-000000067F000080000007800C00000A0CF7__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__0000009614F1FFE8", +"000000067F000080000007800C0000098000-000000067F000080000007800C000009C000__00000096E85806C0", 
+"000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__0000009614F1FFE8", +"000000067F000080000007800C000009C000-000000067F000080000007800C00000A0000__00000096E85806C0", +"000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__0000009614F1FFE8", +"000000067F000080000007800C00000A0000-000000067F000080000007800C00000A4000__00000096E85806C0", +"000000067F000080000007800C00000A0CF7-000000067F000080000007800C00000AA40B__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__0000009614F1FFE8", +"000000067F000080000007800C00000A4000-000000067F000080000007800C00000A8000__00000096E85806C0", +"000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__0000009614F1FFE8", +"000000067F000080000007800C00000A8000-000000067F000080000007800C00000AC000__00000096E85806C0", +"000000067F000080000007800C00000AA40B-000000067F000080000007800C00000B3B4D__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__0000009614F1FFE8", +"000000067F000080000007800C00000AC000-000000067F000080000007800C00000B0000__00000096E85806C0", +"000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__0000009614F1FFE8", +"000000067F000080000007800C00000B0000-000000067F000080000007800C00000B4000__00000096E85806C0", +"000000067F000080000007800C00000B3B4D-000000067F000080000007800C00000BD2B3__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__0000009614F1FFE8", +"000000067F000080000007800C00000B4000-000000067F000080000007800C00000B8000__00000096E85806C0", +"000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__0000009614F1FFE8", +"000000067F000080000007800C00000B8000-000000067F000080000007800C00000BC000__00000096E85806C0", +"000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__0000009614F1FFE8", +"000000067F000080000007800C00000BC000-000000067F000080000007800C00000C0000__00000096E85806C0", +"000000067F000080000007800C00000BD2B3-000000067F000080000007800C00000C69DA__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__0000009614F1FFE8", +"000000067F000080000007800C00000C0000-000000067F000080000007800C00000C4000__00000096E85806C0", +"000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__0000009614F1FFE8", +"000000067F000080000007800C00000C4000-000000067F000080000007800C00000C8000__00000096E85806C0", +"000000067F000080000007800C00000C69DA-000000067F000080000007800C0100000000__00000094D67DF4F9-000000959635F2A9", +"000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__0000009614F1FFE8", +"000000067F000080000007800C00000C8000-000000067F000080000007800C00000CC000__00000096E85806C0", +"000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__0000009614F1FFE8", +"000000067F000080000007800C00000CC000-000000067F000080000007800C00000D0000__00000096E85806C0", +"000000067F000080000007800C00000CD6B6-000000067F000080000007800C00000D6C18__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__0000009614F1FFE8", +"000000067F000080000007800C00000D0000-000000067F000080000007800C00000D4000__00000096E85806C0", +"000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__0000009614F1FFE8", 
+"000000067F000080000007800C00000D4000-000000067F000080000007800C00000D8000__00000096E85806C0", +"000000067F000080000007800C00000D6C18-000000067F000080000007800C00000E0179__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__0000009614F1FFE8", +"000000067F000080000007800C00000D8000-000000067F000080000007800C00000DC000__00000096E85806C0", +"000000067F000080000007800C00000D9BA3-000000067F00008000000780140000013481__00000096AEF27399-00000096E85829C9", +"000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__0000009614F1FFE8", +"000000067F000080000007800C00000DC000-000000067F000080000007800C00000E0000__00000096E85806C0", +"000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__0000009614F1FFE8", +"000000067F000080000007800C00000E0000-000000067F000080000007800C00000E4000__00000096E85806C0", +"000000067F000080000007800C00000E0179-000000067F000080000007800C00000E96DC__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__0000009614F1FFE8", +"000000067F000080000007800C00000E4000-000000067F000080000007800C00000E8000__00000096E85806C0", +"000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__0000009614F1FFE8", +"000000067F000080000007800C00000E8000-000000067F000080000007800C00000EC000__00000096E85806C0", +"000000067F000080000007800C00000E96DC-000000067F000080000007800C00000F2C3E__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__0000009614F1FFE8", +"000000067F000080000007800C00000EC000-000000067F000080000007800C00000F0000__00000096E85806C0", +"000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__0000009614F1FFE8", +"000000067F000080000007800C00000F0000-000000067F000080000007800C00000F4000__00000096E85806C0", +"000000067F000080000007800C00000F2C3E-000000067F000080000007800C00000FC1A0__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__0000009614F1FFE8", +"000000067F000080000007800C00000F4000-000000067F000080000007800C00000F8000__00000096E85806C0", +"000000067F000080000007800C00000F73E3-000000067F00008000000780140000003F18__00000096193A8001-00000096AEF27399", +"000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__0000009614F1FFE8", +"000000067F000080000007800C00000F8000-000000067F000080000007800C00000FC000__00000096E85806C0", +"000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__0000009614F1FFE8", +"000000067F000080000007800C00000FC000-000000067F000080000007800C0000100000__00000096E85806C0", +"000000067F000080000007800C00000FC1A0-000000067F000080000007800C00001057C1__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__0000009614F1FFE8", +"000000067F000080000007800C0000100000-000000067F000080000007800C0000104000__00000096E85806C0", +"000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__0000009614F1FFE8", +"000000067F000080000007800C0000104000-000000067F000080000007800C0000108000__00000096E85806C0", +"000000067F000080000007800C00001057C1-000000067F000080000007800C000010EF0B__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__0000009614F1FFE8", +"000000067F000080000007800C0000108000-000000067F000080000007800C000010C000__00000096E85806C0", 
+"000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__0000009614F1FFE8", +"000000067F000080000007800C000010C000-000000067F000080000007800C0000110000__00000096E85806C0", +"000000067F000080000007800C000010EF0B-01000000000000000100000004000000000B__000000959635F2A9-00000096193A8001", +"000000067F000080000007800C0000110000-000000067F00008000000780120100000000__00000096E85806C0", +"000000067F000080000007800C0000110000-030000000000000000000000000000000002__0000009614F1FFE8", +"000000067F00008000000780140000000000-000000067F00008000000780140000004000__00000096E85806C0", +"000000067F00008000000780140000003F18-000000067F00008000000780140000009ED4__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000004000-000000067F00008000000780140000008000__00000096E85806C0", +"000000067F00008000000780140000008000-000000067F0000800000078014000000C000__00000096E85806C0", +"000000067F00008000000780140000009ED4-000000067F0000800000078014000000FE9A__00000096193A8001-00000096AEF27399", +"000000067F0000800000078014000000C000-000000067F00008000000780140000010000__00000096E85806C0", +"000000067F0000800000078014000000FE9A-000000067F00008000000780140000015DD1__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000010000-000000067F00008000000780140000014000__00000096E85806C0", +"000000067F00008000000780140000013481-030000000000000000000000000000000002__00000096AEF27399-00000096E85829C9", +"000000067F00008000000780140000014000-000000067F00008000000780140000018000__00000096E85806C0", +"000000067F00008000000780140000015DD1-000000067F0000800000078014000001BD7E__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000018000-000000067F0000800000078014000001C000__00000096E85806C0", +"000000067F0000800000078014000001BD7E-000000067F00008000000780140000021CF0__00000096193A8001-00000096AEF27399", +"000000067F0000800000078014000001C000-000000067F00008000000780140000020000__00000096E85806C0", +"000000067F00008000000780140000020000-000000067F00008000000780140000024000__00000096E85806C0", +"000000067F00008000000780140000021CF0-000000067F00008000000780140000027CF8__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000024000-000000067F00008000000780140000028000__00000096E85806C0", +"000000067F00008000000780140000027CF8-000000067F0000800000078014000002DC88__00000096193A8001-00000096AEF27399", +"000000067F00008000000780140000028000-000000067F0000800000078014000002C000__00000096E85806C0", +"000000067F0000800000078014000002C000-030000000000000000000000000000000002__00000096E85806C0", +"000000067F0000800000078014000002DC88-030000000000000000000000000000000002__00000096193A8001-00000096AEF27399", +"000000067F000080000007A00C0000000000-000000067F000080000007A00C0000004000__0000009921F3B4A8", +"000000067F000080000007A00C0000004000-000000067F000080000007A00C0000008000__0000009921F3B4A8", +"000000067F000080000007A00C0000008000-000000067F000080000007A00C000000C000__0000009921F3B4A8", +"000000067F000080000007A00C000000974B-000000067F000080000007A00C0000012EB1__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000000C000-000000067F000080000007A00C0000010000__0000009921F3B4A8", +"000000067F000080000007A00C0000010000-000000067F000080000007A00C0000014000__0000009921F3B4A8", +"000000067F000080000007A00C0000012EB1-000000067F000080000007A00C000001C60B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000014000-000000067F000080000007A00C0000018000__0000009921F3B4A8", 
+"000000067F000080000007A00C0000018000-000000067F000080000007A00C000001C000__0000009921F3B4A8", +"000000067F000080000007A00C000001C000-000000067F000080000007A00C0000020000__0000009921F3B4A8", +"000000067F000080000007A00C000001C60B-000000067F000080000007A00C0000025D39__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000020000-000000067F000080000007A00C0000024000__0000009921F3B4A8", +"000000067F000080000007A00C0000024000-000000067F000080000007A00C0000028000__0000009921F3B4A8", +"000000067F000080000007A00C0000025D39-000000067F000080000007A00C000002F49F__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000028000-000000067F000080000007A00C000002C000__0000009921F3B4A8", +"000000067F000080000007A00C000002C000-000000067F000080000007A00C0000030000__0000009921F3B4A8", +"000000067F000080000007A00C000002F49F-000000067F000080000007A00C0000038BB2__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000030000-000000067F000080000007A00C0000034000__0000009921F3B4A8", +"000000067F000080000007A00C0000034000-000000067F000080000007A00C0000038000__0000009921F3B4A8", +"000000067F000080000007A00C0000038000-000000067F000080000007A00C000003C000__0000009921F3B4A8", +"000000067F000080000007A00C0000038BB2-000000067F000080000007A00C0000042318__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000003C000-000000067F000080000007A00C0000040000__0000009921F3B4A8", +"000000067F000080000007A00C0000040000-000000067F000080000007A00C0000044000__0000009921F3B4A8", +"000000067F000080000007A00C0000042318-000000067F000080000007A00C000004BA7E__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000044000-000000067F000080000007A00C0000048000__0000009921F3B4A8", +"000000067F000080000007A00C0000048000-000000067F000080000007A00C000004C000__0000009921F3B4A8", +"000000067F000080000007A00C000004B9B2-000000067F000080000007A00C0000097B6D__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C000004BA7E-000000067F000080000007A00C00000551B3__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000004C000-000000067F000080000007A00C0000050000__0000009921F3B4A8", +"000000067F000080000007A00C0000050000-000000067F000080000007A00C0000054000__0000009921F3B4A8", +"000000067F000080000007A00C0000054000-000000067F000080000007A00C0000058000__0000009921F3B4A8", +"000000067F000080000007A00C00000551B3-000000067F000080000007A00C000005E90A__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000058000-000000067F000080000007A00C000005C000__0000009921F3B4A8", +"000000067F000080000007A00C000005C000-000000067F000080000007A00C0000060000__0000009921F3B4A8", +"000000067F000080000007A00C000005E90A-000000067F000080000007A00C000006802C__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000060000-000000067F000080000007A00C0000064000__0000009921F3B4A8", +"000000067F000080000007A00C0000064000-000000067F000080000007A00C0000068000__0000009921F3B4A8", +"000000067F000080000007A00C0000068000-000000067F000080000007A00C000006C000__0000009921F3B4A8", +"000000067F000080000007A00C000006802C-000000067F000080000007A00C0000071783__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000006C000-000000067F000080000007A00C0000070000__0000009921F3B4A8", +"000000067F000080000007A00C0000070000-000000067F000080000007A00C0000074000__0000009921F3B4A8", +"000000067F000080000007A00C0000071783-000000067F000080000007A00C000007AEE8__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000074000-000000067F000080000007A00C0000078000__0000009921F3B4A8", 
+"000000067F000080000007A00C0000078000-000000067F000080000007A00C000007C000__0000009921F3B4A8", +"000000067F000080000007A00C000007AEE8-000000067F000080000007A00C000008460B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000007C000-000000067F000080000007A00C0000080000__0000009921F3B4A8", +"000000067F000080000007A00C0000080000-000000067F000080000007A00C0000084000__0000009921F3B4A8", +"000000067F000080000007A00C0000084000-000000067F000080000007A00C0000088000__0000009921F3B4A8", +"000000067F000080000007A00C000008460B-000000067F000080000007A00C000008DD71__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000088000-000000067F000080000007A00C000008C000__0000009921F3B4A8", +"000000067F000080000007A00C000008C000-000000067F000080000007A00C0000090000__0000009921F3B4A8", +"000000067F000080000007A00C000008DD71-000000067F000080000007A00C00000974D7__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000090000-000000067F000080000007A00C0000094000__0000009921F3B4A8", +"000000067F000080000007A00C0000094000-000000067F000080000007A00C0000098000__0000009921F3B4A8", +"000000067F000080000007A00C00000974D7-000000067F000080000007A00C00000A0C0B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000097B7A-000000067F000080000007A00C00000E3627__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C0000098000-000000067F000080000007A00C000009C000__0000009921F3B4A8", +"000000067F000080000007A00C000009C000-000000067F000080000007A00C00000A0000__0000009921F3B4A8", +"000000067F000080000007A00C00000A0000-000000067F000080000007A00C00000A4000__0000009921F3B4A8", +"000000067F000080000007A00C00000A0C0B-000000067F000080000007A00C00000AA371__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000A4000-000000067F000080000007A00C00000A8000__0000009921F3B4A8", +"000000067F000080000007A00C00000A8000-000000067F000080000007A00C00000AC000__0000009921F3B4A8", +"000000067F000080000007A00C00000AA371-000000067F000080000007A00C00000B3AD7__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000AC000-000000067F000080000007A00C00000B0000__0000009921F3B4A8", +"000000067F000080000007A00C00000B0000-000000067F000080000007A00C00000B4000__0000009921F3B4A8", +"000000067F000080000007A00C00000B3AD7-000000067F000080000007A00C00000BD20B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000B4000-000000067F000080000007A00C00000B8000__0000009921F3B4A8", +"000000067F000080000007A00C00000B8000-000000067F000080000007A00C00000BC000__0000009921F3B4A8", +"000000067F000080000007A00C00000BC000-000000067F000080000007A00C00000C0000__0000009921F3B4A8", +"000000067F000080000007A00C00000BD20B-000000067F000080000007A00C00000C6932__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000C0000-000000067F000080000007A00C00000C4000__0000009921F3B4A8", +"000000067F000080000007A00C00000C4000-000000067F000080000007A00C00000C8000__0000009921F3B4A8", +"000000067F000080000007A00C00000C6932-000000067F000080000007A00C00000D0098__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000C8000-000000067F000080000007A00C00000CC000__0000009921F3B4A8", +"000000067F000080000007A00C00000CC000-000000067F000080000007A00C00000D0000__0000009921F3B4A8", +"000000067F000080000007A00C00000D0000-000000067F000080000007A00C00000D4000__0000009921F3B4A8", +"000000067F000080000007A00C00000D0098-000000067F000080000007A00C00000D97FE__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000D4000-000000067F000080000007A00C00000D8000__0000009921F3B4A8", 
+"000000067F000080000007A00C00000D8000-000000067F000080000007A00C00000DC000__0000009921F3B4A8", +"000000067F000080000007A00C00000D97FE-000000067F000080000007A00C00000E2F0B__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000DC000-000000067F000080000007A00C00000E0000__0000009921F3B4A8", +"000000067F000080000007A00C00000E0000-000000067F000080000007A00C00000E4000__0000009921F3B4A8", +"000000067F000080000007A00C00000E2F0B-000000067F000080000007A00C00000EC671__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000E364A-000000067F000080000007A01400000065FE__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A00C00000E4000-000000067F000080000007A00C00000E8000__0000009921F3B4A8", +"000000067F000080000007A00C00000E8000-000000067F000080000007A00C00000EC000__0000009921F3B4A8", +"000000067F000080000007A00C00000EC000-000000067F000080000007A00C00000F0000__0000009921F3B4A8", +"000000067F000080000007A00C00000EC671-000000067F000080000007A00C00000F5D9F__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000F0000-000000067F000080000007A00C00000F4000__0000009921F3B4A8", +"000000067F000080000007A00C00000F4000-000000067F000080000007A00C00000F8000__0000009921F3B4A8", +"000000067F000080000007A00C00000F5D9F-000000067F000080000007A00C00000FF505__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C00000F720F-000000067F000080000007A00C0000111692__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A00C00000F8000-000000067F000080000007A00C00000FC000__0000009921F3B4A8", +"000000067F000080000007A00C00000FC000-000000067F000080000007A00C0000100000__0000009921F3B4A8", +"000000067F000080000007A00C00000FF505-000000067F000080000007A00C0000108C10__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C0000100000-000000067F000080000007A00C0000104000__0000009921F3B4A8", +"000000067F000080000007A00C0000104000-000000067F000080000007A00C0000108000__0000009921F3B4A8", +"000000067F000080000007A00C0000108000-000000067F000080000007A00C000010C000__0000009921F3B4A8", +"000000067F000080000007A00C0000108C10-030000000000000000000000000000000002__00000096E85829C9-00000098A7ADFC91", +"000000067F000080000007A00C000010C000-000000067F000080000007A00C0000110000__0000009921F3B4A8", +"000000067F000080000007A00C0000110000-000000067F000080000007A0120100000000__0000009921F3B4A8", +"000000067F000080000007A00C0000111692-000000067F000080000007A01400000040E7__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000000000-000000067F000080000007A0140000004000__0000009921F3B4A8", +"000000067F000080000007A0140000004000-000000067F000080000007A0140000008000__0000009921F3B4A8", +"000000067F000080000007A01400000040E7-000000067F000080000007A014000000B5F6__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000006601-000000067F000080000007A014000001B4CB__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A0140000008000-000000067F000080000007A014000000C000__0000009921F3B4A8", +"000000067F000080000007A014000000B5F6-000000067F000080000007A0140000012AFC__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000000C000-000000067F000080000007A0140000010000__0000009921F3B4A8", +"000000067F000080000007A0140000010000-000000067F000080000007A0140000014000__0000009921F3B4A8", +"000000067F000080000007A0140000012AFC-000000067F000080000007A0140000019F9B__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000014000-000000067F000080000007A0140000018000__0000009921F3B4A8", 
+"000000067F000080000007A0140000018000-000000067F000080000007A014000001C000__0000009921F3B4A8", +"000000067F000080000007A0140000019F9B-000000067F000080000007A01400000214BE__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000001B4CB-030000000000000000000000000000000002__0000009921E47AA1-000000997F5D23C9", +"000000067F000080000007A014000001C000-000000067F000080000007A0140000020000__0000009921F3B4A8", +"000000067F000080000007A0140000020000-000000067F000080000007A0140000024000__0000009921F3B4A8", +"000000067F000080000007A01400000214BE-000000067F000080000007A01400000289C9__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A0140000024000-000000067F000080000007A0140000028000__0000009921F3B4A8", +"000000067F000080000007A0140000028000-000000067F000080000007A014000002C000__0000009921F3B4A8", +"000000067F000080000007A01400000289C9-030000000000000000000000000000000002__00000098A7ADFC91-0000009921E47AA1", +"000000067F000080000007A014000002C000-030000000000000000000000000000000002__0000009921F3B4A8", +"000000067F000080000007C00C0000000000-000000067F000080000007C00C0000004000__0000009B5229DFE8", +"000000067F000080000007C00C0000004000-000000067F000080000007C00C0000008000__0000009B5229DFE8", +"000000067F000080000007C00C0000007EA5-000000067F000080000007C00C00000115FE__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000008000-000000067F000080000007C00C000000C000__0000009B5229DFE8", +"000000067F000080000007C00C000000C000-000000067F000080000007C00C0000010000__0000009B5229DFE8", +"000000067F000080000007C00C0000010000-000000067F000080000007C00C0000014000__0000009B5229DFE8", +"000000067F000080000007C00C00000115FE-000000067F000080000007C00C000001AD0C__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000014000-000000067F000080000007C00C0000018000__0000009B5229DFE8", +"000000067F000080000007C00C0000018000-000000067F000080000007C00C000001C000__0000009B5229DFE8", +"000000067F000080000007C00C000001AD0C-000000067F000080000007C00C0000024472__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C000001C000-000000067F000080000007C00C0000020000__0000009B5229DFE8", +"000000067F000080000007C00C0000020000-000000067F000080000007C00C0000024000__0000009B5229DFE8", +"000000067F000080000007C00C0000024000-000000067F000080000007C00C0000028000__0000009B5229DFE8", +"000000067F000080000007C00C0000024472-000000067F000080000007C00C000002DBD8__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000028000-000000067F000080000007C00C000002C000__0000009B5229DFE8", +"000000067F000080000007C00C000002C000-000000067F000080000007C00C0000030000__0000009B5229DFE8", +"000000067F000080000007C00C000002DBD8-000000067F000080000007C00C000003732B__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000030000-000000067F000080000007C00C0000034000__0000009B5229DFE8", +"000000067F000080000007C00C0000034000-000000067F000080000007C00C0000038000__0000009B5229DFE8", +"000000067F000080000007C00C000003732B-000000067F000080000007C00C0000040A91__000000997F5D23C9-00000099F1C9FC71", +"000000067F000080000007C00C0000038000-000000067F000080000007C00C000003C000__0000009B5229DFE8", +"000000067F000080000007C00C000003C000-000000067F000080000007C00C0000040000__0000009B5229DFE8", +"000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009B40525F80", +"000000067F000080000007C00C0000040000-000000067F000080000007C00C0000044000__0000009C1E3799F0", +"000000067F000080000007C00C0000040A91-030000000000000000000000000000000002__000000997F5D23C9-00000099F1C9FC71", 
+"000000067F000080000007C00C0000042360-000000067F000080000007C00C000004BAC6__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009B40525F80", +"000000067F000080000007C00C0000044000-000000067F000080000007C00C0000048000__0000009C1E3799F0", +"000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009B40525F80", +"000000067F000080000007C00C0000048000-000000067F000080000007C00C000004C000__0000009C1E3799F0", +"000000067F000080000007C00C000004BAC6-000000067F000080000007C00C00000551FB__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009B40525F80", +"000000067F000080000007C00C000004C000-000000067F000080000007C00C0000050000__0000009C1E3799F0", +"000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009B40525F80", +"000000067F000080000007C00C0000050000-000000067F000080000007C00C0000054000__0000009C1E3799F0", +"000000067F000080000007C00C0000052AA4-000000067F000080000007C00C00000A4244__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009B40525F80", +"000000067F000080000007C00C0000054000-000000067F000080000007C00C0000058000__0000009C1E3799F0", +"000000067F000080000007C00C00000551FB-000000067F000080000007C00C000005E90B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009B40525F80", +"000000067F000080000007C00C0000058000-000000067F000080000007C00C000005C000__0000009C1E3799F0", +"000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009B40525F80", +"000000067F000080000007C00C000005C000-000000067F000080000007C00C0000060000__0000009C1E3799F0", +"000000067F000080000007C00C000005E90B-000000067F000080000007C00C000006802B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009B40525F80", +"000000067F000080000007C00C0000060000-000000067F000080000007C00C0000064000__0000009C1E3799F0", +"000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009B40525F80", +"000000067F000080000007C00C0000064000-000000067F000080000007C00C0000068000__0000009C1E3799F0", +"000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009B40525F80", +"000000067F000080000007C00C0000068000-000000067F000080000007C00C000006C000__0000009C1E3799F0", +"000000067F000080000007C00C000006802B-000000067F000080000007C00C0000071782__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009B40525F80", +"000000067F000080000007C00C000006C000-000000067F000080000007C00C0000070000__0000009C1E3799F0", +"000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009B40525F80", +"000000067F000080000007C00C0000070000-000000067F000080000007C00C0000074000__0000009C1E3799F0", +"000000067F000080000007C00C0000071782-000000067F000080000007C00C000007AEE8__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009B40525F80", +"000000067F000080000007C00C0000074000-000000067F000080000007C00C0000078000__0000009C1E3799F0", +"000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009B40525F80", +"000000067F000080000007C00C0000078000-000000067F000080000007C00C000007C000__0000009C1E3799F0", 
+"000000067F000080000007C00C000007AEE8-000000067F000080000007C00C000008460B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009B40525F80", +"000000067F000080000007C00C000007C000-000000067F000080000007C00C0000080000__0000009C1E3799F0", +"000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009B40525F80", +"000000067F000080000007C00C0000080000-000000067F000080000007C00C0000084000__0000009C1E3799F0", +"000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009B40525F80", +"000000067F000080000007C00C0000084000-000000067F000080000007C00C0000088000__0000009C1E3799F0", +"000000067F000080000007C00C000008460B-000000067F000080000007C00C000008DD71__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009B40525F80", +"000000067F000080000007C00C0000088000-000000067F000080000007C00C000008C000__0000009C1E3799F0", +"000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009B40525F80", +"000000067F000080000007C00C000008C000-000000067F000080000007C00C0000090000__0000009C1E3799F0", +"000000067F000080000007C00C000008DD71-000000067F000080000007C00C00000974D7__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009B40525F80", +"000000067F000080000007C00C0000090000-000000067F000080000007C00C0000094000__0000009C1E3799F0", +"000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009B40525F80", +"000000067F000080000007C00C0000094000-000000067F000080000007C00C0000098000__0000009C1E3799F0", +"000000067F000080000007C00C00000974D7-000000067F000080000007C00C00000A0C0B__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009B40525F80", +"000000067F000080000007C00C0000098000-000000067F000080000007C00C000009C000__0000009C1E3799F0", +"000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009B40525F80", +"000000067F000080000007C00C000009C000-000000067F000080000007C00C00000A0000__0000009C1E3799F0", +"000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009B40525F80", +"000000067F000080000007C00C00000A0000-000000067F000080000007C00C00000A4000__0000009C1E3799F0", +"000000067F000080000007C00C00000A0C0B-000000067F000080000007C00C0100000000__00000099F1C9FC71-0000009A918DF181", +"000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009B40525F80", +"000000067F000080000007C00C00000A4000-000000067F000080000007C00C00000A8000__0000009C1E3799F0", +"000000067F000080000007C00C00000A424C-000000067F000080000007C00C00000F5B43__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009B40525F80", +"000000067F000080000007C00C00000A8000-000000067F000080000007C00C00000AC000__0000009C1E3799F0", +"000000067F000080000007C00C00000A9244-000000067F000080000007C00C00000B2991__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009B40525F80", +"000000067F000080000007C00C00000AC000-000000067F000080000007C00C00000B0000__0000009C1E3799F0", +"000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009B40525F80", +"000000067F000080000007C00C00000B0000-000000067F000080000007C00C00000B4000__0000009C1E3799F0", 
+"000000067F000080000007C00C00000B2991-000000067F000080000007C00C00000BC0F7__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009B40525F80", +"000000067F000080000007C00C00000B4000-000000067F000080000007C00C00000B8000__0000009C1E3799F0", +"000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009B40525F80", +"000000067F000080000007C00C00000B8000-000000067F000080000007C00C00000BC000__0000009C1E3799F0", +"000000067F000080000007C00C00000BA258-000000067F000080000007C01400000011E2__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009B40525F80", +"000000067F000080000007C00C00000BC000-000000067F000080000007C00C00000C0000__0000009C1E3799F0", +"000000067F000080000007C00C00000BC0F7-000000067F000080000007C00C00000C580C__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009B40525F80", +"000000067F000080000007C00C00000C0000-000000067F000080000007C00C00000C4000__0000009C1E3799F0", +"000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009B40525F80", +"000000067F000080000007C00C00000C4000-000000067F000080000007C00C00000C8000__0000009C1E3799F0", +"000000067F000080000007C00C00000C580C-000000067F000080000007C00C00000CEF72__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009B40525F80", +"000000067F000080000007C00C00000C8000-000000067F000080000007C00C00000CC000__0000009C1E3799F0", +"000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009B40525F80", +"000000067F000080000007C00C00000CC000-000000067F000080000007C00C00000D0000__0000009C1E3799F0", +"000000067F000080000007C00C00000CEF72-000000067F000080000007C00C00000D86D8__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009B40525F80", +"000000067F000080000007C00C00000D0000-000000067F000080000007C00C00000D4000__0000009C1E3799F0", +"000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009B40525F80", +"000000067F000080000007C00C00000D4000-000000067F000080000007C00C00000D8000__0000009C1E3799F0", +"000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009B40525F80", +"000000067F000080000007C00C00000D8000-000000067F000080000007C00C00000DC000__0000009C1E3799F0", +"000000067F000080000007C00C00000D86D8-000000067F000080000007C00C00000E1E0B__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009B40525F80", +"000000067F000080000007C00C00000DC000-000000067F000080000007C00C00000E0000__0000009C1E3799F0", +"000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009B40525F80", +"000000067F000080000007C00C00000E0000-000000067F000080000007C00C00000E4000__0000009C1E3799F0", +"000000067F000080000007C00C00000E1E0B-000000067F000080000007C00C00000EB571__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009B40525F80", +"000000067F000080000007C00C00000E4000-000000067F000080000007C00C00000E8000__0000009C1E3799F0", +"000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009B40525F80", +"000000067F000080000007C00C00000E8000-000000067F000080000007C00C00000EC000__0000009C1E3799F0", 
+"000000067F000080000007C00C00000EB571-000000067F000080000007C00C00000F4CD7__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009B40525F80", +"000000067F000080000007C00C00000EC000-000000067F000080000007C00C00000F0000__0000009C1E3799F0", +"000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009B40525F80", +"000000067F000080000007C00C00000F0000-000000067F000080000007C00C00000F4000__0000009C1E3799F0", +"000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009B40525F80", +"000000067F000080000007C00C00000F4000-000000067F000080000007C00C00000F8000__0000009C1E3799F0", +"000000067F000080000007C00C00000F4CD7-000000067F000080000007C00C00000FE40B__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C00000F5B56-000000067F000080000007C014000000EB5A__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009B40525F80", +"000000067F000080000007C00C00000F8000-000000067F000080000007C00C00000FC000__0000009C1E3799F0", +"000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009B40525F80", +"000000067F000080000007C00C00000FC000-000000067F000080000007C00C0000100000__0000009C1E3799F0", +"000000067F000080000007C00C00000FE40B-000000067F000080000007C00C0000107B27__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009B40525F80", +"000000067F000080000007C00C0000100000-000000067F000080000007C00C0000104000__0000009C1E3799F0", +"000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009B40525F80", +"000000067F000080000007C00C0000104000-000000067F000080000007C00C0000108000__0000009C1E3799F0", +"000000067F000080000007C00C0000107B27-000000067F000080000007C00C000011128D__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C00C0000108000-000000067F000080000007C00C000010C000__0000009C1E3799F0", +"000000067F000080000007C00C0000108000-030000000000000000000000000000000002__0000009B40525F80", +"000000067F000080000007C00C000010C000-000000067F000080000007C00C0000110000__0000009C1E3799F0", +"000000067F000080000007C00C0000110000-000000067F000080000007C0120100000000__0000009C1E3799F0", +"000000067F000080000007C00C000011128D-010000000000000001000000040000000012__0000009A918DF181-0000009B51A8BBB9", +"000000067F000080000007C0140000000000-000000067F000080000007C0140000004000__0000009C1E3799F0", +"000000067F000080000007C01400000011E2-000000067F000080000007C0140000007F04__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000004000-000000067F000080000007C0140000008000__0000009C1E3799F0", +"000000067F000080000007C0140000007F04-000000067F000080000007C014000000EC12__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000008000-000000067F000080000007C014000000C000__0000009C1E3799F0", +"000000067F000080000007C014000000C000-000000067F000080000007C0140000010000__0000009C1E3799F0", +"000000067F000080000007C014000000EB5A-000000067F000080000007C0140000027B5C__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C014000000EC12-000000067F000080000007C0140000015910__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000010000-000000067F000080000007C0140000014000__0000009C1E3799F0", +"000000067F000080000007C0140000014000-000000067F000080000007C0140000018000__0000009C1E3799F0", +"000000067F000080000007C0140000015910-000000067F000080000007C014000001C5BB__0000009B51A8BBB9-0000009BCB4E4461", 
+"000000067F000080000007C0140000018000-000000067F000080000007C014000001C000__0000009C1E3799F0", +"000000067F000080000007C014000001C000-000000067F000080000007C0140000020000__0000009C1E3799F0", +"000000067F000080000007C014000001C5BB-000000067F000080000007C0140000023298__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000020000-000000067F000080000007C0140000024000__0000009C1E3799F0", +"000000067F000080000007C0140000023298-000000067F000080000007C0140000029F9A__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C0140000024000-000000067F000080000007C0140000028000__0000009C1E3799F0", +"000000067F000080000007C0140000027B5E-030000000000000000000000000000000002__0000009BCB4E4461-0000009C1E8CC879", +"000000067F000080000007C0140000028000-000000067F000080000007C014000002C000__0000009C1E3799F0", +"000000067F000080000007C0140000029F9A-030000000000000000000000000000000002__0000009B51A8BBB9-0000009BCB4E4461", +"000000067F000080000007C014000002C000-030000000000000000000000000000000002__0000009C1E3799F0", +"000000067F000080000007E00C0000000000-000000067F000080000007E00C0000004000__0000009DEF760000", +"000000067F000080000007E00C0000004000-000000067F000080000007E00C0000008000__0000009DEF760000", +"000000067F000080000007E00C0000008000-000000067F000080000007E00C000000C000__0000009DEF760000", +"000000067F000080000007E00C00000092CD-000000067F000080000007E00C0000012A0A__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000000C000-000000067F000080000007E00C0000010000__0000009DEF760000", +"000000067F000080000007E00C0000010000-000000067F000080000007E00C0000014000__0000009DEF760000", +"000000067F000080000007E00C0000012A0A-000000067F000080000007E00C000001C170__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000014000-000000067F000080000007E00C0000018000__0000009DEF760000", +"000000067F000080000007E00C0000018000-000000067F000080000007E00C000001C000__0000009DEF760000", +"000000067F000080000007E00C000001C000-000000067F000080000007E00C0000020000__0000009DEF760000", +"000000067F000080000007E00C000001C170-000000067F000080000007E00C00000258D6__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000020000-000000067F000080000007E00C0000024000__0000009DEF760000", +"000000067F000080000007E00C0000024000-000000067F000080000007E00C0000028000__0000009DEF760000", +"000000067F000080000007E00C00000258D6-000000067F000080000007E00C000002F00B__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000028000-000000067F000080000007E00C000002C000__0000009DEF760000", +"000000067F000080000007E00C000002C000-000000067F000080000007E00C0000030000__0000009DEF760000", +"000000067F000080000007E00C000002F00B-000000067F000080000007E00C0000038720__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000030000-000000067F000080000007E00C0000034000__0000009DEF760000", +"000000067F000080000007E00C0000034000-000000067F000080000007E00C0000038000__0000009DEF760000", +"000000067F000080000007E00C0000038000-000000067F000080000007E00C000003C000__0000009DEF760000", +"000000067F000080000007E00C0000038720-000000067F000080000007E00C0000041E86__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000003C000-000000067F000080000007E00C0000040000__0000009DEF760000", +"000000067F000080000007E00C0000040000-000000067F000080000007E00C0000044000__0000009DEF760000", +"000000067F000080000007E00C0000041E86-000000067F000080000007E00C000004B5EC__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C0000044000-000000067F000080000007E00C0000048000__0000009DEF760000", 
+"000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009DDBE10620", +"000000067F000080000007E00C0000048000-000000067F000080000007E00C000004C000__0000009EBB11FFC0", +"000000067F000080000007E00C000004B5EC-030000000000000000000000000000000002__0000009C1E8CC879-0000009C9ED3F059", +"000000067F000080000007E00C000004BACA-000000067F000080000007E00C00000551FF__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009DDBE10620", +"000000067F000080000007E00C000004C000-000000067F000080000007E00C0000050000__0000009EBB11FFC0", +"000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009DDBE10620", +"000000067F000080000007E00C0000050000-000000067F000080000007E00C0000054000__0000009EBB11FFC0", +"000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009DDBE10620", +"000000067F000080000007E00C0000054000-000000067F000080000007E00C0000058000__0000009EBB11FFC0", +"000000067F000080000007E00C00000551FF-000000067F000080000007E00C000005E90C__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009DDBE10620", +"000000067F000080000007E00C0000058000-000000067F000080000007E00C000005C000__0000009EBB11FFC0", +"000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009DDBE10620", +"000000067F000080000007E00C000005C000-000000067F000080000007E00C0000060000__0000009EBB11FFC0", +"000000067F000080000007E00C000005E90C-000000067F000080000007E00C000006802C__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009DDBE10620", +"000000067F000080000007E00C0000060000-000000067F000080000007E00C0000064000__0000009EBB11FFC0", +"000000067F000080000007E00C0000061AE1-000000067F000080000007E00C00000C2A6C__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009DDBE10620", +"000000067F000080000007E00C0000064000-000000067F000080000007E00C0000068000__0000009EBB11FFC0", +"000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009DDBE10620", +"000000067F000080000007E00C0000068000-000000067F000080000007E00C000006C000__0000009EBB11FFC0", +"000000067F000080000007E00C000006802C-000000067F000080000007E00C0000071783__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009DDBE10620", +"000000067F000080000007E00C000006C000-000000067F000080000007E00C0000070000__0000009EBB11FFC0", +"000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009DDBE10620", +"000000067F000080000007E00C0000070000-000000067F000080000007E00C0000074000__0000009EBB11FFC0", +"000000067F000080000007E00C0000071783-000000067F000080000007E00C000007AEE9__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009DDBE10620", +"000000067F000080000007E00C0000074000-000000067F000080000007E00C0000078000__0000009EBB11FFC0", +"000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009DDBE10620", +"000000067F000080000007E00C0000078000-000000067F000080000007E00C000007C000__0000009EBB11FFC0", +"000000067F000080000007E00C000007AEE9-000000067F000080000007E00C000008460B__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009DDBE10620", 
+"000000067F000080000007E00C000007C000-000000067F000080000007E00C0000080000__0000009EBB11FFC0", +"000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009DDBE10620", +"000000067F000080000007E00C0000080000-000000067F000080000007E00C0000084000__0000009EBB11FFC0", +"000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009DDBE10620", +"000000067F000080000007E00C0000084000-000000067F000080000007E00C0000088000__0000009EBB11FFC0", +"000000067F000080000007E00C000008460B-000000067F000080000007E00C000008DD71__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009DDBE10620", +"000000067F000080000007E00C0000088000-000000067F000080000007E00C000008C000__0000009EBB11FFC0", +"000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009DDBE10620", +"000000067F000080000007E00C000008C000-000000067F000080000007E00C0000090000__0000009EBB11FFC0", +"000000067F000080000007E00C000008DD71-000000067F000080000007E00C00000974D7__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009DDBE10620", +"000000067F000080000007E00C0000090000-000000067F000080000007E00C0000094000__0000009EBB11FFC0", +"000000067F000080000007E00C0000093E3A-000000067F000080000007E00C0000111CED__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009DDBE10620", +"000000067F000080000007E00C0000094000-000000067F000080000007E00C0000098000__0000009EBB11FFC0", +"000000067F000080000007E00C00000974D7-000000067F000080000007E00C00000A0C0B__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009DDBE10620", +"000000067F000080000007E00C0000098000-000000067F000080000007E00C000009C000__0000009EBB11FFC0", +"000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009DDBE10620", +"000000067F000080000007E00C000009C000-000000067F000080000007E00C00000A0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009DDBE10620", +"000000067F000080000007E00C00000A0000-000000067F000080000007E00C00000A4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A0C0B-000000067F000080000007E00C00000AA371__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009DDBE10620", +"000000067F000080000007E00C00000A4000-000000067F000080000007E00C00000A8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009DDBE10620", +"000000067F000080000007E00C00000A8000-000000067F000080000007E00C00000AC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000AA371-000000067F000080000007E00C0100000000__0000009C9ED3F059-0000009D3E97E549", +"000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009DDBE10620", +"000000067F000080000007E00C00000AC000-000000067F000080000007E00C00000B0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009DDBE10620", +"000000067F000080000007E00C00000B0000-000000067F000080000007E00C00000B4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B2704-000000067F000080000007E00C00000BBE0F__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009DDBE10620", 
+"000000067F000080000007E00C00000B4000-000000067F000080000007E00C00000B8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009DDBE10620", +"000000067F000080000007E00C00000B8000-000000067F000080000007E00C00000BC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000BBE0F-000000067F000080000007E00C00000C5542__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009DDBE10620", +"000000067F000080000007E00C00000BC000-000000067F000080000007E00C00000C0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009DDBE10620", +"000000067F000080000007E00C00000C0000-000000067F000080000007E00C00000C4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C2A75-000000067F000080000007E0140000004415__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009DDBE10620", +"000000067F000080000007E00C00000C4000-000000067F000080000007E00C00000C8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000C5542-000000067F000080000007E00C00000CECA8__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009DDBE10620", +"000000067F000080000007E00C00000C8000-000000067F000080000007E00C00000CC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009DDBE10620", +"000000067F000080000007E00C00000CC000-000000067F000080000007E00C00000D0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000CECA8-000000067F000080000007E00C00000D83BF__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009DDBE10620", +"000000067F000080000007E00C00000D0000-000000067F000080000007E00C00000D4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009DDBE10620", +"000000067F000080000007E00C00000D4000-000000067F000080000007E00C00000D8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009DDBE10620", +"000000067F000080000007E00C00000D8000-000000067F000080000007E00C00000DC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000D83BF-000000067F000080000007E00C00000E1B0A__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009DDBE10620", +"000000067F000080000007E00C00000DC000-000000067F000080000007E00C00000E0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009DDBE10620", +"000000067F000080000007E00C00000E0000-000000067F000080000007E00C00000E4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E1B0A-000000067F000080000007E00C00000EB270__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009DDBE10620", +"000000067F000080000007E00C00000E4000-000000067F000080000007E00C00000E8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009DDBE10620", +"000000067F000080000007E00C00000E8000-000000067F000080000007E00C00000EC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000EB270-000000067F000080000007E00C00000F49AA__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009DDBE10620", 
+"000000067F000080000007E00C00000EC000-000000067F000080000007E00C00000F0000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009DDBE10620", +"000000067F000080000007E00C00000F0000-000000067F000080000007E00C00000F4000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009DDBE10620", +"000000067F000080000007E00C00000F4000-000000067F000080000007E00C00000F8000__0000009EBB11FFC0", +"000000067F000080000007E00C00000F49AA-000000067F000080000007E00C00000FE10A__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009DDBE10620", +"000000067F000080000007E00C00000F8000-000000067F000080000007E00C00000FC000__0000009EBB11FFC0", +"000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009DDBE10620", +"000000067F000080000007E00C00000FC000-000000067F000080000007E00C0000100000__0000009EBB11FFC0", +"000000067F000080000007E00C00000FE10A-000000067F000080000007E00C000010782C__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009DDBE10620", +"000000067F000080000007E00C0000100000-000000067F000080000007E00C0000104000__0000009EBB11FFC0", +"000000067F000080000007E00C0000104000-000000067F000080000007E00C0000108000__0000009EBB11FFC0", +"000000067F000080000007E00C0000104000-030000000000000000000000000000000002__0000009DDBE10620", +"000000067F000080000007E00C000010782C-000000067F000080000007E00C0000110F88__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000108000-000000067F000080000007E00C000010C000__0000009EBB11FFC0", +"000000067F000080000007E00C000010C000-000000067F000080000007E00C0000110000__0000009EBB11FFC0", +"000000067F000080000007E00C0000110000-000000067F000080000007E0120100000000__0000009EBB11FFC0", +"000000067F000080000007E00C0000110F88-010000000000000001000000040000000015__0000009D3E97E549-0000009DEEE6BFF9", +"000000067F000080000007E00C0000111CED-000000067F000080000007E0140000004818__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000000000-000000067F000080000007E0140000004000__0000009EBB11FFC0", +"000000067F000080000007E0140000004000-000000067F000080000007E0140000008000__0000009EBB11FFC0", +"000000067F000080000007E0140000004418-000000067F000080000007E0140000025351__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E0140000004818-000000067F000080000007E014000000AD57__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000008000-000000067F000080000007E014000000C000__0000009EBB11FFC0", +"000000067F000080000007E014000000AD57-000000067F000080000007E0140000011291__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E014000000C000-000000067F000080000007E0140000010000__0000009EBB11FFC0", +"000000067F000080000007E0140000010000-000000067F000080000007E0140000014000__0000009EBB11FFC0", +"000000067F000080000007E0140000011291-000000067F000080000007E0140000017809__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000014000-000000067F000080000007E0140000018000__0000009EBB11FFC0", +"000000067F000080000007E0140000017809-000000067F000080000007E014000001DD22__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000018000-000000067F000080000007E014000001C000__0000009EBB11FFC0", +"000000067F000080000007E014000001C000-000000067F000080000007E0140000020000__0000009EBB11FFC0", +"000000067F000080000007E014000001DD22-000000067F000080000007E0140000024244__0000009DEEE6BFF9-0000009E781A9731", 
+"000000067F000080000007E0140000020000-000000067F000080000007E0140000024000__0000009EBB11FFC0", +"000000067F000080000007E0140000024000-000000067F000080000007E0140000028000__0000009EBB11FFC0", +"000000067F000080000007E0140000024244-000000067F000080000007E014000002A798__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E0140000025355-030000000000000000000000000000000002__0000009E781A9731-0000009EBBC72771", +"000000067F000080000007E0140000028000-000000067F000080000007E014000002C000__0000009EBB11FFC0", +"000000067F000080000007E014000002A798-030000000000000000000000000000000002__0000009DEEE6BFF9-0000009E781A9731", +"000000067F000080000007E014000002C000-030000000000000000000000000000000002__0000009EBB11FFC0", +"000000067F000080000008000C00000081F6-000000067F000080000008000C0000010448__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000010448-000000067F000080000008000C000001870A__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000001870A-000000067F000080000008000C0000020905__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000020905-000000067F000080000008000C0000028AF3__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000028AF3-000000067F000080000008000C0000030CEA__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000030CEA-000000067F000080000008000C0000038EB6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000038EB6-000000067F000080000008000C00000410B5__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000410B5-000000067F000080000008000C00000492CB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000492CB-000000067F000080000008000C00000514F8__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000514F8-000000067F000080000008000C000005977B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000005977B-000000067F000080000008000C00000619C6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000619C6-000000067F000080000008000C0000069B6B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000069B6B-000000067F000080000008000C0000071DBE__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000071DBE-000000067F000080000008000C0000079F8E__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000079F8E-000000067F000080000008000C00000821D7__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000821D7-000000067F000080000008000C000008A3AB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000008A3AB-000000067F000080000008000C0000092556__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000092556-000000067F000080000008000C000009A744__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000009A744-000000067F000080000008000C00000A29B0__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000A29B0-000000067F000080000008000C00000AAC4B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000AAC4B-000000067F000080000008000C00000B2E21__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000B2E21-000000067F000080000008000C00000BB0DB__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000BB0DB-000000067F000080000008000C00000C331B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000C331B-000000067F000080000008000C00000CB4D2__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000CB4D2-000000067F000080000008000C00000D3754__0000009EBBC72771-000000A154401909", 
+"000000067F000080000008000C00000D3754-000000067F000080000008000C00000DB9C6__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000DB9C6-000000067F000080000008000C00000E3BC1__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000E3BC1-000000067F000080000008000C00000EBE00__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000EBE00-000000067F000080000008000C00000F3F63__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000F3F63-000000067F000080000008000C00000FC160__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C00000FC160-000000067F000080000008000C0000104448__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C0000104448-000000067F000080000008000C000010C675__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C000010C675-000000067F000080000008000C020000000B__0000009EBBC72771-000000A154401909", +"000000067F000080000008000C020000000B-000000067F00008000000800140000003ED1__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000003ED1-000000067F00008000000800140000009486__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000009486-000000067F0000800000080014000000EA73__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000000EA73-000000067F0000800000080014000001404D__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000001404D-000000067F000080000008001400000195A4__0000009EBBC72771-000000A154401909", +"000000067F000080000008001400000195A4-000000067F0000800000080014000001EBB4__0000009EBBC72771-000000A154401909", +"000000067F0000800000080014000001EBB4-000000067F000080000008001400000241E2__0000009EBBC72771-000000A154401909", +"000000067F000080000008001400000241E2-000000067F00008000000800140000029762__0000009EBBC72771-000000A154401909", +"000000067F00008000000800140000029762-030000000000000000000000000000000002__0000009EBBC72771-000000A154401909", +"000000067F000080000008200C0000000000-000000067F000080000008200C0000004000__000000A29F1D8950", +"000000067F000080000008200C0000004000-000000067F000080000008200C0000008000__000000A29F1D8950", +"000000067F000080000008200C0000008000-000000067F000080000008200C000000C000__000000A29F1D8950", +"000000067F000080000008200C000000974D-000000067F000080000008200C0000012EB3__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000000C000-000000067F000080000008200C0000010000__000000A29F1D8950", +"000000067F000080000008200C0000010000-000000067F000080000008200C0000014000__000000A29F1D8950", +"000000067F000080000008200C0000012EB3-000000067F000080000008200C000001C60A__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000014000-000000067F000080000008200C0000018000__000000A29F1D8950", +"000000067F000080000008200C0000018000-000000067F000080000008200C000001C000__000000A29F1D8950", +"000000067F000080000008200C000001C000-000000067F000080000008200C0000020000__000000A29F1D8950", +"000000067F000080000008200C000001C60A-000000067F000080000008200C0000025D38__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000020000-000000067F000080000008200C0000024000__000000A29F1D8950", +"000000067F000080000008200C0000024000-000000067F000080000008200C0000028000__000000A29F1D8950", +"000000067F000080000008200C0000025D38-000000067F000080000008200C000002F49E__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000028000-000000067F000080000008200C000002C000__000000A29F1D8950", +"000000067F000080000008200C000002C000-000000067F000080000008200C0000030000__000000A29F1D8950", 
+"000000067F000080000008200C000002F49E-000000067F000080000008200C0000038BB1__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000030000-000000067F000080000008200C0000034000__000000A29F1D8950", +"000000067F000080000008200C0000034000-000000067F000080000008200C0000038000__000000A29F1D8950", +"000000067F000080000008200C0000038000-000000067F000080000008200C000003C000__000000A29F1D8950", +"000000067F000080000008200C0000038BB1-000000067F000080000008200C0000042317__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000003C000-000000067F000080000008200C0000040000__000000A29F1D8950", +"000000067F000080000008200C0000040000-000000067F000080000008200C0000044000__000000A29F1D8950", +"000000067F000080000008200C0000042317-000000067F000080000008200C000004BA7D__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000044000-000000067F000080000008200C0000048000__000000A29F1D8950", +"000000067F000080000008200C0000048000-000000067F000080000008200C000004C000__000000A29F1D8950", +"000000067F000080000008200C000004BA7D-000000067F000080000008200C00000551B2__000000A154401909-000000A1E407F839", +"000000067F000080000008200C000004C000-000000067F000080000008200C0000050000__000000A29F1D8950", +"000000067F000080000008200C0000050000-000000067F000080000008200C0000054000__000000A29F1D8950", +"000000067F000080000008200C0000054000-000000067F000080000008200C0000058000__000000A29F1D8950", +"000000067F000080000008200C00000551B2-030000000000000000000000000000000002__000000A154401909-000000A1E407F839", +"000000067F000080000008200C0000058000-000000067F000080000008200C000005C000__000000A29F1D8950", +"000000067F000080000008200C000005C000-000000067F000080000008200C0000060000__000000A29F1D8950", +"000000067F000080000008200C000005D8FE-000000067F000080000008200C000006700C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000060000-000000067F000080000008200C0000064000__000000A29F1D8950", +"000000067F000080000008200C0000064000-000000067F000080000008200C0000068000__000000A29F1D8950", +"000000067F000080000008200C000006700C-000000067F000080000008200C000007076D__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000068000-000000067F000080000008200C000006C000__000000A29F1D8950", +"000000067F000080000008200C000006C000-000000067F000080000008200C0000070000__000000A29F1D8950", +"000000067F000080000008200C0000070000-000000067F000080000008200C0000074000__000000A29F1D8950", +"000000067F000080000008200C000007076D-000000067F000080000008200C0000079ED3__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000074000-000000067F000080000008200C0000078000__000000A29F1D8950", +"000000067F000080000008200C0000078000-000000067F000080000008200C000007C000__000000A29F1D8950", +"000000067F000080000008200C0000079ED3-000000067F000080000008200C000008360A__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C000007C000-000000067F000080000008200C0000080000__000000A29F1D8950", +"000000067F000080000008200C0000080000-000000067F000080000008200C0000084000__000000A29F1D8950", +"000000067F000080000008200C000008360A-000000067F000080000008200C000008CD70__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000084000-000000067F000080000008200C0000088000__000000A29F1D8950", +"000000067F000080000008200C0000088000-000000067F000080000008200C000008C000__000000A29F1D8950", +"000000067F000080000008200C000008C000-000000067F000080000008200C0000090000__000000A29F1D8950", +"000000067F000080000008200C000008CD70-000000067F000080000008200C00000964D6__000000A1E407F839-000000A323C9E001", 
+"000000067F000080000008200C0000090000-000000067F000080000008200C0000094000__000000A29F1D8950", +"000000067F000080000008200C0000094000-000000067F000080000008200C0000098000__000000A29F1D8950", +"000000067F000080000008200C00000964D6-000000067F000080000008200C000009FC0B__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000098000-000000067F000080000008200C000009C000__000000A29F1D8950", +"000000067F000080000008200C000009C000-000000067F000080000008200C00000A0000__000000A29F1D8950", +"000000067F000080000008200C000009FC0B-000000067F000080000008200C00000A9319__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000A0000-000000067F000080000008200C00000A4000__000000A29F1D8950", +"000000067F000080000008200C00000A4000-000000067F000080000008200C00000A8000__000000A29F1D8950", +"000000067F000080000008200C00000A8000-000000067F000080000008200C00000AC000__000000A29F1D8950", +"000000067F000080000008200C00000A9319-000000067F000080000008200C00000B2A7F__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000AC000-000000067F000080000008200C00000B0000__000000A29F1D8950", +"000000067F000080000008200C00000B0000-000000067F000080000008200C00000B4000__000000A29F1D8950", +"000000067F000080000008200C00000B2A7F-000000067F000080000008200C00000BC1E5__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000B4000-000000067F000080000008200C00000B8000__000000A29F1D8950", +"000000067F000080000008200C00000B8000-000000067F000080000008200C00000BC000__000000A29F1D8950", +"000000067F000080000008200C00000BC000-000000067F000080000008200C00000C0000__000000A29F1D8950", +"000000067F000080000008200C00000BC1E5-000000067F000080000008200C00000C590C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000C0000-010000000000000000000000000000000001__000000A29F1D8950", +"000000067F000080000008200C00000C590C-000000067F000080000008200C00000CF071__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000CF071-000000067F000080000008200C00000D8786__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000D8786-000000067F000080000008200C00000E1EEC__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000E1EEC-000000067F000080000008200C00000EB60C__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000EB60C-000000067F000080000008200C00000F4D43__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000F4D43-000000067F000080000008200C00000FE4A9__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C00000FE4A9-000000067F000080000008200C0000107BC5__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C0000107BC5-000000067F000080000008200C000011130B__000000A1E407F839-000000A323C9E001", +"000000067F000080000008200C000011130B-01000000000000000100000004000000001C__000000A1E407F839-000000A323C9E001", +"000000067F0000800000082014000000393C-000000067F0000800000082014000000B84D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000000B84D-000000067F0000800000082014000001375E__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000001375E-000000067F0000800000082014000001B66D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000001B66D-000000067F0000800000082014000002357E__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000002357E-000000067F0000800000082014000002B48D__000000A323C9E001-000000A37A60B1A9", +"000000067F0000800000082014000002B48D-030000000000000000000000000000000002__000000A323C9E001-000000A37A60B1A9", 
+"000000067F000080000008600C0000000000-000000067F000080000008600C0000004000__000000A434813A68", +"000000067F000080000008600C0000004000-000000067F000080000008600C0000008000__000000A434813A68", +"000000067F000080000008600C0000008000-000000067F000080000008600C000000C000__000000A434813A68", +"000000067F000080000008600C0000009747-000000067F000080000008600C0000012EAD__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C000000C000-000000067F000080000008600C0000010000__000000A434813A68", +"000000067F000080000008600C0000010000-000000067F000080000008600C0000014000__000000A434813A68", +"000000067F000080000008600C0000012EAD-000000067F000080000008600C000001C60A__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000014000-000000067F000080000008600C0000018000__000000A434813A68", +"000000067F000080000008600C0000018000-000000067F000080000008600C000001C000__000000A434813A68", +"000000067F000080000008600C000001C000-000000067F000080000008600C0000020000__000000A434813A68", +"000000067F000080000008600C000001C60A-000000067F000080000008600C0000025D38__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000020000-000000067F000080000008600C0000024000__000000A434813A68", +"000000067F000080000008600C0000024000-000000067F000080000008600C0000028000__000000A434813A68", +"000000067F000080000008600C0000025D38-000000067F000080000008600C000002F49E__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C0000028000-000000067F000080000008600C000002C000__000000A434813A68", +"000000067F000080000008600C000002C000-000000067F000080000008600C0000030000__000000A434813A68", +"000000067F000080000008600C000002F49E-030000000000000000000000000000000002__000000A37A60B1A9-000000A3CA47ECA9", +"000000067F000080000008600C000002F4CA-000000067F000080000008600C0000038BDD__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000030000-000000067F000080000008600C0000034000__000000A434813A68", +"000000067F000080000008600C0000034000-000000067F000080000008600C0000038000__000000A434813A68", +"000000067F000080000008600C0000038000-000000067F000080000008600C000003C000__000000A434813A68", +"000000067F000080000008600C0000038BDD-000000067F000080000008600C000004230B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000003C000-000000067F000080000008600C0000040000__000000A434813A68", +"000000067F000080000008600C0000040000-000000067F000080000008600C0000044000__000000A434813A68", +"000000067F000080000008600C000004230B-000000067F000080000008600C000004BA71__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000044000-000000067F000080000008600C0000048000__000000A434813A68", +"000000067F000080000008600C0000048000-000000067F000080000008600C000004C000__000000A434813A68", +"000000067F000080000008600C000004BA71-000000067F000080000008600C00000551A6__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000004C000-000000067F000080000008600C0000050000__000000A434813A68", +"000000067F000080000008600C0000050000-000000067F000080000008600C0000054000__000000A434813A68", +"000000067F000080000008600C0000054000-000000067F000080000008600C0000058000__000000A434813A68", +"000000067F000080000008600C00000551A6-000000067F000080000008600C000005E90A__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000058000-000000067F000080000008600C000005C000__000000A434813A68", +"000000067F000080000008600C000005C000-000000067F000080000008600C0000060000__000000A434813A68", +"000000067F000080000008600C000005E90A-000000067F000080000008600C000006802C__000000A3CA47ECA9-000000A539BDE561", 
+"000000067F000080000008600C0000060000-000000067F000080000008600C0000064000__000000A434813A68", +"000000067F000080000008600C0000064000-000000067F000080000008600C0000068000__000000A434813A68", +"000000067F000080000008600C0000068000-000000067F000080000008600C000006C000__000000A434813A68", +"000000067F000080000008600C000006802C-000000067F000080000008600C0000071783__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000006C000-030000000000000000000000000000000002__000000A434813A68", +"000000067F000080000008600C0000071783-000000067F000080000008600C000007AEE9__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000007AEE9-000000067F000080000008600C000008460B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000008460B-000000067F000080000008600C000008DD71__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000008DD71-000000067F000080000008600C00000974D7__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000974D7-000000067F000080000008600C00000A0C0B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000A0C0B-000000067F000080000008600C00000AA371__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000AA371-000000067F000080000008600C00000B3AD7__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000B3AD7-000000067F000080000008600C00000BD20B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000BD20B-000000067F000080000008600C00000C6932__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000C6932-000000067F000080000008600C00000D0098__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000D0098-000000067F000080000008600C00000D97FE__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000D97FE-000000067F000080000008600C00000E2F0B__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000E2F0B-000000067F000080000008600C00000EC671__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000EC671-000000067F000080000008600C00000F5D9F__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000F5D9F-000000067F000080000008600C00000FF505__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C00000FF505-000000067F000080000008600C0000108C10__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C0000108C10-000000067F000080000008600C0100000000__000000A3CA47ECA9-000000A539BDE561", +"000000067F000080000008600C000010ECC4-000000067F00008000000860140000002607__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000002607-000000067F0000800000086014000000A518__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000000A518-000000067F00008000000860140000012429__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000012429-000000067F0000800000086014000001A338__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000001A338-000000067F00008000000860140000022249__000000A539BDE561-000000A5A081B661", +"000000067F00008000000860140000022249-000000067F0000800000086014000002A159__000000A539BDE561-000000A5A081B661", +"000000067F0000800000086014000002A159-030000000000000000000000000000000002__000000A539BDE561-000000A5A081B661", +"000000067F000080000008801C0000009703-000000067F000080000008801C0000012E0E__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000012E0E-000000067F000080000008801C000001C574__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000001C574-000000067F000080000008801C0000025CDA__000000A5A081B661-000000A6503DE919", 
+"000000067F000080000008801C0000025CDA-000000067F000080000008801C000002F40A__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000002F40A-000000067F000080000008801C0000038B1D__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000038B1D-000000067F000080000008801C0000042283__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000042283-000000067F000080000008801C000004B9E9__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000004B9E9-000000067F000080000008801C000005510B__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000005510B-000000067F000080000008801C000005E871__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C000005E871-000000067F000080000008801C0000067F8B__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000067F8B-030000000000000000000000000000000002__000000A5A081B661-000000A6503DE919", +"000000067F000080000008801C0000068000-000000067F000080000008801C000006C000__000000A76EC5DFE8", +"000000067F000080000008801C00000680F7-000000067F000080000008801C000007180C__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C000006C000-000000067F000080000008801C0000070000__000000A76EC5DFE8", +"000000067F000080000008801C0000070000-000000067F000080000008801C0000074000__000000A76EC5DFE8", +"000000067F000080000008801C000007180C-000000067F000080000008801C000007AF72__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000074000-000000067F000080000008801C0000078000__000000A76EC5DFE8", +"000000067F000080000008801C0000078000-000000067F000080000008801C000007C000__000000A76F097A80", +"000000067F000080000008801C000007AF72-000000067F000080000008801C00000846D8__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C000007C000-000000067F000080000008801C0000080000__000000A76F097A80", +"000000067F000080000008801C0000080000-000000067F000080000008801C0000084000__000000A76F097A80", +"000000067F000080000008801C0000084000-000000067F000080000008801C0000088000__000000A76F097A80", +"000000067F000080000008801C00000846D8-000000067F000080000008801C000008DE0B__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000088000-000000067F000080000008801C000008C000__000000A76F097A80", +"000000067F000080000008801C000008C000-000000067F000080000008801C0000090000__000000A76F097A80", +"000000067F000080000008801C000008DE0B-000000067F000080000008801C000009752B__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000090000-000000067F000080000008801C0000094000__000000A76F097A80", +"000000067F000080000008801C0000094000-000000067F000080000008801C0000098000__000000A76F097A80", +"000000067F000080000008801C000009752B-000000067F000080000008801C00000A0C91__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C0000098000-000000067F000080000008801C000009C000__000000A76F097A80", +"000000067F000080000008801C000009C000-000000067F000080000008801C00000A0000__000000A76F097A80", +"000000067F000080000008801C00000A0000-000000067F000080000008801C00000A4000__000000A76F097A80", +"000000067F000080000008801C00000A0C91-000000067F000080000008801C00000AA3F7__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000A4000-000000067F000080000008801C00000A8000__000000A76F097A80", +"000000067F000080000008801C00000A8000-000000067F000080000008801C00000AC000__000000A76F097A80", +"000000067F000080000008801C00000AA3F7-000000067F000080000008801C00000B3B0C__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000AC000-000000067F000080000008801C00000B0000__000000A76F097A80", 
+"000000067F000080000008801C00000B0000-000000067F000080000008801C00000B4000__000000A76F097A80", +"000000067F000080000008801C00000B3B0C-000000067F000080000008801C00000BD272__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000B4000-000000067F000080000008801C00000B8000__000000A76F097A80", +"000000067F000080000008801C00000B8000-000000067F000080000008801C00000BC000__000000A76F097A80", +"000000067F000080000008801C00000BC000-000000067F000080000008801C00000C0000__000000A76F097A80", +"000000067F000080000008801C00000BD272-000000067F000080000008801C00000C6999__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000C0000-000000067F000080000008801C00000C4000__000000A76F097A80", +"000000067F000080000008801C00000C4000-000000067F000080000008801C00000C8000__000000A76F097A80", +"000000067F000080000008801C00000C6999-000000067F000080000008801C0100000000__000000A6503DE919-000000A6F001F909", +"000000067F000080000008801C00000C8000-000000067F000080000008801C00000CC000__000000A76F097A80", +"000000067F000080000008801C00000CC000-000000067F000080000008801C00000D0000__000000A76F097A80", +"000000067F000080000008801C00000CF6B0-000000067F000080000008801C00000D8DC1__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000D0000-000000067F000080000008801C00000D4000__000000A76F097A80", +"000000067F000080000008801C00000D4000-000000067F000080000008801C00000D8000__000000A76F097A80", +"000000067F000080000008801C00000D8000-000000067F000080000008801C00000DC000__000000A76F097A80", +"000000067F000080000008801C00000D8DC1-000000067F000080000008801C00000E250B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000DC000-000000067F000080000008801C00000E0000__000000A76F097A80", +"000000067F000080000008801C00000E0000-000000067F000080000008801C00000E4000__000000A76F097A80", +"000000067F000080000008801C00000E250B-000000067F000080000008801C00000EBC71__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000E4000-000000067F000080000008801C00000E8000__000000A76F097A80", +"000000067F000080000008801C00000E8000-000000067F000080000008801C00000EC000__000000A76F097A80", +"000000067F000080000008801C00000EBC71-000000067F000080000008801C00000F53A5__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000EC000-000000067F000080000008801C00000F0000__000000A76F097A80", +"000000067F000080000008801C00000F0000-000000067F000080000008801C00000F4000__000000A76F097A80", +"000000067F000080000008801C00000F4000-000000067F000080000008801C00000F8000__000000A76F097A80", +"000000067F000080000008801C00000F53A5-000000067F000080000008801C00000FEB0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C00000F8000-000000067F000080000008801C00000FC000__000000A76F097A80", +"000000067F000080000008801C00000FC000-000000067F000080000008801C0000100000__000000A76F097A80", +"000000067F000080000008801C00000FEB0B-000000067F000080000008801C000010822C__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C0000100000-000000067F000080000008801C0000104000__000000A76F097A80", +"000000067F000080000008801C0000104000-000000067F000080000008801C0000108000__000000A76F097A80", +"000000067F000080000008801C0000108000-000000067F000080000008801C000010C000__000000A76F097A80", +"000000067F000080000008801C000010822C-000000067F000080000008801C0000111982__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008801C000010C000-000000067F000080000008801C0000110000__000000A76F097A80", +"000000067F000080000008801C0000110000-030000000000000000000000000000000002__000000A76F097A80", 
+"000000067F000080000008801C0000111982-000000067F000080000008A00C00000084EA__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000084EA-000000067F000080000008A00C0000011C0C__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000011C0C-000000067F000080000008A00C000001B372__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000001B372-000000067F000080000008A00C0000024AD8__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000024AD8-000000067F000080000008A00C000002E20B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000002E20B-000000067F000080000008A00C0000037928__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000037928-000000067F000080000008A00C000004108E__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000004108E-000000067F000080000008A00C000004A7F4__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000004A7F4-000000067F000080000008A00C0000053F0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000053F0B-000000067F000080000008A00C000005D671__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000005D671-000000067F000080000008A00C0000066D95__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000066D95-000000067F000080000008A00C00000704FB__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000704FB-000000067F000080000008A00C0000079C0B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000079C0B-000000067F000080000008A00C0000083351__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000083351-000000067F000080000008A00C000008CAB7__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000008CAB7-000000067F000080000008A00C00000961E2__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000961E2-000000067F000080000008A00C000009F90B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C000009F90B-000000067F000080000008A00C00000A902B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000A902B-000000067F000080000008A00C00000B2779__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000B2779-000000067F000080000008A00C00000BBEDF__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000BBEDF-000000067F000080000008A00C00000C560A__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000C560A-000000067F000080000008A00C00000CED70__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000CED70-000000067F000080000008A00C00000D84D6__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000D84D6-000000067F000080000008A00C00000E1C0A__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000E1C0A-000000067F000080000008A00C00000EB370__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000EB370-000000067F000080000008A00C00000F4AD6__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000F4AD6-000000067F000080000008A00C00000FE20B__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C00000FE20B-030000000000000000000000000000000002__000000A6F001F909-000000A91D97FD49", +"000000067F000080000008A00C0000104A0C-000000067F000080000008A00C000010DF6E__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A00C000010DF6E-000000067F000080000008A0140000001A21__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000001A21-000000067F000080000008A0140000009932__000000A91D97FD49-000000A98AB7EE49", 
+"000000067F000080000008A0140000009932-000000067F000080000008A0140000011843__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000011843-000000067F000080000008A0140000019753__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000019753-000000067F000080000008A0140000021664__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008A0140000021664-01000000000000000100000004000000001C__000000A91D97FD49-000000A98AB7EE49", +"000000067F000080000008C00C0000000000-000000067F000080000008C00C0000004000__000000AAEBE534F8", +"000000067F000080000008C00C0000002330-000000067F000080000008C00C000000BA96__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000004000-000000067F000080000008C00C0000008000__000000AAEBE534F8", +"000000067F000080000008C00C0000008000-000000067F000080000008C00C000000C000__000000AAEBE534F8", +"000000067F000080000008C00C000000BA96-000000067F000080000008C00C00000151CB__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000000C000-000000067F000080000008C00C0000010000__000000AAEBE534F8", +"000000067F000080000008C00C0000010000-000000067F000080000008C00C0000014000__000000AAEBE534F8", +"000000067F000080000008C00C0000014000-000000067F000080000008C00C0000018000__000000AAEBE534F8", +"000000067F000080000008C00C00000151CB-000000067F000080000008C00C000001E90B__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000018000-000000067F000080000008C00C000001C000__000000AAEBE534F8", +"000000067F000080000008C00C000001C000-000000067F000080000008C00C0000020000__000000AAEBE534F8", +"000000067F000080000008C00C000001E90B-000000067F000080000008C00C000002802C__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000020000-000000067F000080000008C00C0000024000__000000AAEBE534F8", +"000000067F000080000008C00C0000024000-000000067F000080000008C00C0000028000__000000AAEBE534F8", +"000000067F000080000008C00C0000028000-000000067F000080000008C00C000002C000__000000AAEBE534F8", +"000000067F000080000008C00C000002802C-000000067F000080000008C00C0000031783__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000002C000-000000067F000080000008C00C0000030000__000000AAEBE534F8", +"000000067F000080000008C00C0000030000-000000067F000080000008C00C0000034000__000000AAEBE534F8", +"000000067F000080000008C00C0000031783-000000067F000080000008C00C000003AEE9__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000034000-000000067F000080000008C00C0000038000__000000AAEBE534F8", +"000000067F000080000008C00C0000038000-000000067F000080000008C00C000003C000__000000AAEBE534F8", +"000000067F000080000008C00C000003AEE9-000000067F000080000008C00C000004460B__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C000003C000-000000067F000080000008C00C0000040000__000000AAEBE534F8", +"000000067F000080000008C00C0000040000-000000067F000080000008C00C0000044000__000000AAEBE534F8", +"000000067F000080000008C00C0000044000-000000067F000080000008C00C0000048000__000000AAEBE534F8", +"000000067F000080000008C00C000004460B-000000067F000080000008C00C000004DD71__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000048000-000000067F000080000008C00C000004C000__000000AAEBE534F8", +"000000067F000080000008C00C000004C000-000000067F000080000008C00C0000050000__000000AAEBE534F8", +"000000067F000080000008C00C000004DD71-030000000000000000000000000000000002__000000A98AB7EE49-000000AA2597E9A1", +"000000067F000080000008C00C0000050000-000000067F000080000008C00C0000054000__000000AAEBE534F8", 
+"000000067F000080000008C00C0000054000-000000067F000080000008C00C0000058000__000000AAEBE534F8", +"000000067F000080000008C00C0000058000-000000067F000080000008C00C000005C000__000000AAEBE534F8", +"000000067F000080000008C00C000005C000-000000067F000080000008C00C0000060000__000000AAEBE534F8", +"000000067F000080000008C00C000005DA8C-000000067F000080000008C00C00000671AE__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000060000-000000067F000080000008C00C0000064000__000000AAEBE534F8", +"000000067F000080000008C00C0000064000-000000067F000080000008C00C0000068000__000000AAEBE534F8", +"000000067F000080000008C00C00000671AE-000000067F000080000008C00C000007090A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000068000-000000067F000080000008C00C000006C000__000000AAEBE534F8", +"000000067F000080000008C00C000006C000-000000067F000080000008C00C0000070000__000000AAEBE534F8", +"000000067F000080000008C00C0000070000-000000067F000080000008C00C0000074000__000000AAEBE534F8", +"000000067F000080000008C00C000007090A-000000067F000080000008C00C000007A070__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000074000-000000067F000080000008C00C0000078000__000000AAEBE534F8", +"000000067F000080000008C00C0000078000-000000067F000080000008C00C000007C000__000000AAEBE534F8", +"000000067F000080000008C00C000007A070-000000067F000080000008C00C00000837B4__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C000007C000-000000067F000080000008C00C0000080000__000000AAEBE534F8", +"000000067F000080000008C00C0000080000-000000067F000080000008C00C0000084000__000000AAEBE534F8", +"000000067F000080000008C00C00000837B4-000000067F000080000008C00C000008CF0A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000084000-000000067F000080000008C00C0000088000__000000AAEBE534F8", +"000000067F000080000008C00C0000088000-000000067F000080000008C00C000008C000__000000AAEBE534F8", +"000000067F000080000008C00C000008C000-000000067F000080000008C00C0000090000__000000AAEBE534F8", +"000000067F000080000008C00C000008CF0A-000000067F000080000008C00C0000096670__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000090000-000000067F000080000008C00C0000094000__000000AAEBE534F8", +"000000067F000080000008C00C0000094000-000000067F000080000008C00C0000098000__000000AAEBE534F8", +"000000067F000080000008C00C0000096670-000000067F000080000008C00C000009FDD6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000098000-000000067F000080000008C00C000009C000__000000AAEBE534F8", +"000000067F000080000008C00C000009C000-000000067F000080000008C00C00000A0000__000000AAEBE534F8", +"000000067F000080000008C00C000009FDD6-000000067F000080000008C00C00000A952A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000A0000-000000067F000080000008C00C00000A4000__000000AAEBE534F8", +"000000067F000080000008C00C00000A4000-000000067F000080000008C00C00000A8000__000000AAEBE534F8", +"000000067F000080000008C00C00000A8000-000000067F000080000008C00C00000AC000__000000AAEBE534F8", +"000000067F000080000008C00C00000A952A-000000067F000080000008C00C00000B2C90__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000AC000-000000067F000080000008C00C00000B0000__000000AAEBE534F8", +"000000067F000080000008C00C00000B0000-000000067F000080000008C00C00000B4000__000000AAEBE534F8", +"000000067F000080000008C00C00000B2C90-000000067F000080000008C00C00000BC3F6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000B4000-000000067F000080000008C00C00000B8000__000000AAEBE534F8", 
+"000000067F000080000008C00C00000B8000-000000067F000080000008C00C00000BC000__000000AAEBE534F8", +"000000067F000080000008C00C00000BC000-000000067F000080000008C00C00000C0000__000000AAEBE534F8", +"000000067F000080000008C00C00000BC3F6-000000067F000080000008C00C00000C5B0C__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000C0000-000000067F000080000008C00C00000C4000__000000AAEBE534F8", +"000000067F000080000008C00C00000C4000-000000067F000080000008C00C00000C8000__000000AAEBE534F8", +"000000067F000080000008C00C00000C5B0C-000000067F000080000008C00C00000CF272__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000C8000-030000000000000000000000000000000002__000000AAEBE534F8", +"000000067F000080000008C00C00000CF272-000000067F000080000008C00C00000D8986__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000D8986-000000067F000080000008C00C00000E20EC__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000E20EC-000000067F000080000008C00C00000EB80A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000EB80A-000000067F000080000008C00C00000F4F40__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000F4F40-000000067F000080000008C00C00000FE6A6__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C00000FE6A6-000000067F000080000008C00C0000107DC1__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C0000107DC1-000000067F000080000008C00C000011150A__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008C00C000011150A-01000000000000000100000004000000001C__000000AA2597E9A1-000000AB6533BFD9", +"000000067F000080000008E00C0000000000-000000067F000080000008E00C0000004000__000000AD3698E000", +"000000067F000080000008E00C0000004000-000000067F000080000008E00C0000008000__000000AD3698E000", +"000000067F000080000008E00C00000077B3-000000067F000080000008E00C0000010F0A__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000008000-000000067F000080000008E00C000000C000__000000AD3698E000", +"000000067F000080000008E00C000000C000-000000067F000080000008E00C0000010000__000000AD3698E000", +"000000067F000080000008E00C0000010000-000000067F000080000008E00C0000014000__000000AD3698E000", +"000000067F000080000008E00C0000010F0A-000000067F000080000008E00C000001A670__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000014000-000000067F000080000008E00C0000018000__000000AD3698E000", +"000000067F000080000008E00C0000018000-000000067F000080000008E00C000001C000__000000AD3698E000", +"000000067F000080000008E00C000001A670-000000067F000080000008E00C0000023DB1__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C000001C000-000000067F000080000008E00C0000020000__000000AD3698E000", +"000000067F000080000008E00C0000020000-000000067F000080000008E00C0000024000__000000AD3698E000", +"000000067F000080000008E00C0000023DB1-000000067F000080000008E00C000002D50A__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000024000-000000067F000080000008E00C0000028000__000000AD3698E000", +"000000067F000080000008E00C0000028000-000000067F000080000008E00C000002C000__000000AD3698E000", +"000000067F000080000008E00C000002C000-000000067F000080000008E00C0000030000__000000AD3698E000", +"000000067F000080000008E00C000002D50A-000000067F000080000008E00C0000036C30__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000030000-000000067F000080000008E00C0000034000__000000AD3698E000", +"000000067F000080000008E00C0000034000-000000067F000080000008E00C0000038000__000000AD3698E000", 
+"000000067F000080000008E00C0000036C30-000000067F000080000008E00C0000040393__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000038000-000000067F000080000008E00C000003C000__000000AD3698E000", +"000000067F000080000008E00C000003C000-000000067F000080000008E00C0000040000__000000AD3698E000", +"000000067F000080000008E00C0000040000-000000067F000080000008E00C0000044000__000000AD3698E000", +"000000067F000080000008E00C0000040393-000000067F000080000008E00C0000049AF9__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000044000-000000067F000080000008E00C0000048000__000000AD3698E000", +"000000067F000080000008E00C0000048000-000000067F000080000008E00C000004C000__000000AD3698E000", +"000000067F000080000008E00C0000049AF9-000000067F000080000008E00C000005320C__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C000004C000-000000067F000080000008E00C0000050000__000000AD3698E000", +"000000067F000080000008E00C0000050000-000000067F000080000008E00C0000054000__000000AD3698E000", +"000000067F000080000008E00C000005320C-030000000000000000000000000000000002__000000AB6533BFD9-000000ABF63DF511", +"000000067F000080000008E00C0000054000-000000067F000080000008E00C0000058000__000000AD34AF7FD8", +"000000067F000080000008E00C000005523E-000000067F000080000008E00C000005E9A4__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000058000-000000067F000080000008E00C000005C000__000000AD34AF7FD8", +"000000067F000080000008E00C000005C000-000000067F000080000008E00C0000060000__000000AD34AF7FD8", +"000000067F000080000008E00C000005E9A4-000000067F000080000008E00C000006810A__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000060000-000000067F000080000008E00C0000064000__000000AD34AF7FD8", +"000000067F000080000008E00C0000064000-000000067F000080000008E00C0000068000__000000AD34AF7FD8", +"000000067F000080000008E00C0000068000-000000067F000080000008E00C000006C000__000000AD34AF7FD8", +"000000067F000080000008E00C000006810A-000000067F000080000008E00C0000071870__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C000006C000-000000067F000080000008E00C0000070000__000000AD34AF7FD8", +"000000067F000080000008E00C0000070000-000000067F000080000008E00C0000074000__000000AD34AF7FD8", +"000000067F000080000008E00C0000071870-000000067F000080000008E00C000007AFD6__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000074000-000000067F000080000008E00C0000078000__000000AD34AF7FD8", +"000000067F000080000008E00C0000078000-000000067F000080000008E00C000007C000__000000AD34AF7FD8", +"000000067F000080000008E00C000007AFD6-000000067F000080000008E00C000008470B__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C000007C000-000000067F000080000008E00C0000080000__000000AD34AF7FD8", +"000000067F000080000008E00C0000080000-000000067F000080000008E00C0000084000__000000AD34AF7FD8", +"000000067F000080000008E00C0000084000-000000067F000080000008E00C0000088000__000000AD34AF7FD8", +"000000067F000080000008E00C000008470B-000000067F000080000008E00C000008DE71__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000088000-000000067F000080000008E00C000008C000__000000AD34AF7FD8", +"000000067F000080000008E00C000008C000-000000067F000080000008E00C0000090000__000000AD34AF7FD8", +"000000067F000080000008E00C000008DE71-000000067F000080000008E00C0000097591__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000090000-000000067F000080000008E00C0000094000__000000AD34AF7FD8", +"000000067F000080000008E00C0000094000-000000067F000080000008E00C0000098000__000000AD34AF7FD8", 
+"000000067F000080000008E00C0000097591-000000067F000080000008E00C00000A0CF7__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C0000098000-000000067F000080000008E00C000009C000__000000AD34AF7FD8", +"000000067F000080000008E00C000009C000-000000067F000080000008E00C00000A0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A0000-000000067F000080000008E00C00000A4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A0CF7-000000067F000080000008E00C00000AA40B__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000A4000-000000067F000080000008E00C00000A8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000A8000-000000067F000080000008E00C00000AC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000AA40B-000000067F000080000008E00C00000B3B4D__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000AC000-000000067F000080000008E00C00000B0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B0000-000000067F000080000008E00C00000B4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B3B4D-000000067F000080000008E00C0100000000__000000ABF63DF511-000000AC9601EA19", +"000000067F000080000008E00C00000B4000-000000067F000080000008E00C00000B8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000B8000-000000067F000080000008E00C00000BC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000BC000-000000067F000080000008E00C00000C0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000BC018-000000067F000080000008E00C00000C5749__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000C0000-000000067F000080000008E00C00000C4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000C4000-000000067F000080000008E00C00000C8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000C5749-000000067F000080000008E00C00000CEEAF__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000C8000-000000067F000080000008E00C00000CC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000CC000-000000067F000080000008E00C00000D0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000CEEAF-000000067F000080000008E00C00000D85C5__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000D0000-000000067F000080000008E00C00000D4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D4000-000000067F000080000008E00C00000D8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D8000-000000067F000080000008E00C00000DC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000D85C5-000000067F000080000008E00C00000E1D0B__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000DC000-000000067F000080000008E00C00000E0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E0000-000000067F000080000008E00C00000E4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E1D0B-000000067F000080000008E00C00000EB471__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000E4000-000000067F000080000008E00C00000E8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000E8000-000000067F000080000008E00C00000EC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000EB471-000000067F000080000008E00C00000F4BAA__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C00000EC000-000000067F000080000008E00C00000F0000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F0000-000000067F000080000008E00C00000F4000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F4000-000000067F000080000008E00C00000F8000__000000AD34AF7FD8", +"000000067F000080000008E00C00000F4BAA-000000067F000080000008E00C00000FE30A__000000AC9601EA19-000000AD36393FE9", 
+"000000067F000080000008E00C00000F8000-000000067F000080000008E00C00000FC000__000000AD34AF7FD8", +"000000067F000080000008E00C00000FC000-000000067F000080000008E00C0000100000__000000AD34AF7FD8", +"000000067F000080000008E00C00000FE30A-000000067F000080000008E00C0000107A2C__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C0000100000-000000067F000080000008E00C0000104000__000000AD34AF7FD8", +"000000067F000080000008E00C0000104000-000000067F000080000008E00C0000108000__000000AD34AF7FD8", +"000000067F000080000008E00C0000107A2C-000000067F000080000008E00C0000111187__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E00C0000108000-000000067F000080000008E00C000010C000__000000AD34AF7FD8", +"000000067F000080000008E00C000010C000-000000067F000080000008E00C0000110000__000000AD34AF7FD8", +"000000067F000080000008E00C0000110000-030000000000000000000000000000000002__000000AD34AF7FD8", +"000000067F000080000008E00C0000111187-01000000000000000100000004000000001C__000000AC9601EA19-000000AD36393FE9", +"000000067F000080000008E0140000003E33-000000067F000080000008E014000000BD44__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000000BD44-000000067F000080000008E0140000013C54__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0140000013C54-000000067F000080000008E014000001BB63__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000001BB63-000000067F000080000008E0140000023A74__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0140000023A74-000000067F000080000008E014000002B984__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E014000002B984-000000067F000080000008E0220000006AD0__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000000000-000000067F000080000008E0220000004000__000000AF5D7D4000", +"000000067F000080000008E0220000004000-000000067F000080000008E0220000008000__000000AF5D7D4000", +"000000067F000080000008E0220000006AD0-000000067F000080000008E022000001020C__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000008000-000000067F000080000008E022000000C000__000000AF5D7D4000", +"000000067F000080000008E022000000C000-000000067F000080000008E0220000010000__000000AF5D7D4000", +"000000067F000080000008E0220000010000-000000067F000080000008E0220000014000__000000AF5D7D4000", +"000000067F000080000008E022000001020C-01000000000000000100000004000000001C__000000AD36393FE9-000000ADB047EAB9", +"000000067F000080000008E0220000014000-000000067F000080000008E0220000018000__000000AF56604248", +"000000067F000080000008E02200000151DD-000000067F000080000008E022000001E90B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000018000-000000067F000080000008E022000001C000__000000AF56604248", +"000000067F000080000008E022000001C000-000000067F000080000008E0220000020000__000000AF56604248", +"000000067F000080000008E022000001E90B-000000067F000080000008E022000002802C__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000020000-000000067F000080000008E0220000024000__000000AF56604248", +"000000067F000080000008E0220000024000-000000067F000080000008E0220000028000__000000AF56604248", +"000000067F000080000008E0220000028000-000000067F000080000008E022000002C000__000000AF56604248", +"000000067F000080000008E022000002802C-000000067F000080000008E0220000031783__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000002C000-000000067F000080000008E0220000030000__000000AF56604248", +"000000067F000080000008E0220000030000-000000067F000080000008E0220000034000__000000AF56604248", 
+"000000067F000080000008E0220000031783-000000067F000080000008E022000003AEE9__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000034000-000000067F000080000008E0220000038000__000000AF56604248", +"000000067F000080000008E0220000038000-000000067F000080000008E022000003C000__000000AF56604248", +"000000067F000080000008E022000003AEE9-000000067F000080000008E022000004460B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000003C000-000000067F000080000008E0220000040000__000000AF56604248", +"000000067F000080000008E0220000040000-000000067F000080000008E0220000044000__000000AF56604248", +"000000067F000080000008E0220000044000-000000067F000080000008E0220000048000__000000AF56604248", +"000000067F000080000008E022000004460B-000000067F000080000008E022000004DD71__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000048000-000000067F000080000008E022000004C000__000000AF56604248", +"000000067F000080000008E022000004C000-000000067F000080000008E0220000050000__000000AF56604248", +"000000067F000080000008E022000004DD71-000000067F000080000008E02200000574D7__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000050000-000000067F000080000008E0220000054000__000000AF56604248", +"000000067F000080000008E0220000054000-000000067F000080000008E0220000058000__000000AF56604248", +"000000067F000080000008E02200000574D7-000000067F000080000008E0220000060C0B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000058000-000000067F000080000008E022000005C000__000000AF56604248", +"000000067F000080000008E022000005C000-000000067F000080000008E0220000060000__000000AF56604248", +"000000067F000080000008E0220000060000-000000067F000080000008E0220000064000__000000AF56604248", +"000000067F000080000008E0220000060C0B-000000067F000080000008E022000006A371__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000064000-000000067F000080000008E0220000068000__000000AF56604248", +"000000067F000080000008E0220000068000-000000067F000080000008E022000006C000__000000AF56604248", +"000000067F000080000008E022000006A371-000000067F000080000008E0220000073AD7__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E022000006C000-000000067F000080000008E0220000070000__000000AF56604248", +"000000067F000080000008E0220000070000-000000067F000080000008E0220000074000__000000AF56604248", +"000000067F000080000008E0220000073AD7-000000067F000080000008E022000007D20B__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000074000-000000067F000080000008E0220000078000__000000AF56604248", +"000000067F000080000008E0220000078000-000000067F000080000008E022000007C000__000000AF56604248", +"000000067F000080000008E022000007C000-000000067F000080000008E0220000080000__000000AF56604248", +"000000067F000080000008E022000007D20B-000000067F000080000008E0220000086932__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000080000-000000067F000080000008E0220000084000__000000AF56604248", +"000000067F000080000008E0220000084000-000000067F000080000008E0220000088000__000000AF56604248", +"000000067F000080000008E0220000086932-000000067F000080000008E0220100000000__000000ADB047EAB9-000000AE6FFFE799", +"000000067F000080000008E0220000088000-000000067F000080000008E022000008C000__000000AF56604248", +"000000067F000080000008E022000008C000-000000067F000080000008E0220000090000__000000AF56604248", +"000000067F000080000008E022000008E3D1-000000067F000080000008E022000009797E__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000090000-000000067F000080000008E0220000094000__000000AF56604248", 
+"000000067F000080000008E0220000094000-000000067F000080000008E0220000098000__000000AF56604248", +"000000067F000080000008E022000009797E-000000067F000080000008E02200000A10E4__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000098000-000000067F000080000008E022000009C000__000000AF56604248", +"000000067F000080000008E022000009C000-000000067F000080000008E02200000A0000__000000AF56604248", +"000000067F000080000008E02200000A0000-000000067F000080000008E02200000A4000__000000AF56604248", +"000000067F000080000008E02200000A10E4-000000067F000080000008E02200000AA80B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000A4000-000000067F000080000008E02200000A8000__000000AF56604248", +"000000067F000080000008E02200000A8000-000000067F000080000008E02200000AC000__000000AF56604248", +"000000067F000080000008E02200000AA80B-000000067F000080000008E02200000B3F4B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000AC000-000000067F000080000008E02200000B0000__000000AF56604248", +"000000067F000080000008E02200000B0000-000000067F000080000008E02200000B4000__000000AF56604248", +"000000067F000080000008E02200000B3F4B-000000067F000080000008E02200000BD6B1__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000B4000-000000067F000080000008E02200000B8000__000000AF56604248", +"000000067F000080000008E02200000B8000-000000067F000080000008E02200000BC000__000000AF56604248", +"000000067F000080000008E02200000BC000-000000067F000080000008E02200000C0000__000000AF56604248", +"000000067F000080000008E02200000BD6B1-000000067F000080000008E02200000C6DD5__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000C0000-000000067F000080000008E02200000C4000__000000AF56604248", +"000000067F000080000008E02200000C4000-000000067F000080000008E02200000C8000__000000AF56604248", +"000000067F000080000008E02200000C6DD5-000000067F000080000008E02200000D050B__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000C8000-000000067F000080000008E02200000CC000__000000AF56604248", +"000000067F000080000008E02200000CC000-000000067F000080000008E02200000D0000__000000AF56604248", +"000000067F000080000008E02200000D0000-000000067F000080000008E02200000D4000__000000AF56604248", +"000000067F000080000008E02200000D050B-000000067F000080000008E02200000D9C71__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000D4000-000000067F000080000008E02200000D8000__000000AF56604248", +"000000067F000080000008E02200000D8000-000000067F000080000008E02200000DC000__000000AF56604248", +"000000067F000080000008E02200000D9C71-000000067F000080000008E02200000E33B8__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000DC000-000000067F000080000008E02200000E0000__000000AF56604248", +"000000067F000080000008E02200000E0000-000000067F000080000008E02200000E4000__000000AF56604248", +"000000067F000080000008E02200000E33B8-000000067F000080000008E02200000ECB09__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000E4000-000000067F000080000008E02200000E8000__000000AF56604248", +"000000067F000080000008E02200000E8000-000000067F000080000008E02200000EC000__000000AF56604248", +"000000067F000080000008E02200000EC000-000000067F000080000008E02200000F0000__000000AF56604248", +"000000067F000080000008E02200000ECB09-000000067F000080000008E02200000F626F__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000F0000-000000067F000080000008E02200000F4000__000000AF56604248", +"000000067F000080000008E02200000F4000-000000067F000080000008E02200000F8000__000000AF56604248", 
+"000000067F000080000008E02200000F626F-000000067F000080000008E02200000FF9D5__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02200000F8000-000000067F000080000008E02200000FC000__000000AF56604248", +"000000067F000080000008E02200000FC000-000000067F000080000008E0220000100000__000000AF56604248", +"000000067F000080000008E02200000FF9D5-000000067F000080000008E022000010912A__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E0220000100000-000000067F000080000008E0220000104000__000000AF56604248", +"000000067F000080000008E0220000104000-000000067F000080000008E0220000108000__000000AF56604248", +"000000067F000080000008E0220000108000-000000067F000080000008E022000010C000__000000AF56604248", +"000000067F000080000008E022000010912A-000000067F000080000008E0220000111C20__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E022000010C000-030000000000000000000000000000000002__000000AF56604248", +"000000067F000080000008E02200FFFFFFFF-01000000000000000100000004000000001C__000000AE6FFFE799-000000AF5D587FE1", +"000000067F000080000008E02A000000529F-000000067F000080000008E02A000000D1B0__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000000D1B0-000000067F000080000008E02A00000150BF__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A00000150BF-000000067F000080000008E02A000001CFD0__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000001CFD0-000000067F000080000008E02A0000024EE1__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A0000024EE1-000000067F000080000008E02A000002CDF1__000000AF5D587FE1-000000AFB4666001", +"000000067F000080000008E02A000002CDF1-030000000000000000000000000000000002__000000AF5D587FE1-000000AFB4666001", +"000000067F00008000000900380000000000-000000067F0000800000090038000000970B__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000000970B-000000067F00008000000900380000012E71__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000012E71-000000067F0000800000090038000001C5D7__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000001C5D7-000000067F00008000000900380000025D2B__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000025D2B-000000067F0000800000090038000002F491__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000002F491-000000067F00008000000900380000038BA4__000000AFB4666001-000000B05429F579", +"000000067F00008000000900380000038BA4-000000067F0000800000090038000004230A__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000004230A-000000067F0000800000090038000004BA70__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000004BA70-000000067F000080000009003800000551A5__000000AFB4666001-000000B05429F579", +"000000067F000080000009003800000551A5-000000067F0000800000090038000005E909__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000005C000-000000067F00008000000900380000060000__000000B18434BFD0", +"000000067F0000800000090038000005E909-000000067F000080000009003B0100000000__000000AFB4666001-000000B05429F579", +"000000067F0000800000090038000005EA0C-000000067F00008000000900380000068125__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000060000-000000067F00008000000900380000064000__000000B18434BFD0", +"000000067F00008000000900380000064000-000000067F00008000000900380000068000__000000B18434BFD0", +"000000067F00008000000900380000068000-000000067F0000800000090038000006C000__000000B18434BFD0", 
+"000000067F00008000000900380000068125-000000067F0000800000090038000007188B__000000B05429F579-000000B0F3EDEAC9", +"000000067F0000800000090038000006C000-000000067F00008000000900380000070000__000000B18434BFD0", +"000000067F00008000000900380000070000-000000067F00008000000900380000074000__000000B18434BFD0", +"000000067F0000800000090038000007188B-000000067F0000800000090038000007AFF1__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000074000-000000067F00008000000900380000078000__000000B18434BFD0", +"000000067F00008000000900380000078000-000000067F0000800000090038000007C000__000000B18434BFD0", +"000000067F0000800000090038000007AFF1-000000067F0000800000090038000008470C__000000B05429F579-000000B0F3EDEAC9", +"000000067F0000800000090038000007C000-000000067F00008000000900380000080000__000000B18434BFD0", +"000000067F00008000000900380000080000-000000067F00008000000900380000084000__000000B18434BFD0", +"000000067F00008000000900380000084000-000000067F00008000000900380000088000__000000B18434BFD0", +"000000067F0000800000090038000008470C-000000067F0000800000090038000008DE72__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000088000-000000067F0000800000090038000008C000__000000B18434BFD0", +"000000067F0000800000090038000008C000-000000067F00008000000900380000090000__000000B18434BFD0", +"000000067F0000800000090038000008DE72-000000067F00008000000900380000097592__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000090000-000000067F00008000000900380000094000__000000B18434BFD0", +"000000067F00008000000900380000094000-000000067F00008000000900380000098000__000000B18434BFD0", +"000000067F00008000000900380000097592-000000067F000080000009003800000A0CF8__000000B05429F579-000000B0F3EDEAC9", +"000000067F00008000000900380000098000-000000067F0000800000090038000009C000__000000B18434BFD0", +"000000067F0000800000090038000009C000-000000067F000080000009003800000A0000__000000B18434BFD0", +"000000067F000080000009003800000A0000-000000067F000080000009003800000A4000__000000B18434BFD0", +"000000067F000080000009003800000A0CF8-000000067F000080000009003800000AA40C__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000A4000-000000067F000080000009003800000A8000__000000B18434BFD0", +"000000067F000080000009003800000A8000-000000067F000080000009003800000AC000__000000B18434BFD0", +"000000067F000080000009003800000AA40C-000000067F000080000009003800000B3B4E__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000AC000-000000067F000080000009003800000B0000__000000B18434BFD0", +"000000067F000080000009003800000B0000-000000067F000080000009003800000B4000__000000B18434BFD0", +"000000067F000080000009003800000B3B4E-000000067F000080000009003800000BD2B4__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000B4000-000000067F000080000009003800000B8000__000000B18434BFD0", +"000000067F000080000009003800000B8000-000000067F000080000009003800000BC000__000000B18434BFD0", +"000000067F000080000009003800000BC000-000000067F000080000009003800000C0000__000000B18434BFD0", +"000000067F000080000009003800000BD2B4-000000067F00008000000900380100000000__000000B05429F579-000000B0F3EDEAC9", +"000000067F000080000009003800000C0000-000000067F000080000009003800000C4000__000000B18434BFD0", +"000000067F000080000009003800000C4000-000000067F000080000009003800000C8000__000000B18434BFD0", +"000000067F000080000009003800000C5213-000000067F000080000009003800000CE979__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000C8000-000000067F000080000009003800000CC000__000000B18434BFD0", 
+"000000067F000080000009003800000CC000-000000067F000080000009003800000D0000__000000B18434BFD0", +"000000067F000080000009003800000CE979-000000067F000080000009003800000D80DF__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000D0000-000000067F000080000009003800000D4000__000000B18434BFD0", +"000000067F000080000009003800000D4000-000000067F000080000009003800000D8000__000000B18434BFD0", +"000000067F000080000009003800000D8000-000000067F000080000009003800000DC000__000000B18434BFD0", +"000000067F000080000009003800000D80DF-000000067F000080000009003800000E180A__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000DC000-000000067F000080000009003800000E0000__000000B18434BFD0", +"000000067F000080000009003800000E0000-000000067F000080000009003800000E4000__000000B18434BFD0", +"000000067F000080000009003800000E180A-000000067F000080000009003800000EAF70__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000E4000-000000067F000080000009003800000E8000__000000B18434BFD0", +"000000067F000080000009003800000E8000-000000067F000080000009003800000EC000__000000B18434BFD0", +"000000067F000080000009003800000EAF70-000000067F000080000009003800000F46D6__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000EC000-000000067F000080000009003800000F0000__000000B18434BFD0", +"000000067F000080000009003800000F0000-000000067F000080000009003800000F4000__000000B18434BFD0", +"000000067F000080000009003800000F4000-000000067F000080000009003800000F8000__000000B18434BFD0", +"000000067F000080000009003800000F46D6-000000067F000080000009003800000FDE0B__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009003800000F8000-000000067F000080000009003800000FC000__000000B18434BFD0", +"000000067F000080000009003800000FC000-000000067F00008000000900380000100000__000000B18434BFD0", +"000000067F000080000009003800000FDE0B-000000067F0000800000090038000010752B__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000900380000100000-000000067F00008000000900380000104000__000000B18434BFD0", +"000000067F00008000000900380000104000-000000067F00008000000900380000108000__000000B18434BFD0", +"000000067F0000800000090038000010752B-000000067F00008000000900380000110C91__000000B0F3EDEAC9-000000B18495C001", +"000000067F00008000000900380000108000-000000067F0000800000090038000010C000__000000B18434BFD0", +"000000067F0000800000090038000010C000-000000067F00008000000900380000110000__000000B18434BFD0", +"000000067F00008000000900380000110000-030000000000000000000000000000000002__000000B18434BFD0", +"000000067F00008000000900380000110C91-01000000000000000100000004000000001C__000000B0F3EDEAC9-000000B18495C001", +"000000067F000080000009004000000047E0-000000067F0000800000090040000000C6F1__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000000C6F1-000000067F00008000000900400000014600__000000B18495C001-000000B1FA75F501", +"000000067F00008000000900400000014600-000000067F0000800000090040000001C511__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000001C511-000000067F00008000000900400000024421__000000B18495C001-000000B1FA75F501", +"000000067F00008000000900400000024421-000000067F0000800000090040000002C331__000000B18495C001-000000B1FA75F501", +"000000067F0000800000090040000002C331-000000067F000080000009200C0000007658__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000000000-000000067F000080000009200C0000004000__000000B3AC039FE8", +"000000067F000080000009200C0000004000-000000067F000080000009200C0000008000__000000B3AC039FE8", 
+"000000067F000080000009200C0000007658-000000067F000080000009200C0000010DB5__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000008000-000000067F000080000009200C000000C000__000000B3AC039FE8", +"000000067F000080000009200C000000C000-000000067F000080000009200C0000010000__000000B3AC039FE8", +"000000067F000080000009200C0000010000-000000067F000080000009200C0000014000__000000B3A3EC82C8", +"000000067F000080000009200C0000010DB5-030000000000000000000000000000000002__000000B18495C001-000000B1FA75F501", +"000000067F000080000009200C0000012E97-000000067F000080000009200C000001C5FD__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000014000-000000067F000080000009200C0000018000__000000B3A3EC82C8", +"000000067F000080000009200C0000018000-000000067F000080000009200C000001C000__000000B3A3EC82C8", +"000000067F000080000009200C000001C000-000000067F000080000009200C0000020000__000000B3A3EC82C8", +"000000067F000080000009200C000001C5FD-000000067F000080000009200C0000025D0C__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000020000-000000067F000080000009200C0000024000__000000B3A3EC82C8", +"000000067F000080000009200C0000024000-000000067F000080000009200C0000028000__000000B3A3EC82C8", +"000000067F000080000009200C0000025D0C-000000067F000080000009200C000002F472__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000028000-000000067F000080000009200C000002C000__000000B3A3EC82C8", +"000000067F000080000009200C000002C000-000000067F000080000009200C0000030000__000000B3A3EC82C8", +"000000067F000080000009200C000002F472-000000067F000080000009200C0000038B85__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000030000-000000067F000080000009200C0000034000__000000B3A3EC82C8", +"000000067F000080000009200C0000034000-000000067F000080000009200C0000038000__000000B3A3EC82C8", +"000000067F000080000009200C0000038000-000000067F000080000009200C000003C000__000000B3A3EC82C8", +"000000067F000080000009200C0000038B85-000000067F000080000009200C00000422EB__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000003C000-000000067F000080000009200C0000040000__000000B3A3EC82C8", +"000000067F000080000009200C0000040000-000000067F000080000009200C0000044000__000000B3A3EC82C8", +"000000067F000080000009200C00000422EB-000000067F000080000009200C000004BA0C__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000044000-000000067F000080000009200C0000048000__000000B3A3EC82C8", +"000000067F000080000009200C0000048000-000000067F000080000009200C000004C000__000000B3A3EC82C8", +"000000067F000080000009200C000004BA0C-000000067F000080000009200C0000055141__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000004C000-000000067F000080000009200C0000050000__000000B3A3EC82C8", +"000000067F000080000009200C0000050000-000000067F000080000009200C0000054000__000000B3A3EC82C8", +"000000067F000080000009200C0000054000-000000067F000080000009200C0000058000__000000B3A3EC82C8", +"000000067F000080000009200C0000055141-000000067F000080000009200C000005E8A7__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000058000-000000067F000080000009200C000005C000__000000B3A3EC82C8", +"000000067F000080000009200C000005C000-000000067F000080000009200C0000060000__000000B3A3EC82C8", +"000000067F000080000009200C000005E8A7-000000067F000080000009200C0000067FC1__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000060000-000000067F000080000009200C0000064000__000000B3A3EC82C8", +"000000067F000080000009200C0000064000-000000067F000080000009200C0000068000__000000B3A3EC82C8", 
+"000000067F000080000009200C0000067FC1-000000067F000080000009200C0000071709__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000068000-000000067F000080000009200C000006C000__000000B3A3EC82C8", +"000000067F000080000009200C000006C000-000000067F000080000009200C0000070000__000000B3A3EC82C8", +"000000067F000080000009200C0000070000-000000067F000080000009200C0000074000__000000B3A3EC82C8", +"000000067F000080000009200C0000071709-000000067F000080000009200C000007AE6F__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000074000-000000067F000080000009200C0000078000__000000B3A3EC82C8", +"000000067F000080000009200C0000078000-000000067F000080000009200C000007C000__000000B3A3EC82C8", +"000000067F000080000009200C000007AE6F-000000067F000080000009200C00000845AB__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C000007C000-000000067F000080000009200C0000080000__000000B3A3EC82C8", +"000000067F000080000009200C0000080000-000000067F000080000009200C0000084000__000000B3A3EC82C8", +"000000067F000080000009200C0000084000-000000067F000080000009200C0000088000__000000B3A3EC82C8", +"000000067F000080000009200C00000845AB-000000067F000080000009200C000008DD09__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000088000-000000067F000080000009200C000008C000__000000B3A3EC82C8", +"000000067F000080000009200C000008C000-000000067F000080000009200C0000090000__000000B3A3EC82C8", +"000000067F000080000009200C000008DD09-000000067F000080000009200C0100000000__000000B1FA75F501-000000B2CA27F641", +"000000067F000080000009200C0000090000-000000067F000080000009200C0000094000__000000B3A3EC82C8", +"000000067F000080000009200C0000094000-000000067F000080000009200C0000098000__000000B3A3EC82C8", +"000000067F000080000009200C000009567A-000000067F000080000009200C000009EDE0__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000098000-000000067F000080000009200C000009C000__000000B3A3EC82C8", +"000000067F000080000009200C000009C000-000000067F000080000009200C00000A0000__000000B3A3EC82C8", +"000000067F000080000009200C000009EDE0-000000067F000080000009200C00000A852B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000A0000-000000067F000080000009200C00000A4000__000000B3A3EC82C8", +"000000067F000080000009200C00000A4000-000000067F000080000009200C00000A8000__000000B3A3EC82C8", +"000000067F000080000009200C00000A8000-000000067F000080000009200C00000AC000__000000B3A3EC82C8", +"000000067F000080000009200C00000A852B-000000067F000080000009200C00000B1C91__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000AC000-000000067F000080000009200C00000B0000__000000B3A3EC82C8", +"000000067F000080000009200C00000B0000-000000067F000080000009200C00000B4000__000000B3A3EC82C8", +"000000067F000080000009200C00000B1C91-000000067F000080000009200C00000BB3F7__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000B4000-000000067F000080000009200C00000B8000__000000B3A3EC82C8", +"000000067F000080000009200C00000B8000-000000067F000080000009200C00000BC000__000000B3A3EC82C8", +"000000067F000080000009200C00000BB3F7-000000067F000080000009200C00000C4B0C__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000BC000-000000067F000080000009200C00000C0000__000000B3A3EC82C8", +"000000067F000080000009200C00000C0000-000000067F000080000009200C00000C4000__000000B3A3EC82C8", +"000000067F000080000009200C00000C4000-000000067F000080000009200C00000C8000__000000B3A3EC82C8", +"000000067F000080000009200C00000C4B0C-000000067F000080000009200C00000CE272__000000B2CA27F641-000000B3AB3B7FC9", 
+"000000067F000080000009200C00000C8000-000000067F000080000009200C00000CC000__000000B3A3EC82C8", +"000000067F000080000009200C00000CC000-000000067F000080000009200C00000D0000__000000B3A3EC82C8", +"000000067F000080000009200C00000CE272-000000067F000080000009200C00000D798F__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000D0000-000000067F000080000009200C00000D4000__000000B3A3EC82C8", +"000000067F000080000009200C00000D4000-000000067F000080000009200C00000D8000__000000B3A3EC82C8", +"000000067F000080000009200C00000D798F-000000067F000080000009200C00000E10F5__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000D8000-000000067F000080000009200C00000DC000__000000B3A3EC82C8", +"000000067F000080000009200C00000DC000-000000067F000080000009200C00000E0000__000000B3A3EC82C8", +"000000067F000080000009200C00000E0000-000000067F000080000009200C00000E4000__000000B3A3EC82C8", +"000000067F000080000009200C00000E10F5-000000067F000080000009200C00000EA80B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000E4000-000000067F000080000009200C00000E8000__000000B3A3EC82C8", +"000000067F000080000009200C00000E8000-000000067F000080000009200C00000EC000__000000B3A3EC82C8", +"000000067F000080000009200C00000EA80B-000000067F000080000009200C00000F3F4B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000EC000-000000067F000080000009200C00000F0000__000000B3A3EC82C8", +"000000067F000080000009200C00000F0000-000000067F000080000009200C00000F4000__000000B3A3EC82C8", +"000000067F000080000009200C00000F3F4B-000000067F000080000009200C00000FD6B1__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C00000F4000-000000067F000080000009200C00000F8000__000000B3A3EC82C8", +"000000067F000080000009200C00000F8000-000000067F000080000009200C00000FC000__000000B3A3EC82C8", +"000000067F000080000009200C00000FC000-000000067F000080000009200C0000100000__000000B3A3EC82C8", +"000000067F000080000009200C00000FD6B1-000000067F000080000009200C0000106DD5__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000100000-000000067F000080000009200C0000104000__000000B3A3EC82C8", +"000000067F000080000009200C0000104000-000000067F000080000009200C0000108000__000000B3A3EC82C8", +"000000067F000080000009200C0000106DD5-000000067F000080000009200C000011050B__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F000080000009200C0000108000-000000067F000080000009200C000010C000__000000B3A3EC82C8", +"000000067F000080000009200C000010C000-030000000000000000000000000000000002__000000B3A3EC82C8", +"000000067F000080000009200C000011050B-01000000000000000100000004000000001C__000000B2CA27F641-000000B3AB3B7FC9", +"000000067F00008000000920140000005289-000000067F0000800000092014000000D19A__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000000D19A-000000067F000080000009201400000150A9__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009201400000150A9-000000067F0000800000092014000001CFBA__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000001CFBA-000000067F00008000000920140000024ECB__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F00008000000920140000024ECB-000000067F0000800000092014000002CDDB__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F0000800000092014000002CDDB-000000067F000080000009400C000000830C__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C0000000000-000000067F000080000009400C0000004000__000000B5CED8CF78", +"000000067F000080000009400C0000004000-000000067F000080000009400C0000008000__000000B5CED8CF78", 
+"000000067F000080000009400C0000008000-000000067F000080000009400C000000C000__000000B5CED8CF78", +"000000067F000080000009400C000000830C-000000067F000080000009400C0000011A72__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C000000C000-000000067F000080000009400C0000010000__000000B5CED8CF78", +"000000067F000080000009400C0000010000-000000067F000080000009400C0000014000__000000B568835548", +"000000067F000080000009400C0000011A72-030000000000000000000000000000000002__000000B3AB3B7FC9-000000B4208FF3D1", +"000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B43089EC11", +"000000067F000080000009400C0000012E51-000000067F000080000009400C000001C5B7__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000014000-000000067F000080000009400C0000018000__000000B568835548", +"000000067F000080000009400C0000018000-000000067F000080000009400C000001C000__000000B568835548", +"000000067F000080000009400C000001C000-000000067F000080000009400C0000020000__000000B568835548", +"000000067F000080000009400C000001C5B7-000000067F000080000009400C0000025D1D__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000001C5B7-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B43089EC11", +"000000067F000080000009400C0000020000-000000067F000080000009400C0000024000__000000B568835548", +"000000067F000080000009400C0000024000-000000067F000080000009400C0000028000__000000B568835548", +"000000067F000080000009400C0000025D1D-000000067F000080000009400C000002F483__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000028000-000000067F000080000009400C000002C000__000000B568835548", +"000000067F000080000009400C000002C000-000000067F000080000009400C0000030000__000000B568835548", +"000000067F000080000009400C000002F483-000000067F000080000009400C0000038B96__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000030000-000000067F000080000009400C0000034000__000000B568835548", +"000000067F000080000009400C0000034000-000000067F000080000009400C0000038000__000000B568835548", +"000000067F000080000009400C0000038000-000000067F000080000009400C000003C000__000000B568835548", +"000000067F000080000009400C0000038B96-000000067F000080000009400C00000422FC__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000003C000-000000067F000080000009400C0000040000__000000B568835548", +"000000067F000080000009400C0000040000-000000067F000080000009400C0000044000__000000B568835548", +"000000067F000080000009400C00000422FC-000000067F000080000009400C000004BA0C__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000044000-000000067F000080000009400C0000048000__000000B568835548", +"000000067F000080000009400C0000048000-000000067F000080000009400C000004C000__000000B568835548", +"000000067F000080000009400C000004BA0C-000000067F000080000009400C0000055141__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000004C000-000000067F000080000009400C0000050000__000000B568835548", +"000000067F000080000009400C0000050000-000000067F000080000009400C0000054000__000000B568835548", +"000000067F000080000009400C0000054000-000000067F000080000009400C0000058000__000000B568835548", +"000000067F000080000009400C0000055141-000000067F000080000009400C000005E8A7__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000058000-000000067F000080000009400C000005C000__000000B568835548", +"000000067F000080000009400C000005C000-000000067F000080000009400C0000060000__000000B568835548", 
+"000000067F000080000009400C000005E8A7-000000067F000080000009400C0000067FC1__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000060000-000000067F000080000009400C0000064000__000000B568835548", +"000000067F000080000009400C0000064000-000000067F000080000009400C0000068000__000000B568835548", +"000000067F000080000009400C0000067FC1-000000067F000080000009400C0000071709__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000068000-000000067F000080000009400C000006C000__000000B568835548", +"000000067F000080000009400C000006C000-000000067F000080000009400C0000070000__000000B568835548", +"000000067F000080000009400C0000070000-000000067F000080000009400C0000074000__000000B568835548", +"000000067F000080000009400C0000071709-000000067F000080000009400C000007AE6F__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000074000-000000067F000080000009400C0000078000__000000B568835548", +"000000067F000080000009400C0000078000-000000067F000080000009400C000007C000__000000B568835548", +"000000067F000080000009400C000007AE6F-000000067F000080000009400C00000845AB__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C000007C000-000000067F000080000009400C0000080000__000000B568835548", +"000000067F000080000009400C0000080000-000000067F000080000009400C0000084000__000000B568835548", +"000000067F000080000009400C0000084000-000000067F000080000009400C0000088000__000000B568835548", +"000000067F000080000009400C00000845AB-000000067F000080000009400C0100000000__000000B4208FF3D1-000000B4E047E5A9", +"000000067F000080000009400C0000088000-000000067F000080000009400C000008C000__000000B568835548", +"000000067F000080000009400C000008C000-000000067F000080000009400C0000090000__000000B568835548", +"000000067F000080000009400C000008DEA4-000000067F000080000009400C00000975C4__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000090000-000000067F000080000009400C0000094000__000000B568835548", +"000000067F000080000009400C0000094000-000000067F000080000009400C0000098000__000000B568835548", +"000000067F000080000009400C00000975C4-000000067F000080000009400C00000A0D0A__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000098000-000000067F000080000009400C000009C000__000000B568835548", +"000000067F000080000009400C000009C000-000000067F000080000009400C00000A0000__000000B568835548", +"000000067F000080000009400C00000A0000-000000067F000080000009400C00000A4000__000000B568835548", +"000000067F000080000009400C00000A0D0A-000000067F000080000009400C00000AA470__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000A4000-000000067F000080000009400C00000A8000__000000B568835548", +"000000067F000080000009400C00000A8000-000000067F000080000009400C00000AC000__000000B568835548", +"000000067F000080000009400C00000AA470-000000067F000080000009400C00000B3BB2__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000AC000-000000067F000080000009400C00000B0000__000000B568835548", +"000000067F000080000009400C00000B0000-000000067F000080000009400C00000B4000__000000B568835548", +"000000067F000080000009400C00000B3BB2-000000067F000080000009400C00000BD30A__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000B4000-000000067F000080000009400C00000B8000__000000B568835548", +"000000067F000080000009400C00000B8000-000000067F000080000009400C00000BC000__000000B568835548", +"000000067F000080000009400C00000BC000-000000067F000080000009400C00000C0000__000000B568835548", +"000000067F000080000009400C00000BD30A-000000067F000080000009400C00000C6A30__000000B4E047E5A9-000000B5CED8CF79", 
+"000000067F000080000009400C00000C0000-000000067F000080000009400C00000C4000__000000B568835548", +"000000067F000080000009400C00000C4000-000000067F000080000009400C00000C8000__000000B568835548", +"000000067F000080000009400C00000C6A30-000000067F000080000009400C00000D0194__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000C8000-000000067F000080000009400C00000CC000__000000B568835548", +"000000067F000080000009400C00000CC000-000000067F000080000009400C00000D0000__000000B568835548", +"000000067F000080000009400C00000D0000-000000067F000080000009400C00000D4000__000000B568835548", +"000000067F000080000009400C00000D0194-000000067F000080000009400C00000D98FA__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000D4000-030000000000000000000000000000000002__000000B568835548", +"000000067F000080000009400C00000D98FA-000000067F000080000009400C00000E300D__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000E300D-000000067F000080000009400C00000EC773__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000EC773-000000067F000080000009400C00000F5ED9__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000F5ED9-000000067F000080000009400C00000FF60C__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00000FF60C-000000067F000080000009400C0000108D1D__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C0000108D1D-000000067F000080000009400C0000111C20__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009400C00FFFFFFFF-030000000000000000000000000000000002__000000B4E047E5A9-000000B5CED8CF79", +"000000067F000080000009600C0000000000-000000067F000080000009600C0000004000__000000B79F439FE0", +"000000067F000080000009600C0000004000-000000067F000080000009600C0000008000__000000B79F439FE0", +"000000067F000080000009600C0000008000-000000067F000080000009600C000000C000__000000B79F439FE0", +"000000067F000080000009600C000000974F-000000067F000080000009600C0000012EB5__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000000C000-000000067F000080000009600C0000010000__000000B79F439FE0", +"000000067F000080000009600C0000010000-000000067F000080000009600C0000014000__000000B79F439FE0", +"000000067F000080000009600C0000012EB5-000000067F000080000009600C000001C60A__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000014000-000000067F000080000009600C0000018000__000000B79F439FE0", +"000000067F000080000009600C0000018000-000000067F000080000009600C000001C000__000000B79F439FE0", +"000000067F000080000009600C000001C000-000000067F000080000009600C0000020000__000000B79F439FE0", +"000000067F000080000009600C000001C60A-000000067F000080000009600C0000025D38__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000020000-000000067F000080000009600C0000024000__000000B79F439FE0", +"000000067F000080000009600C0000024000-000000067F000080000009600C0000028000__000000B79F439FE0", +"000000067F000080000009600C0000025D38-000000067F000080000009600C000002F49E__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000028000-000000067F000080000009600C000002C000__000000B79F439FE0", +"000000067F000080000009600C000002C000-000000067F000080000009600C0000030000__000000B79F439FE0", +"000000067F000080000009600C000002F49E-000000067F000080000009600C0000038BB1__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C0000030000-000000067F000080000009600C0000034000__000000B79F439FE0", +"000000067F000080000009600C0000034000-000000067F000080000009600C0000038000__000000B79F439FE0", 
+"000000067F000080000009600C0000038000-000000067F000080000009600C000003C000__000000B79F439FE0", +"000000067F000080000009600C0000038BB1-000000067F000080000009600C0000042317__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000003C000-000000067F000080000009600C0000040000__000000B79F439FE0", +"000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B79D17BFD0", +"000000067F000080000009600C0000040000-000000067F000080000009600C0000044000__000000B8606C92A0", +"000000067F000080000009600C0000042317-030000000000000000000000000000000002__000000B5CED8CF79-000000B63EADE5B9", +"000000067F000080000009600C000004236E-000000067F000080000009600C000004BAD4__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B79D17BFD0", +"000000067F000080000009600C0000044000-000000067F000080000009600C0000048000__000000B8606C92A0", +"000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B79D17BFD0", +"000000067F000080000009600C0000048000-000000067F000080000009600C000004C000__000000B8606C92A0", +"000000067F000080000009600C000004BAD4-000000067F000080000009600C0000055208__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B79D17BFD0", +"000000067F000080000009600C000004C000-000000067F000080000009600C0000050000__000000B8606C92A0", +"000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B79D17BFD0", +"000000067F000080000009600C0000050000-000000067F000080000009600C0000054000__000000B8606C92A0", +"000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B79D17BFD0", +"000000067F000080000009600C0000054000-000000067F000080000009600C0000058000__000000B8606C92A0", +"000000067F000080000009600C0000055208-000000067F000080000009600C000005E96E__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000055A77-000000067F000080000009600C00000AAEA5__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B79D17BFD0", +"000000067F000080000009600C0000058000-000000067F000080000009600C000005C000__000000B8606C92A0", +"000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B79D17BFD0", +"000000067F000080000009600C000005C000-000000067F000080000009600C0000060000__000000B8606C92A0", +"000000067F000080000009600C000005E96E-000000067F000080000009600C00000680D4__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B79D17BFD0", +"000000067F000080000009600C0000060000-000000067F000080000009600C0000064000__000000B8606C92A0", +"000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B79D17BFD0", +"000000067F000080000009600C0000064000-000000067F000080000009600C0000068000__000000B8606C92A0", +"000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B79D17BFD0", +"000000067F000080000009600C0000068000-000000067F000080000009600C000006C000__000000B8606C92A0", +"000000067F000080000009600C00000680D4-000000067F000080000009600C000007180B__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B79D17BFD0", +"000000067F000080000009600C000006C000-000000067F000080000009600C0000070000__000000B8606C92A0", +"000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B79D17BFD0", 
+"000000067F000080000009600C0000070000-000000067F000080000009600C0000074000__000000B8606C92A0", +"000000067F000080000009600C000007180B-000000067F000080000009600C000007AF71__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B79D17BFD0", +"000000067F000080000009600C0000074000-000000067F000080000009600C0000078000__000000B8606C92A0", +"000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B79D17BFD0", +"000000067F000080000009600C0000078000-000000067F000080000009600C000007C000__000000B8606C92A0", +"000000067F000080000009600C000007AF71-000000067F000080000009600C00000846D7__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B79D17BFD0", +"000000067F000080000009600C000007C000-000000067F000080000009600C0000080000__000000B8606C92A0", +"000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B79D17BFD0", +"000000067F000080000009600C0000080000-000000067F000080000009600C0000084000__000000B8606C92A0", +"000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B79D17BFD0", +"000000067F000080000009600C0000084000-000000067F000080000009600C0000088000__000000B8606C92A0", +"000000067F000080000009600C00000846D7-000000067F000080000009600C000008DE0C__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B79D17BFD0", +"000000067F000080000009600C0000088000-000000067F000080000009600C000008C000__000000B8606C92A0", +"000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B79D17BFD0", +"000000067F000080000009600C000008C000-000000067F000080000009600C0000090000__000000B8606C92A0", +"000000067F000080000009600C000008DE0C-000000067F000080000009600C000009752C__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B79D17BFD0", +"000000067F000080000009600C0000090000-000000067F000080000009600C0000094000__000000B8606C92A0", +"000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B79D17BFD0", +"000000067F000080000009600C0000094000-000000067F000080000009600C0000098000__000000B8606C92A0", +"000000067F000080000009600C000009752C-000000067F000080000009600C00000A0C92__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B79D17BFD0", +"000000067F000080000009600C0000098000-000000067F000080000009600C000009C000__000000B8606C92A0", +"000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B79D17BFD0", +"000000067F000080000009600C000009C000-000000067F000080000009600C00000A0000__000000B8606C92A0", +"000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B79D17BFD0", +"000000067F000080000009600C00000A0000-000000067F000080000009600C00000A4000__000000B8606C92A0", +"000000067F000080000009600C00000A0C92-000000067F000080000009600C0100000000__000000B63EADE5B9-000000B6DE71F5F9", +"000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B79D17BFD0", +"000000067F000080000009600C00000A4000-000000067F000080000009600C00000A8000__000000B8606C92A0", +"000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B79D17BFD0", +"000000067F000080000009600C00000A8000-000000067F000080000009600C00000AC000__000000B8606C92A0", 
+"000000067F000080000009600C00000A93FD-000000067F000080000009600C00000B2B0C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000AAEA5-000000067F000080000009600C0000101445__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B79D17BFD0", +"000000067F000080000009600C00000AC000-000000067F000080000009600C00000B0000__000000B8606C92A0", +"000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B79D17BFD0", +"000000067F000080000009600C00000B0000-000000067F000080000009600C00000B4000__000000B8606C92A0", +"000000067F000080000009600C00000B2B0C-000000067F000080000009600C00000BC272__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B79D17BFD0", +"000000067F000080000009600C00000B4000-000000067F000080000009600C00000B8000__000000B8606C92A0", +"000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B79D17BFD0", +"000000067F000080000009600C00000B8000-000000067F000080000009600C00000BC000__000000B8606C92A0", +"000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B79D17BFD0", +"000000067F000080000009600C00000BC000-000000067F000080000009600C00000C0000__000000B8606C92A0", +"000000067F000080000009600C00000BC272-000000067F000080000009600C00000C59A2__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B79D17BFD0", +"000000067F000080000009600C00000C0000-000000067F000080000009600C00000C4000__000000B8606C92A0", +"000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B79D17BFD0", +"000000067F000080000009600C00000C4000-000000067F000080000009600C00000C8000__000000B8606C92A0", +"000000067F000080000009600C00000C59A2-000000067F000080000009600C00000CF108__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B79D17BFD0", +"000000067F000080000009600C00000C8000-000000067F000080000009600C00000CC000__000000B8606C92A0", +"000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B79D17BFD0", +"000000067F000080000009600C00000CC000-000000067F000080000009600C00000D0000__000000B8606C92A0", +"000000067F000080000009600C00000CF108-000000067F000080000009600C00000D882B__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B79D17BFD0", +"000000067F000080000009600C00000D0000-000000067F000080000009600C00000D4000__000000B8606C92A0", +"000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B79D17BFD0", +"000000067F000080000009600C00000D4000-000000067F000080000009600C00000D8000__000000B8606C92A0", +"000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B79D17BFD0", +"000000067F000080000009600C00000D8000-000000067F000080000009600C00000DC000__000000B8606C92A0", +"000000067F000080000009600C00000D882B-000000067F000080000009600C00000E1F7E__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B79D17BFD0", +"000000067F000080000009600C00000DC000-000000067F000080000009600C00000E0000__000000B8606C92A0", +"000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B79D17BFD0", +"000000067F000080000009600C00000E0000-000000067F000080000009600C00000E4000__000000B8606C92A0", 
+"000000067F000080000009600C00000E1F7E-000000067F000080000009600C00000EB6E4__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B79D17BFD0", +"000000067F000080000009600C00000E4000-000000067F000080000009600C00000E8000__000000B8606C92A0", +"000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B79D17BFD0", +"000000067F000080000009600C00000E8000-000000067F000080000009600C00000EC000__000000B8606C92A0", +"000000067F000080000009600C00000EB6E4-000000067F000080000009600C00000F4E0B__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B79D17BFD0", +"000000067F000080000009600C00000EC000-000000067F000080000009600C00000F0000__000000B8606C92A0", +"000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B79D17BFD0", +"000000067F000080000009600C00000F0000-000000067F000080000009600C00000F4000__000000B8606C92A0", +"000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B79D17BFD0", +"000000067F000080000009600C00000F4000-000000067F000080000009600C00000F8000__000000B8606C92A0", +"000000067F000080000009600C00000F4E0B-000000067F000080000009600C00000FE571__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B79D17BFD0", +"000000067F000080000009600C00000F8000-000000067F000080000009600C00000FC000__000000B8606C92A0", +"000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B79D17BFD0", +"000000067F000080000009600C00000FC000-000000067F000080000009600C0000100000__000000B8606C92A0", +"000000067F000080000009600C00000FE571-000000067F000080000009600C0000107CD7__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B79D17BFD0", +"000000067F000080000009600C0000100000-000000067F000080000009600C0000104000__000000B8606C92A0", +"000000067F000080000009600C000010144D-000000067F0000800000096014000000E7D9__000000B808718889-000000B8606C92A1", +"000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B79D17BFD0", +"000000067F000080000009600C0000104000-000000067F000080000009600C0000108000__000000B8606C92A0", +"000000067F000080000009600C0000107CD7-000000067F000080000009600C000011140C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B79D17BFD0", +"000000067F000080000009600C0000108000-000000067F000080000009600C000010C000__000000B8606C92A0", +"000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B79D17BFD0", +"000000067F000080000009600C000010C000-000000067F000080000009600C0000110000__000000B8606C92A0", +"000000067F000080000009600C0000110000-000000067F00008000000960120100000000__000000B8606C92A0", +"000000067F000080000009600C0000110000-030000000000000000000000000000000002__000000B79D17BFD0", +"000000067F000080000009600C000011140C-01000000000000000100000004000000001C__000000B6DE71F5F9-000000B79E68FFF9", +"000000067F000080000009600C020000000B-000000067F0000800000096014000000571F__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000000000-000000067F00008000000960140000004000__000000B8606C92A0", +"000000067F00008000000960140000004000-000000067F00008000000960140000008000__000000B8606C92A0", +"000000067F0000800000096014000000571F-000000067F0000800000096014000000CB61__000000B79E68FFF9-000000B808718889", 
+"000000067F00008000000960140000008000-000000067F0000800000096014000000C000__000000B8606C92A0", +"000000067F0000800000096014000000C000-000000067F00008000000960140000010000__000000B8606C92A0", +"000000067F0000800000096014000000CB61-000000067F00008000000960140000013F98__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000000E7DB-000000067F00008000000960140000022A8D__000000B808718889-000000B8606C92A1", +"000000067F00008000000960140000010000-000000067F00008000000960140000014000__000000B8606C92A0", +"000000067F00008000000960140000013F98-000000067F0000800000096014000001B3C2__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000014000-000000067F00008000000960140000018000__000000B8606C92A0", +"000000067F00008000000960140000018000-000000067F0000800000096014000001C000__000000B8606C92A0", +"000000067F0000800000096014000001B3C2-000000067F000080000009601400000227FC__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000001C000-000000067F00008000000960140000020000__000000B8606C92A0", +"000000067F00008000000960140000020000-000000067F00008000000960140000024000__000000B8606C92A0", +"000000067F000080000009601400000227FC-000000067F00008000000960140000029BD8__000000B79E68FFF9-000000B808718889", +"000000067F00008000000960140000022A8D-030000000000000000000000000000000002__000000B808718889-000000B8606C92A1", +"000000067F00008000000960140000024000-000000067F00008000000960140000028000__000000B8606C92A0", +"000000067F00008000000960140000028000-000000067F0000800000096014000002C000__000000B8606C92A0", +"000000067F00008000000960140000029BD8-030000000000000000000000000000000002__000000B79E68FFF9-000000B808718889", +"000000067F0000800000096014000002C000-030000000000000000000000000000000002__000000B8606C92A0", +"000000067F000080000009800C0000009748-000000067F000080000009800C0000012EAE__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000012EAE-000000067F000080000009800C000001C60A__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000001C60A-000000067F000080000009800C0000025D38__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000025D38-000000067F000080000009800C000002F49E__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000002F49E-000000067F000080000009800C0000038BB1__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000038BB1-000000067F000080000009800C0000042317__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C0000042317-000000067F000080000009800C000004BA7D__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000004BA7D-030000000000000000000000000000000002__000000B8606C92A1-000000B8E03BF0B9", +"000000067F000080000009800C000004BAD2-000000067F000080000009800C0000055206__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C0000055206-000000067F000080000009800C000005E911__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000005E911-000000067F000080000009800C000006802B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000006802B-000000067F000080000009800C0000071782__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C0000071782-000000067F000080000009800C000007AEE8__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000007AEE8-000000067F000080000009800C000008460B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C000008460B-000000067F000080000009800C000008DD71__000000B8E03BF0B9-000000B97FFFFFE9", 
+"000000067F000080000009800C000008DD71-000000067F000080000009800C00000974D7__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000974D7-000000067F000080000009800C00000A0C0B__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000A0C0B-000000067F000080000009800C00000AA371__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000A8000-000000067F000080000009800C00000AC000__000000BA2E67EA20", +"000000067F000080000009800C00000AA371-000000067F000080000009800C0100000000__000000B8E03BF0B9-000000B97FFFFFE9", +"000000067F000080000009800C00000AA4F5-000000067F000080000009800C00000B3C0B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000AC000-000000067F000080000009800C00000B0000__000000BA2E67EA20", +"000000067F000080000009800C00000B0000-000000067F000080000009800C00000B4000__000000BA2E67EA20", +"000000067F000080000009800C00000B3C0B-000000067F000080000009800C00000BD371__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000B4000-000000067F000080000009800C00000B8000__000000BA2E67EA20", +"000000067F000080000009800C00000B8000-000000067F000080000009800C00000BC000__000000BA2E67EA20", +"000000067F000080000009800C00000BC000-000000067F000080000009800C00000C0000__000000BA2E67EA20", +"000000067F000080000009800C00000BD371-000000067F000080000009800C00000C6AD7__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000C0000-000000067F000080000009800C00000C4000__000000BA2E67EA20", +"000000067F000080000009800C00000C4000-000000067F000080000009800C00000C8000__000000BA2E67EA20", +"000000067F000080000009800C00000C6AD7-000000067F000080000009800C00000D020B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000C8000-000000067F000080000009800C00000CC000__000000BA2E67EA20", +"000000067F000080000009800C00000CC000-000000067F000080000009800C00000D0000__000000BA2E67EA20", +"000000067F000080000009800C00000D0000-000000067F000080000009800C00000D4000__000000BA2E67EA20", +"000000067F000080000009800C00000D020B-000000067F000080000009800C00000D9971__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000D4000-000000067F000080000009800C00000D8000__000000BA2E67EA20", +"000000067F000080000009800C00000D8000-000000067F000080000009800C00000DC000__000000BA2E67EA20", +"000000067F000080000009800C00000D9971-000000067F000080000009800C00000E30D7__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000DC000-000000067F000080000009800C00000E0000__000000BA2E67EA20", +"000000067F000080000009800C00000E0000-000000067F000080000009800C00000E4000__000000BA2E67EA20", +"000000067F000080000009800C00000E30D7-000000067F000080000009800C00000EC80B__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000E4000-000000067F000080000009800C00000E8000__000000BA2E67EA20", +"000000067F000080000009800C00000E8000-000000067F000080000009800C00000EC000__000000BA2E67EA20", +"000000067F000080000009800C00000EC000-000000067F000080000009800C00000F0000__000000BA2E67EA20", +"000000067F000080000009800C00000EC80B-000000067F000080000009800C00000F5F38__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000F0000-000000067F000080000009800C00000F4000__000000BA2E67EA20", +"000000067F000080000009800C00000F4000-000000067F000080000009800C00000F8000__000000BA2E67EA20", +"000000067F000080000009800C00000F5F38-000000067F000080000009800C00000FF69E__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C00000F8000-000000067F000080000009800C00000FC000__000000BA2E67EA20", 
+"000000067F000080000009800C00000FC000-000000067F000080000009800C0000100000__000000BA2E67EA20", +"000000067F000080000009800C00000FF69E-000000067F000080000009800C0000108DAF__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C0000100000-000000067F000080000009800C0000104000__000000BA2E67EA20", +"000000067F000080000009800C0000104000-000000067F000080000009800C0000108000__000000BA2E67EA20", +"000000067F000080000009800C0000108000-000000067F000080000009800C000010C000__000000BA2E67EA20", +"000000067F000080000009800C0000108DAF-000000067F000080000009800F0100000003__000000B97FFFFFE9-000000BA1FC3FB39", +"000000067F000080000009800C000010C000-000000067F000080000009800C0000110000__000000BA2E67EA20", +"000000067F000080000009800C000010EC71-000000067F000080000009801400000025C3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009800C0000110000-030000000000000000000000000000000002__000000BA2E67EA20", +"000000067F000080000009801400000025C3-000000067F0000800000098014000000A4D3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000000A4D3-000000067F000080000009801400000123E4__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009801400000123E4-000000067F0000800000098014000001A2F3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000001A2F3-000000067F00008000000980140000022204__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F00008000000980140000022204-000000067F0000800000098014000002A114__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F0000800000098014000002A114-000000067F000080000009A00C0000004DB3__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009A00C0000000000-000000067F000080000009A00C0000004000__000000BCEF79BE90", +"000000067F000080000009A00C0000004000-000000067F000080000009A00C0000008000__000000BCEF79BE90", +"000000067F000080000009A00C0000004DB3-030000000000000000000000000000000002__000000BA1FC3FB39-000000BA9685E7C1", +"000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BC59629F98", +"000000067F000080000009A00C0000008000-000000067F000080000009A00C000000C000__000000BD25E66810", +"000000067F000080000009A00C00000096E8-000000067F000080000009A00C0000012E0B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BC59629F98", +"000000067F000080000009A00C000000C000-000000067F000080000009A00C0000010000__000000BD25E66810", +"000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BC59629F98", +"000000067F000080000009A00C0000010000-000000067F000080000009A00C0000014000__000000BD25E66810", +"000000067F000080000009A00C0000012E0B-000000067F000080000009A00C000001C571__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BC59629F98", +"000000067F000080000009A00C0000014000-000000067F000080000009A00C0000018000__000000BD25E66810", +"000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BC59629F98", +"000000067F000080000009A00C0000018000-000000067F000080000009A00C000001C000__000000BD25E66810", +"000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BC59629F98", +"000000067F000080000009A00C000001C000-000000067F000080000009A00C0000020000__000000BD25E66810", +"000000067F000080000009A00C000001C571-000000067F000080000009A00C0000025CD7__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BC59629F98", 
+"000000067F000080000009A00C0000020000-000000067F000080000009A00C0000024000__000000BD25E66810", +"000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BC59629F98", +"000000067F000080000009A00C0000024000-000000067F000080000009A00C0000028000__000000BD25E66810", +"000000067F000080000009A00C0000025CD7-000000067F000080000009A00C000002F40B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BC59629F98", +"000000067F000080000009A00C0000028000-000000067F000080000009A00C000002C000__000000BD25E66810", +"000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BC59629F98", +"000000067F000080000009A00C000002C000-000000067F000080000009A00C0000030000__000000BD25E66810", +"000000067F000080000009A00C000002F40B-000000067F000080000009A00C0000038B1E__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BC59629F98", +"000000067F000080000009A00C0000030000-000000067F000080000009A00C0000034000__000000BD25E66810", +"000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BC59629F98", +"000000067F000080000009A00C0000034000-000000067F000080000009A00C0000038000__000000BD25E66810", +"000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BC59629F98", +"000000067F000080000009A00C0000038000-000000067F000080000009A00C000003C000__000000BD25E66810", +"000000067F000080000009A00C0000038B1E-000000067F000080000009A00C0000042284__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BC59629F98", +"000000067F000080000009A00C000003C000-000000067F000080000009A00C0000040000__000000BD25E66810", +"000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BC59629F98", +"000000067F000080000009A00C0000040000-000000067F000080000009A00C0000044000__000000BD25E66810", +"000000067F000080000009A00C0000042284-000000067F000080000009A00C000004B9EA__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BC59629F98", +"000000067F000080000009A00C0000044000-000000067F000080000009A00C0000048000__000000BD25E66810", +"000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BC59629F98", +"000000067F000080000009A00C0000048000-000000067F000080000009A00C000004C000__000000BD25E66810", +"000000067F000080000009A00C000004B9EA-000000067F000080000009A00C000005510B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BC59629F98", +"000000067F000080000009A00C000004C000-000000067F000080000009A00C0000050000__000000BD25E66810", +"000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BC59629F98", +"000000067F000080000009A00C0000050000-000000067F000080000009A00C0000054000__000000BD25E66810", +"000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BC59629F98", +"000000067F000080000009A00C0000054000-000000067F000080000009A00C0000058000__000000BD25E66810", +"000000067F000080000009A00C000005510B-000000067F000080000009A00C000005E871__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BC59629F98", +"000000067F000080000009A00C0000058000-000000067F000080000009A00C000005C000__000000BD25E66810", 
+"000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BC59629F98", +"000000067F000080000009A00C000005C000-000000067F000080000009A00C0000060000__000000BD25E66810", +"000000067F000080000009A00C000005E871-000000067F000080000009A00C0000067F8B__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BC59629F98", +"000000067F000080000009A00C0000060000-000000067F000080000009A00C0000064000__000000BD25E66810", +"000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BC59629F98", +"000000067F000080000009A00C0000064000-000000067F000080000009A00C0000068000__000000BD25E66810", +"000000067F000080000009A00C0000067F8B-000000067F000080000009A00C00000716F1__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BC59629F98", +"000000067F000080000009A00C0000068000-000000067F000080000009A00C000006C000__000000BD25E66810", +"000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BC59629F98", +"000000067F000080000009A00C000006C000-000000067F000080000009A00C0000070000__000000BD25E66810", +"000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BC53F74828", +"000000067F000080000009A00C0000070000-000000067F000080000009A00C0000074000__000000BD25E66810", +"000000067F000080000009A00C00000716F1-000000067F000080000009A00C0100000000__000000BA9685E7C1-000000BB4643FBD1", +"000000067F000080000009A00C0000071875-000000067F000080000009A00C000007AFDB__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000071F8D-000000067F000080000009A00C00000E4F8F__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BC53F74828", +"000000067F000080000009A00C0000074000-000000067F000080000009A00C0000078000__000000BD25E66810", +"000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BC53F74828", +"000000067F000080000009A00C0000078000-000000067F000080000009A00C000007C000__000000BD25E66810", +"000000067F000080000009A00C00000794E0-000000067F000080000009A00C00000F2480__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A00C000007AFDB-000000067F000080000009A00C000008470A__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BC53F74828", +"000000067F000080000009A00C000007C000-000000067F000080000009A00C0000080000__000000BD25E66810", +"000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BC53F74828", +"000000067F000080000009A00C0000080000-000000067F000080000009A00C0000084000__000000BD25E66810", +"000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BC53F74828", +"000000067F000080000009A00C0000084000-000000067F000080000009A00C0000088000__000000BD25E66810", +"000000067F000080000009A00C000008470A-000000067F000080000009A00C000008DE70__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BC53F74828", +"000000067F000080000009A00C0000088000-000000067F000080000009A00C000008C000__000000BD25E66810", +"000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BC53F74828", +"000000067F000080000009A00C000008C000-000000067F000080000009A00C0000090000__000000BD25E66810", +"000000067F000080000009A00C000008DE70-000000067F000080000009A00C0000097590__000000BB4643FBD1-000000BBE607E8F1", 
+"000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BC53F74828", +"000000067F000080000009A00C0000090000-000000067F000080000009A00C0000094000__000000BD25E66810", +"000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BC53F74828", +"000000067F000080000009A00C0000094000-000000067F000080000009A00C0000098000__000000BD25E66810", +"000000067F000080000009A00C0000097590-000000067F000080000009A00C00000A0CF6__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BC53F74828", +"000000067F000080000009A00C0000098000-000000067F000080000009A00C000009C000__000000BD25E66810", +"000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BC53F74828", +"000000067F000080000009A00C000009C000-000000067F000080000009A00C00000A0000__000000BD25E66810", +"000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BC53F74828", +"000000067F000080000009A00C00000A0000-000000067F000080000009A00C00000A4000__000000BD25E66810", +"000000067F000080000009A00C00000A0CF6-000000067F000080000009A00C00000AA40B__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BC53F74828", +"000000067F000080000009A00C00000A4000-000000067F000080000009A00C00000A8000__000000BD25E66810", +"000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BC53F74828", +"000000067F000080000009A00C00000A8000-000000067F000080000009A00C00000AC000__000000BD25E66810", +"000000067F000080000009A00C00000AA40B-000000067F000080000009A00C00000B3B4D__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BC53F74828", +"000000067F000080000009A00C00000AC000-000000067F000080000009A00C00000B0000__000000BD25E66810", +"000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BC53F74828", +"000000067F000080000009A00C00000B0000-000000067F000080000009A00C00000B4000__000000BD25E66810", +"000000067F000080000009A00C00000B3B4D-000000067F000080000009A00C00000BD2B3__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BC53F74828", +"000000067F000080000009A00C00000B4000-000000067F000080000009A00C00000B8000__000000BD25E66810", +"000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BC53F74828", +"000000067F000080000009A00C00000B8000-000000067F000080000009A00C00000BC000__000000BD25E66810", +"000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BC53F74828", +"000000067F000080000009A00C00000BC000-000000067F000080000009A00C00000C0000__000000BD25E66810", +"000000067F000080000009A00C00000BD2B3-000000067F000080000009A00C00000C69D9__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BC53F74828", +"000000067F000080000009A00C00000C0000-000000067F000080000009A00C00000C4000__000000BD25E66810", +"000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BC53F74828", +"000000067F000080000009A00C00000C4000-000000067F000080000009A00C00000C8000__000000BD25E66810", +"000000067F000080000009A00C00000C69D9-000000067F000080000009A00C00000D010C__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BC53F74828", 
+"000000067F000080000009A00C00000C8000-000000067F000080000009A00C00000CC000__000000BD25E66810", +"000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BC53F74828", +"000000067F000080000009A00C00000CC000-000000067F000080000009A00C00000D0000__000000BD25E66810", +"000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BC53F74828", +"000000067F000080000009A00C00000D0000-000000067F000080000009A00C00000D4000__000000BD25E66810", +"000000067F000080000009A00C00000D010C-000000067F000080000009A00C0100000000__000000BB4643FBD1-000000BBE607E8F1", +"000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BC53F74828", +"000000067F000080000009A00C00000D4000-000000067F000080000009A00C00000D8000__000000BD25E66810", +"000000067F000080000009A00C00000D6C06-000000067F000080000009A00C00000E0166__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BC53F74828", +"000000067F000080000009A00C00000D8000-000000067F000080000009A00C00000DC000__000000BD25E66810", +"000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BC53F74828", +"000000067F000080000009A00C00000DC000-000000067F000080000009A00C00000E0000__000000BD25E66810", +"000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BC53F74828", +"000000067F000080000009A00C00000E0000-000000067F000080000009A00C00000E4000__000000BD25E66810", +"000000067F000080000009A00C00000E0166-000000067F000080000009A00C00000E96C9__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BC53F74828", +"000000067F000080000009A00C00000E4000-000000067F000080000009A00C00000E8000__000000BD25E66810", +"000000067F000080000009A00C00000E4F97-000000067F000080000009A0140000019842__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BC53F74828", +"000000067F000080000009A00C00000E8000-000000067F000080000009A00C00000EC000__000000BD25E66810", +"000000067F000080000009A00C00000E96C9-000000067F000080000009A00C00000F2C2B__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BC53F74828", +"000000067F000080000009A00C00000EC000-000000067F000080000009A00C00000F0000__000000BD25E66810", +"000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BC53F74828", +"000000067F000080000009A00C00000F0000-000000067F000080000009A00C00000F4000__000000BD25E66810", +"000000067F000080000009A00C00000F248B-000000067F000080000009A0140000004031__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A00C00000F2C2B-000000067F000080000009A00C00000FC18E__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BC53F74828", +"000000067F000080000009A00C00000F4000-000000067F000080000009A00C00000F8000__000000BD25E66810", +"000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BC53F74828", +"000000067F000080000009A00C00000F8000-000000067F000080000009A00C00000FC000__000000BD25E66810", +"000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BC53F74828", +"000000067F000080000009A00C00000FC000-000000067F000080000009A00C0000100000__000000BD25E66810", +"000000067F000080000009A00C00000FC18E-000000067F000080000009A00C00001056F2__000000BBE607E8F1-000000BC596B5D59", 
+"000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BC53F74828", +"000000067F000080000009A00C0000100000-000000067F000080000009A00C0000104000__000000BD25E66810", +"000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BC53F74828", +"000000067F000080000009A00C0000104000-000000067F000080000009A00C0000108000__000000BD25E66810", +"000000067F000080000009A00C00001056F2-000000067F000080000009A00C000010EC54__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BC53F74828", +"000000067F000080000009A00C0000108000-000000067F000080000009A00C000010C000__000000BD25E66810", +"000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BC53F74828", +"000000067F000080000009A00C000010C000-000000067F000080000009A00C0000110000__000000BD25E66810", +"000000067F000080000009A00C000010EC54-010000000000000001000000040000000020__000000BBE607E8F1-000000BC596B5D59", +"000000067F000080000009A00C0000110000-000000067F000080000009A0120100000000__000000BD25E66810", +"000000067F000080000009A00C0000110000-030000000000000000000000000000000002__000000BC53F74828", +"000000067F000080000009A0140000000000-000000067F000080000009A0140000004000__000000BD25E66810", +"000000067F000080000009A0140000004000-000000067F000080000009A0140000008000__000000BD25E66810", +"000000067F000080000009A0140000004031-000000067F000080000009A0140000009FC7__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000008000-000000067F000080000009A014000000C000__000000BD25E66810", +"000000067F000080000009A0140000009FC7-000000067F000080000009A014000000FF53__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A014000000C000-000000067F000080000009A0140000010000__000000BD25E66810", +"000000067F000080000009A014000000FF53-000000067F000080000009A0140000015F1C__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000010000-000000067F000080000009A0140000014000__000000BD25E66810", +"000000067F000080000009A0140000014000-000000067F000080000009A0140000018000__000000BD25E66810", +"000000067F000080000009A0140000015F1C-000000067F000080000009A014000001BED0__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000018000-000000067F000080000009A014000001C000__000000BD25E66810", +"000000067F000080000009A0140000019844-030000000000000000000000000000000002__000000BCEF79BE91-000000BD263A5849", +"000000067F000080000009A014000001BED0-000000067F000080000009A0140000021E6C__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A014000001C000-000000067F000080000009A0140000020000__000000BD25E66810", +"000000067F000080000009A0140000020000-000000067F000080000009A0140000024000__000000BD25E66810", +"000000067F000080000009A0140000021E6C-000000067F000080000009A0140000027DB1__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000024000-000000067F000080000009A0140000028000__000000BD25E66810", +"000000067F000080000009A0140000027DB1-000000067F000080000009A014000002DC9E__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009A0140000028000-000000067F000080000009A014000002C000__000000BD25E66810", +"000000067F000080000009A014000002C000-030000000000000000000000000000000002__000000BD25E66810", +"000000067F000080000009A01400FFFFFFFF-030000000000000000000000000000000002__000000BC596B5D59-000000BCEF79BE91", +"000000067F000080000009C00C0000000000-000000067F000080000009C00C0000004000__000000BEF683BFD0", +"000000067F000080000009C00C0000004000-000000067F000080000009C00C0000008000__000000BEF683BFD0", 
+"000000067F000080000009C00C0000008000-000000067F000080000009C00C000000C000__000000BEF683BFD0", +"000000067F000080000009C00C0000009749-000000067F000080000009C00C0000012EAF__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000000C000-000000067F000080000009C00C0000010000__000000BEF683BFD0", +"000000067F000080000009C00C0000010000-000000067F000080000009C00C0000014000__000000BEF683BFD0", +"000000067F000080000009C00C0000012EAF-000000067F000080000009C00C000001C60B__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000014000-000000067F000080000009C00C0000018000__000000BEF683BFD0", +"000000067F000080000009C00C0000018000-000000067F000080000009C00C000001C000__000000BEF683BFD0", +"000000067F000080000009C00C000001C000-000000067F000080000009C00C0000020000__000000BEF683BFD0", +"000000067F000080000009C00C000001C60B-000000067F000080000009C00C0000025D39__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000020000-000000067F000080000009C00C0000024000__000000BEF683BFD0", +"000000067F000080000009C00C0000024000-000000067F000080000009C00C0000028000__000000BEF683BFD0", +"000000067F000080000009C00C0000025D39-000000067F000080000009C00C000002F49F__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000028000-000000067F000080000009C00C000002C000__000000BEF683BFD0", +"000000067F000080000009C00C000002C000-000000067F000080000009C00C0000030000__000000BEF683BFD0", +"000000067F000080000009C00C000002F49F-000000067F000080000009C00C0000038BB2__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000030000-000000067F000080000009C00C0000034000__000000BEF683BFD0", +"000000067F000080000009C00C0000034000-000000067F000080000009C00C0000038000__000000BEF683BFD0", +"000000067F000080000009C00C0000038000-000000067F000080000009C00C000003C000__000000BEF683BFD0", +"000000067F000080000009C00C0000038BB2-000000067F000080000009C00C0000042318__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000003C000-000000067F000080000009C00C0000040000__000000BEF683BFD0", +"000000067F000080000009C00C0000040000-000000067F000080000009C00C0000044000__000000BEF683BFD0", +"000000067F000080000009C00C0000042318-000000067F000080000009C00C000004BA7E__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C0000044000-000000067F000080000009C00C0000048000__000000BEF683BFD0", +"000000067F000080000009C00C0000048000-000000067F000080000009C00C000004C000__000000BEF06884C8", +"000000067F000080000009C00C000004BA7E-030000000000000000000000000000000002__000000BD263A5849-000000BDA607F261", +"000000067F000080000009C00C000004BAC3-000000067F000080000009C00C00000551F8__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000004C000-000000067F000080000009C00C0000050000__000000BEF06884C8", +"000000067F000080000009C00C0000050000-000000067F000080000009C00C0000054000__000000BEF06884C8", +"000000067F000080000009C00C0000054000-000000067F000080000009C00C0000058000__000000BEF06884C8", +"000000067F000080000009C00C00000551F8-000000067F000080000009C00C000005E90C__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000058000-000000067F000080000009C00C000005C000__000000BEF06884C8", +"000000067F000080000009C00C000005C000-000000067F000080000009C00C0000060000__000000BEF06884C8", +"000000067F000080000009C00C000005E90C-000000067F000080000009C00C000006802C__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000060000-000000067F000080000009C00C0000064000__000000BEF06884C8", +"000000067F000080000009C00C0000064000-000000067F000080000009C00C0000068000__000000BEF06884C8", 
+"000000067F000080000009C00C0000068000-000000067F000080000009C00C000006C000__000000BEF06884C8", +"000000067F000080000009C00C000006802C-000000067F000080000009C00C0000071783__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000006C000-000000067F000080000009C00C0000070000__000000BEF06884C8", +"000000067F000080000009C00C0000070000-000000067F000080000009C00C0000074000__000000BEF06884C8", +"000000067F000080000009C00C0000071783-000000067F000080000009C00C000007AEE9__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000074000-000000067F000080000009C00C0000078000__000000BEF06884C8", +"000000067F000080000009C00C0000078000-000000067F000080000009C00C000007C000__000000BEF06884C8", +"000000067F000080000009C00C000007AEE9-000000067F000080000009C00C000008460B__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C000007C000-000000067F000080000009C00C0000080000__000000BEF06884C8", +"000000067F000080000009C00C0000080000-000000067F000080000009C00C0000084000__000000BEF06884C8", +"000000067F000080000009C00C0000084000-000000067F000080000009C00C0000088000__000000BEF06884C8", +"000000067F000080000009C00C000008460B-000000067F000080000009C00C000008DD71__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000088000-000000067F000080000009C00C000008C000__000000BEF06884C8", +"000000067F000080000009C00C000008C000-000000067F000080000009C00C0000090000__000000BEF06884C8", +"000000067F000080000009C00C000008DD71-000000067F000080000009C00C00000974D7__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000090000-000000067F000080000009C00C0000094000__000000BEF06884C8", +"000000067F000080000009C00C0000094000-000000067F000080000009C00C0000098000__000000BEF06884C8", +"000000067F000080000009C00C00000974D7-000000067F000080000009C00C00000A0C0B__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C0000098000-000000067F000080000009C00C000009C000__000000BEF06884C8", +"000000067F000080000009C00C000009C000-000000067F000080000009C00C00000A0000__000000BEF06884C8", +"000000067F000080000009C00C00000A0000-000000067F000080000009C00C00000A4000__000000BEF06884C8", +"000000067F000080000009C00C00000A0C0B-000000067F000080000009C00C00000AA371__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C00000A4000-000000067F000080000009C00C00000A8000__000000BEF06884C8", +"000000067F000080000009C00C00000A8000-000000067F000080000009C00C00000AC000__000000BEF06884C8", +"000000067F000080000009C00C00000AA371-000000067F000080000009C00C0100000000__000000BDA607F261-000000BE45CBFBB9", +"000000067F000080000009C00C00000AC000-000000067F000080000009C00C00000B0000__000000BEF06884C8", +"000000067F000080000009C00C00000B0000-000000067F000080000009C00C00000B4000__000000BEF06884C8", +"000000067F000080000009C00C00000B2921-000000067F000080000009C00C00000BC087__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000B4000-000000067F000080000009C00C00000B8000__000000BEF06884C8", +"000000067F000080000009C00C00000B8000-000000067F000080000009C00C00000BC000__000000BEF06884C8", +"000000067F000080000009C00C00000BC000-000000067F000080000009C00C00000C0000__000000BEF06884C8", +"000000067F000080000009C00C00000BC087-000000067F000080000009C00C00000C57B8__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000C0000-000000067F000080000009C00C00000C4000__000000BEF06884C8", +"000000067F000080000009C00C00000C4000-000000067F000080000009C00C00000C8000__000000BEF06884C8", +"000000067F000080000009C00C00000C57B8-000000067F000080000009C00C00000CEF09__000000BE45CBFBB9-000000BEF5F47FD1", 
+"000000067F000080000009C00C00000C8000-000000067F000080000009C00C00000CC000__000000BEF06884C8", +"000000067F000080000009C00C00000CC000-000000067F000080000009C00C00000D0000__000000BEF06884C8", +"000000067F000080000009C00C00000CEF09-000000067F000080000009C00C00000D862B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000D0000-000000067F000080000009C00C00000D4000__000000BEF06884C8", +"000000067F000080000009C00C00000D4000-000000067F000080000009C00C00000D8000__000000BEF06884C8", +"000000067F000080000009C00C00000D8000-000000067F000080000009C00C00000DC000__000000BEF06884C8", +"000000067F000080000009C00C00000D862B-000000067F000080000009C00C00000E1D7F__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000DC000-000000067F000080000009C00C00000E0000__000000BEF06884C8", +"000000067F000080000009C00C00000E0000-000000067F000080000009C00C00000E4000__000000BEF06884C8", +"000000067F000080000009C00C00000E1D7F-000000067F000080000009C00C00000EB4E5__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000E4000-000000067F000080000009C00C00000E8000__000000BEF06884C8", +"000000067F000080000009C00C00000E8000-000000067F000080000009C00C00000EC000__000000BEF06884C8", +"000000067F000080000009C00C00000EB4E5-000000067F000080000009C00C00000F4C0B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000EC000-000000067F000080000009C00C00000F0000__000000BEF06884C8", +"000000067F000080000009C00C00000F0000-000000067F000080000009C00C00000F4000__000000BEF06884C8", +"000000067F000080000009C00C00000F4000-000000067F000080000009C00C00000F8000__000000BEF06884C8", +"000000067F000080000009C00C00000F4C0B-000000067F000080000009C00C00000FE371__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C00000F8000-000000067F000080000009C00C00000FC000__000000BEF06884C8", +"000000067F000080000009C00C00000FC000-000000067F000080000009C00C0000100000__000000BEF06884C8", +"000000067F000080000009C00C00000FE371-000000067F000080000009C00C0000107AD7__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C0000100000-000000067F000080000009C00C0000104000__000000BEF06884C8", +"000000067F000080000009C00C0000104000-000000067F000080000009C00C0000108000__000000BEF06884C8", +"000000067F000080000009C00C0000107AD7-000000067F000080000009C00C000011120B__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009C00C0000108000-000000067F000080000009C00C000010C000__000000BEF06884C8", +"000000067F000080000009C00C000010C000-030000000000000000000000000000000002__000000BEF06884C8", +"000000067F000080000009C00C000011120B-010000000000000001000000050000000003__000000BE45CBFBB9-000000BEF5F47FD1", +"000000067F000080000009E00C0000000000-000000067F000080000009E00C0000004000__000000C0C9769FD8", +"000000067F000080000009E00C0000004000-000000067F000080000009E00C0000008000__000000C0C9769FD8", +"000000067F000080000009E00C0000004916-000000067F000080000009E00C000000E07C__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000008000-000000067F000080000009E00C000000C000__000000C0C9769FD8", +"000000067F000080000009E00C000000C000-000000067F000080000009E00C0000010000__000000C0C9769FD8", +"000000067F000080000009E00C000000E07C-000000067F000080000009E00C000001779A__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000010000-000000067F000080000009E00C0000014000__000000C0C9769FD8", +"000000067F000080000009E00C0000014000-000000067F000080000009E00C0000018000__000000C0C9769FD8", +"000000067F000080000009E00C000001779A-000000067F000080000009E00C0000020F00__000000BEF5F47FD1-000000BF48FFEB11", 
+"000000067F000080000009E00C0000018000-000000067F000080000009E00C000001C000__000000C0C9769FD8", +"000000067F000080000009E00C000001C000-000000067F000080000009E00C0000020000__000000C0C9769FD8", +"000000067F000080000009E00C0000020000-000000067F000080000009E00C0000024000__000000C0C9769FD8", +"000000067F000080000009E00C0000020F00-000000067F000080000009E00C000002A60B__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C0000024000-000000067F000080000009E00C0000028000__000000C0C9769FD8", +"000000067F000080000009E00C0000028000-000000067F000080000009E00C000002C000__000000C0C9769FD8", +"000000067F000080000009E00C000002A60B-030000000000000000000000000000000002__000000BEF5F47FD1-000000BF48FFEB11", +"000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C0B597E900", +"000000067F000080000009E00C000002C000-000000067F000080000009E00C0000030000__000000C1972392A8", +"000000067F000080000009E00C000002F506-000000067F000080000009E00C0000038C11__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C0B597E900", +"000000067F000080000009E00C0000030000-000000067F000080000009E00C0000034000__000000C1972392A8", +"000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C0B597E900", +"000000067F000080000009E00C0000034000-000000067F000080000009E00C0000038000__000000C1972392A8", +"000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C0B597E900", +"000000067F000080000009E00C0000038000-000000067F000080000009E00C000003C000__000000C1972392A8", +"000000067F000080000009E00C0000038C11-000000067F000080000009E00C0000042361__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C0B597E900", +"000000067F000080000009E00C000003C000-000000067F000080000009E00C0000040000__000000C1972392A8", +"000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C0B597E900", +"000000067F000080000009E00C0000040000-000000067F000080000009E00C0000044000__000000C1972392A8", +"000000067F000080000009E00C0000042361-000000067F000080000009E00C000004BAC7__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C0B597E900", +"000000067F000080000009E00C0000044000-000000067F000080000009E00C0000048000__000000C1972392A8", +"000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C0B597E900", +"000000067F000080000009E00C0000048000-000000067F000080000009E00C000004C000__000000C1972392A8", +"000000067F000080000009E00C000004BAC7-000000067F000080000009E00C00000551FC__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C0B597E900", +"000000067F000080000009E00C000004C000-000000067F000080000009E00C0000050000__000000C1972392A8", +"000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C0B597E900", +"000000067F000080000009E00C0000050000-000000067F000080000009E00C0000054000__000000C1972392A8", +"000000067F000080000009E00C0000050E89-000000067F000080000009E00C00000A18A0__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C0B597E900", +"000000067F000080000009E00C0000054000-000000067F000080000009E00C0000058000__000000C1972392A8", +"000000067F000080000009E00C00000551FC-000000067F000080000009E00C000005E90B__000000BF48FFEB11-000000BFF8BDFEE9", 
+"000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C0B597E900", +"000000067F000080000009E00C0000058000-000000067F000080000009E00C000005C000__000000C1972392A8", +"000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C0B597E900", +"000000067F000080000009E00C000005C000-000000067F000080000009E00C0000060000__000000C1972392A8", +"000000067F000080000009E00C000005E90B-000000067F000080000009E00C000006802B__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C0B597E900", +"000000067F000080000009E00C0000060000-000000067F000080000009E00C0000064000__000000C1972392A8", +"000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C0B597E900", +"000000067F000080000009E00C0000064000-000000067F000080000009E00C0000068000__000000C1972392A8", +"000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C0B597E900", +"000000067F000080000009E00C0000068000-000000067F000080000009E00C000006C000__000000C1972392A8", +"000000067F000080000009E00C000006802B-000000067F000080000009E00C0000071782__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C0B597E900", +"000000067F000080000009E00C000006C000-000000067F000080000009E00C0000070000__000000C1972392A8", +"000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C0B597E900", +"000000067F000080000009E00C0000070000-000000067F000080000009E00C0000074000__000000C1972392A8", +"000000067F000080000009E00C0000071782-000000067F000080000009E00C000007AEE8__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C0B597E900", +"000000067F000080000009E00C0000074000-000000067F000080000009E00C0000078000__000000C1972392A8", +"000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C0B597E900", +"000000067F000080000009E00C0000078000-000000067F000080000009E00C000007C000__000000C1972392A8", +"000000067F000080000009E00C000007AEE8-000000067F000080000009E00C000008460B__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C0B597E900", +"000000067F000080000009E00C000007C000-000000067F000080000009E00C0000080000__000000C1972392A8", +"000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C0B597E900", +"000000067F000080000009E00C0000080000-000000067F000080000009E00C0000084000__000000C1972392A8", +"000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C0B597E900", +"000000067F000080000009E00C0000084000-000000067F000080000009E00C0000088000__000000C1972392A8", +"000000067F000080000009E00C000008460B-000000067F000080000009E00C000008DD71__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C0B597E900", +"000000067F000080000009E00C0000088000-000000067F000080000009E00C000008C000__000000C1972392A8", +"000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C0B597E900", +"000000067F000080000009E00C000008C000-000000067F000080000009E00C0000090000__000000C1972392A8", +"000000067F000080000009E00C000008DD71-000000067F000080000009E00C00000974D7__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C0B597E900", 
+"000000067F000080000009E00C0000090000-000000067F000080000009E00C0000094000__000000C1972392A8", +"000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C0B597E900", +"000000067F000080000009E00C0000094000-000000067F000080000009E00C0000098000__000000C1972392A8", +"000000067F000080000009E00C00000974D7-000000067F000080000009E00C0100000000__000000BF48FFEB11-000000BFF8BDFEE9", +"000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C0B597E900", +"000000067F000080000009E00C0000098000-000000067F000080000009E00C000009C000__000000C1972392A8", +"000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C0B597E900", +"000000067F000080000009E00C000009C000-000000067F000080000009E00C00000A0000__000000C1972392A8", +"000000067F000080000009E00C000009FB21-000000067F000080000009E00C00000A9230__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C0B597E900", +"000000067F000080000009E00C00000A0000-000000067F000080000009E00C00000A4000__000000C1972392A8", +"000000067F000080000009E00C00000A18A4-000000067F000080000009E00C00000F2B76__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C0B597E900", +"000000067F000080000009E00C00000A4000-000000067F000080000009E00C00000A8000__000000C1972392A8", +"000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C0B597E900", +"000000067F000080000009E00C00000A8000-000000067F000080000009E00C00000AC000__000000C1972392A8", +"000000067F000080000009E00C00000A9230-000000067F000080000009E00C00000B297D__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C0B597E900", +"000000067F000080000009E00C00000AC000-000000067F000080000009E00C00000B0000__000000C1972392A8", +"000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C0B597E900", +"000000067F000080000009E00C00000B0000-000000067F000080000009E00C00000B4000__000000C1972392A8", +"000000067F000080000009E00C00000B297D-000000067F000080000009E00C00000BC0E3__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C0B597E900", +"000000067F000080000009E00C00000B4000-000000067F000080000009E00C00000B8000__000000C1972392A8", +"000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C0B597E900", +"000000067F000080000009E00C00000B8000-000000067F000080000009E00C00000BC000__000000C1972392A8", +"000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C0B597E900", +"000000067F000080000009E00C00000BC000-000000067F000080000009E00C00000C0000__000000C1972392A8", +"000000067F000080000009E00C00000BC0E3-000000067F000080000009E00C00000C580C__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C0B597E900", +"000000067F000080000009E00C00000C0000-000000067F000080000009E00C00000C4000__000000C1972392A8", +"000000067F000080000009E00C00000C0C74-000000067F000080000009E0140000001880__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C0B597E900", +"000000067F000080000009E00C00000C4000-000000067F000080000009E00C00000C8000__000000C1972392A8", +"000000067F000080000009E00C00000C580C-000000067F000080000009E00C00000CEF71__000000BFF8BDFEE9-000000C0C8CA5FF1", 
+"000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C0B597E900", +"000000067F000080000009E00C00000C8000-000000067F000080000009E00C00000CC000__000000C1972392A8", +"000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C0B597E900", +"000000067F000080000009E00C00000CC000-000000067F000080000009E00C00000D0000__000000C1972392A8", +"000000067F000080000009E00C00000CEF71-000000067F000080000009E00C00000D86D7__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C0B597E900", +"000000067F000080000009E00C00000D0000-000000067F000080000009E00C00000D4000__000000C1972392A8", +"000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C0B597E900", +"000000067F000080000009E00C00000D4000-000000067F000080000009E00C00000D8000__000000C1972392A8", +"000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C0B597E900", +"000000067F000080000009E00C00000D8000-000000067F000080000009E00C00000DC000__000000C1972392A8", +"000000067F000080000009E00C00000D86D7-000000067F000080000009E00C00000E1E0C__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C0B597E900", +"000000067F000080000009E00C00000DC000-000000067F000080000009E00C00000E0000__000000C1972392A8", +"000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C0B597E900", +"000000067F000080000009E00C00000E0000-000000067F000080000009E00C00000E4000__000000C1972392A8", +"000000067F000080000009E00C00000E1E0C-000000067F000080000009E00C00000EB572__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C0B597E900", +"000000067F000080000009E00C00000E4000-000000067F000080000009E00C00000E8000__000000C1972392A8", +"000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C0B597E900", +"000000067F000080000009E00C00000E8000-000000067F000080000009E00C00000EC000__000000C1972392A8", +"000000067F000080000009E00C00000EB572-000000067F000080000009E00C00000F4CD8__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C0B597E900", +"000000067F000080000009E00C00000EC000-000000067F000080000009E00C00000F0000__000000C1972392A8", +"000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C0B597E900", +"000000067F000080000009E00C00000F0000-000000067F000080000009E00C00000F4000__000000C1972392A8", +"000000067F000080000009E00C00000F2B77-000000067F000080000009E014000000D3EB__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C0B597E900", +"000000067F000080000009E00C00000F4000-000000067F000080000009E00C00000F8000__000000C1972392A8", +"000000067F000080000009E00C00000F4CD8-000000067F000080000009E00C00000FE40B__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C0B597E900", +"000000067F000080000009E00C00000F8000-000000067F000080000009E00C00000FC000__000000C1972392A8", +"000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C0B597E900", +"000000067F000080000009E00C00000FC000-000000067F000080000009E00C0000100000__000000C1972392A8", +"000000067F000080000009E00C00000FE40B-000000067F000080000009E00C0000107B27__000000BFF8BDFEE9-000000C0C8CA5FF1", 
+"000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C0B597E900", +"000000067F000080000009E00C0000100000-000000067F000080000009E00C0000104000__000000C1972392A8", +"000000067F000080000009E00C0000104000-000000067F000080000009E00C0000108000__000000C1972392A8", +"000000067F000080000009E00C0000104000-030000000000000000000000000000000002__000000C0B597E900", +"000000067F000080000009E00C0000107B27-000000067F000080000009E00C000011128D__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E00C0000108000-000000067F000080000009E00C000010C000__000000C1972392A8", +"000000067F000080000009E00C000010C000-000000067F000080000009E00C0000110000__000000C1972392A8", +"000000067F000080000009E00C0000110000-000000067F000080000009E0120100000000__000000C1972392A8", +"000000067F000080000009E00C000011128D-010000000000000001000000050000000003__000000BFF8BDFEE9-000000C0C8CA5FF1", +"000000067F000080000009E0140000000000-000000067F000080000009E0140000004000__000000C1972392A8", +"000000067F000080000009E0140000001880-000000067F000080000009E014000000842E__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000004000-000000067F000080000009E0140000008000__000000C1972392A8", +"000000067F000080000009E0140000008000-000000067F000080000009E014000000C000__000000C1972392A8", +"000000067F000080000009E014000000842E-000000067F000080000009E014000000F011__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E014000000C000-000000067F000080000009E0140000010000__000000C1972392A8", +"000000067F000080000009E014000000D3EB-000000067F000080000009E014000002578F__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E014000000F011-000000067F000080000009E0140000015BD8__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000010000-000000067F000080000009E0140000014000__000000C1972392A8", +"000000067F000080000009E0140000014000-000000067F000080000009E0140000018000__000000C1972392A8", +"000000067F000080000009E0140000015BD8-000000067F000080000009E014000001C7C5__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000018000-000000067F000080000009E014000001C000__000000C1972392A8", +"000000067F000080000009E014000001C000-000000067F000080000009E0140000020000__000000C1972392A8", +"000000067F000080000009E014000001C7C5-000000067F000080000009E014000002337F__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000020000-000000067F000080000009E0140000024000__000000C1972392A8", +"000000067F000080000009E014000002337F-000000067F000080000009E0140000029F4A__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E0140000024000-000000067F000080000009E0140000028000__000000C1972392A8", +"000000067F000080000009E0140000025790-030000000000000000000000000000000002__000000C1426D92E1-000000C19744E959", +"000000067F000080000009E0140000028000-000000067F000080000009E014000002C000__000000C1972392A8", +"000000067F000080000009E0140000029F4A-030000000000000000000000000000000002__000000C0C8CA5FF1-000000C1426D92E1", +"000000067F000080000009E014000002C000-030000000000000000000000000000000002__000000C1972392A8", +"000000067F00008000000A000C0000000000-000000067F00008000000A000C0000004000__000000C3687EDFE8", +"000000067F00008000000A000C0000004000-000000067F00008000000A000C0000008000__000000C3687EDFE8", +"000000067F00008000000A000C0000008000-000000067F00008000000A000C000000C000__000000C3687EDFE8", +"000000067F00008000000A000C0000008EF9-000000067F00008000000A000C000001260C__000000C19744E959-000000C217F3F379", 
+"000000067F00008000000A000C000000C000-000000067F00008000000A000C0000010000__000000C3687EDFE8", +"000000067F00008000000A000C0000010000-000000067F00008000000A000C0000014000__000000C3687EDFE8", +"000000067F00008000000A000C000001260C-000000067F00008000000A000C000001BD72__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000014000-000000067F00008000000A000C0000018000__000000C3687EDFE8", +"000000067F00008000000A000C0000018000-000000067F00008000000A000C000001C000__000000C3687EDFE8", +"000000067F00008000000A000C000001BD72-000000067F00008000000A000C00000254D8__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000001C000-000000067F00008000000A000C0000020000__000000C3687EDFE8", +"000000067F00008000000A000C0000020000-000000067F00008000000A000C0000024000__000000C3687EDFE8", +"000000067F00008000000A000C0000024000-000000067F00008000000A000C0000028000__000000C3687EDFE8", +"000000067F00008000000A000C00000254D8-000000067F00008000000A000C000002EC0B__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000028000-000000067F00008000000A000C000002C000__000000C3687EDFE8", +"000000067F00008000000A000C000002C000-000000067F00008000000A000C0000030000__000000C3687EDFE8", +"000000067F00008000000A000C000002EC0B-000000067F00008000000A000C0000038322__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000030000-000000067F00008000000A000C0000034000__000000C3687EDFE8", +"000000067F00008000000A000C0000034000-000000067F00008000000A000C0000038000__000000C3687EDFE8", +"000000067F00008000000A000C0000038000-000000067F00008000000A000C000003C000__000000C3687EDFE8", +"000000067F00008000000A000C0000038322-000000067F00008000000A000C0000041A88__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000003C000-000000067F00008000000A000C0000040000__000000C3687EDFE8", +"000000067F00008000000A000C0000040000-000000067F00008000000A000C0000044000__000000C3687EDFE8", +"000000067F00008000000A000C0000041A88-000000067F00008000000A000C000004B1EE__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C0000044000-000000067F00008000000A000C0000048000__000000C3687EDFE8", +"000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C366619FD8", +"000000067F00008000000A000C0000048000-000000067F00008000000A000C000004C000__000000C42FE73810", +"000000067F00008000000A000C000004B1EE-030000000000000000000000000000000002__000000C19744E959-000000C217F3F379", +"000000067F00008000000A000C000004BACE-000000067F00008000000A000C0000055202__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C366619FD8", +"000000067F00008000000A000C000004C000-000000067F00008000000A000C0000050000__000000C42FE73810", +"000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C366619FD8", +"000000067F00008000000A000C0000050000-000000067F00008000000A000C0000054000__000000C42FE73810", +"000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C366619FD8", +"000000067F00008000000A000C0000054000-000000067F00008000000A000C0000058000__000000C42FE73810", +"000000067F00008000000A000C0000055202-000000067F00008000000A000C000005E90D__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000056365-000000067F00008000000A000C00000ACA1A__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C366619FD8", +"000000067F00008000000A000C0000058000-000000067F00008000000A000C000005C000__000000C42FE73810", 
+"000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C366619FD8", +"000000067F00008000000A000C000005C000-000000067F00008000000A000C0000060000__000000C42FE73810", +"000000067F00008000000A000C000005E90D-000000067F00008000000A000C000006802B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C366619FD8", +"000000067F00008000000A000C0000060000-000000067F00008000000A000C0000064000__000000C42FE73810", +"000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C366619FD8", +"000000067F00008000000A000C0000064000-000000067F00008000000A000C0000068000__000000C42FE73810", +"000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C366619FD8", +"000000067F00008000000A000C0000068000-000000067F00008000000A000C000006C000__000000C42FE73810", +"000000067F00008000000A000C000006802B-000000067F00008000000A000C0000071782__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C366619FD8", +"000000067F00008000000A000C000006C000-000000067F00008000000A000C0000070000__000000C42FE73810", +"000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C366619FD8", +"000000067F00008000000A000C0000070000-000000067F00008000000A000C0000074000__000000C42FE73810", +"000000067F00008000000A000C0000071782-000000067F00008000000A000C000007AEE8__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C366619FD8", +"000000067F00008000000A000C0000074000-000000067F00008000000A000C0000078000__000000C42FE73810", +"000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C366619FD8", +"000000067F00008000000A000C0000078000-000000067F00008000000A000C000007C000__000000C42FE73810", +"000000067F00008000000A000C000007AEE8-000000067F00008000000A000C000008460B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C366619FD8", +"000000067F00008000000A000C000007C000-000000067F00008000000A000C0000080000__000000C42FE73810", +"000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C366619FD8", +"000000067F00008000000A000C0000080000-000000067F00008000000A000C0000084000__000000C42FE73810", +"000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C366619FD8", +"000000067F00008000000A000C0000084000-000000067F00008000000A000C0000088000__000000C42FE73810", +"000000067F00008000000A000C000008460B-000000067F00008000000A000C000008DD71__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C366619FD8", +"000000067F00008000000A000C0000088000-000000067F00008000000A000C000008C000__000000C42FE73810", +"000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C366619FD8", +"000000067F00008000000A000C000008C000-000000067F00008000000A000C0000090000__000000C42FE73810", +"000000067F00008000000A000C000008DD71-000000067F00008000000A000C00000974D7__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C366619FD8", +"000000067F00008000000A000C0000090000-000000067F00008000000A000C0000094000__000000C42FE73810", +"000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C366619FD8", 
+"000000067F00008000000A000C0000094000-000000067F00008000000A000C0000098000__000000C42FE73810", +"000000067F00008000000A000C00000974D7-000000067F00008000000A000C00000A0C0B__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C366619FD8", +"000000067F00008000000A000C0000098000-000000067F00008000000A000C000009C000__000000C42FE73810", +"000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C366619FD8", +"000000067F00008000000A000C000009C000-000000067F00008000000A000C00000A0000__000000C42FE73810", +"000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C366619FD8", +"000000067F00008000000A000C00000A0000-000000067F00008000000A000C00000A4000__000000C42FE73810", +"000000067F00008000000A000C00000A0C0B-000000067F00008000000A000C00000AA371__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C366619FD8", +"000000067F00008000000A000C00000A4000-000000067F00008000000A000C00000A8000__000000C42FE73810", +"000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C366619FD8", +"000000067F00008000000A000C00000A8000-000000067F00008000000A000C00000AC000__000000C42FE73810", +"000000067F00008000000A000C00000AA371-000000067F00008000000A000C00000B3AD7__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C366619FD8", +"000000067F00008000000A000C00000AC000-000000067F00008000000A000C00000B0000__000000C42FE73810", +"000000067F00008000000A000C00000ACA25-000000067F00008000000A000C0000102D7C__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C366619FD8", +"000000067F00008000000A000C00000B0000-000000067F00008000000A000C00000B4000__000000C42FE73810", +"000000067F00008000000A000C00000B3AD7-000000067F00008000000A000C0100000000__000000C217F3F379-000000C2C7B1ECC1", +"000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C366619FD8", +"000000067F00008000000A000C00000B4000-000000067F00008000000A000C00000B8000__000000C42FE73810", +"000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C366619FD8", +"000000067F00008000000A000C00000B8000-000000067F00008000000A000C00000BC000__000000C42FE73810", +"000000067F00008000000A000C00000B8B52-000000067F00008000000A00140000001132__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C366619FD8", +"000000067F00008000000A000C00000BC000-000000067F00008000000A000C00000C0000__000000C42FE73810", +"000000067F00008000000A000C00000BC072-000000067F00008000000A000C00000C57A3__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C366619FD8", +"000000067F00008000000A000C00000C0000-000000067F00008000000A000C00000C4000__000000C42FE73810", +"000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C366619FD8", +"000000067F00008000000A000C00000C4000-000000067F00008000000A000C00000C8000__000000C42FE73810", +"000000067F00008000000A000C00000C57A3-000000067F00008000000A000C00000CEF09__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C366619FD8", +"000000067F00008000000A000C00000C8000-000000067F00008000000A000C00000CC000__000000C42FE73810", 
+"000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C366619FD8", +"000000067F00008000000A000C00000CC000-000000067F00008000000A000C00000D0000__000000C42FE73810", +"000000067F00008000000A000C00000CEF09-000000067F00008000000A000C00000D862B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C366619FD8", +"000000067F00008000000A000C00000D0000-000000067F00008000000A000C00000D4000__000000C42FE73810", +"000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C366619FD8", +"000000067F00008000000A000C00000D4000-000000067F00008000000A000C00000D8000__000000C42FE73810", +"000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C366619FD8", +"000000067F00008000000A000C00000D8000-000000067F00008000000A000C00000DC000__000000C42FE73810", +"000000067F00008000000A000C00000D862B-000000067F00008000000A000C00000E1D7F__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C366619FD8", +"000000067F00008000000A000C00000DC000-000000067F00008000000A000C00000E0000__000000C42FE73810", +"000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C366619FD8", +"000000067F00008000000A000C00000E0000-000000067F00008000000A000C00000E4000__000000C42FE73810", +"000000067F00008000000A000C00000E1D7F-000000067F00008000000A000C00000EB4E5__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C366619FD8", +"000000067F00008000000A000C00000E4000-000000067F00008000000A000C00000E8000__000000C42FE73810", +"000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C366619FD8", +"000000067F00008000000A000C00000E8000-000000067F00008000000A000C00000EC000__000000C42FE73810", +"000000067F00008000000A000C00000EB4E5-000000067F00008000000A000C00000F4C0B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C366619FD8", +"000000067F00008000000A000C00000EC000-000000067F00008000000A000C00000F0000__000000C42FE73810", +"000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C366619FD8", +"000000067F00008000000A000C00000F0000-000000067F00008000000A000C00000F4000__000000C42FE73810", +"000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C366619FD8", +"000000067F00008000000A000C00000F4000-000000067F00008000000A000C00000F8000__000000C42FE73810", +"000000067F00008000000A000C00000F4C0B-000000067F00008000000A000C00000FE371__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C366619FD8", +"000000067F00008000000A000C00000F8000-000000067F00008000000A000C00000FC000__000000C42FE73810", +"000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C366619FD8", +"000000067F00008000000A000C00000FC000-000000067F00008000000A000C0000100000__000000C42FE73810", +"000000067F00008000000A000C00000FE371-000000067F00008000000A000C0000107AD7__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C366619FD8", +"000000067F00008000000A000C0000100000-000000067F00008000000A000C0000104000__000000C42FE73810", +"000000067F00008000000A000C0000102D7F-000000067F00008000000A0014000001409C__000000C3E17E01A1-000000C430961E71", 
+"000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C366619FD8", +"000000067F00008000000A000C0000104000-000000067F00008000000A000C0000108000__000000C42FE73810", +"000000067F00008000000A000C0000107AD7-000000067F00008000000A000C000011120B__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C366619FD8", +"000000067F00008000000A000C0000108000-000000067F00008000000A000C000010C000__000000C42FE73810", +"000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C366619FD8", +"000000067F00008000000A000C000010C000-000000067F00008000000A000C0000110000__000000C42FE73810", +"000000067F00008000000A000C0000110000-000000067F00008000000A00120100000000__000000C42FE73810", +"000000067F00008000000A000C0000110000-030000000000000000000000000000000002__000000C366619FD8", +"000000067F00008000000A000C000011120B-010000000000000001000000050000000007__000000C2C7B1ECC1-000000C367E48001", +"000000067F00008000000A00140000000000-000000067F00008000000A00140000004000__000000C42FE73810", +"000000067F00008000000A00140000001132-000000067F00008000000A00140000007E49__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000004000-000000067F00008000000A00140000008000__000000C42FE73810", +"000000067F00008000000A00140000007E49-000000067F00008000000A0014000000EBBC__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000008000-000000067F00008000000A0014000000C000__000000C42FE73810", +"000000067F00008000000A0014000000C000-000000067F00008000000A00140000010000__000000C42FE73810", +"000000067F00008000000A0014000000EBBC-000000067F00008000000A00140000015925__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000010000-000000067F00008000000A00140000014000__000000C42FE73810", +"000000067F00008000000A00140000014000-000000067F00008000000A00140000018000__000000C42FE73810", +"000000067F00008000000A0014000001409F-000000067F00008000000A0016000000020E__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A00140000015925-000000067F00008000000A0014000001C612__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000018000-000000067F00008000000A0014000001C000__000000C42FE73810", +"000000067F00008000000A0014000001C000-000000067F00008000000A00140000020000__000000C42FE73810", +"000000067F00008000000A0014000001C612-000000067F00008000000A00140000023364__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000020000-000000067F00008000000A00140000024000__000000C42FE73810", +"000000067F00008000000A00140000023364-000000067F00008000000A0014000002A070__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A00140000024000-000000067F00008000000A00140000028000__000000C42FE73810", +"000000067F00008000000A00140000028000-000000067F00008000000A0014000002C000__000000C42FE73810", +"000000067F00008000000A0014000002A070-030000000000000000000000000000000002__000000C367E48001-000000C3E17E01A1", +"000000067F00008000000A0014000002C000-030000000000000000000000000000000002__000000C42FE73810", +"000000067F00008000000A0016000000020E-030000000000000000000000000000000002__000000C3E17E01A1-000000C430961E71", +"000000067F00008000000A200C0000000000-000000067F00008000000A200C0000004000__000000C601294000", +"000000067F00008000000A200C0000004000-000000067F00008000000A200C0000008000__000000C601294000", +"000000067F00008000000A200C0000008000-000000067F00008000000A200C000000C000__000000C601294000", 
+"000000067F00008000000A200C0000009748-000000067F00008000000A200C0000012EAE__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000000C000-000000067F00008000000A200C0000010000__000000C601294000", +"000000067F00008000000A200C0000010000-000000067F00008000000A200C0000014000__000000C601294000", +"000000067F00008000000A200C0000012EAE-000000067F00008000000A200C000001C60A__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000014000-000000067F00008000000A200C0000018000__000000C601294000", +"000000067F00008000000A200C0000018000-000000067F00008000000A200C000001C000__000000C601294000", +"000000067F00008000000A200C000001C000-000000067F00008000000A200C0000020000__000000C601294000", +"000000067F00008000000A200C000001C60A-000000067F00008000000A200C0000025D38__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000020000-000000067F00008000000A200C0000024000__000000C601294000", +"000000067F00008000000A200C0000024000-000000067F00008000000A200C0000028000__000000C601294000", +"000000067F00008000000A200C0000025D38-000000067F00008000000A200C000002F49E__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000028000-000000067F00008000000A200C000002C000__000000C601294000", +"000000067F00008000000A200C000002C000-000000067F00008000000A200C0000030000__000000C601294000", +"000000067F00008000000A200C000002F49E-000000067F00008000000A200C0000038BB1__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000030000-000000067F00008000000A200C0000034000__000000C601294000", +"000000067F00008000000A200C0000034000-000000067F00008000000A200C0000038000__000000C601294000", +"000000067F00008000000A200C0000038000-000000067F00008000000A200C000003C000__000000C601294000", +"000000067F00008000000A200C0000038BB1-000000067F00008000000A200C0000042317__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000003C000-000000067F00008000000A200C0000040000__000000C601294000", +"000000067F00008000000A200C0000040000-000000067F00008000000A200C0000044000__000000C601294000", +"000000067F00008000000A200C0000042317-000000067F00008000000A200C000004BA7D__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000044000-000000067F00008000000A200C0000048000__000000C601294000", +"000000067F00008000000A200C0000048000-000000067F00008000000A200C000004C000__000000C601294000", +"000000067F00008000000A200C000004BA7D-000000067F00008000000A200C00000551B2__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C000004C000-000000067F00008000000A200C0000050000__000000C601294000", +"000000067F00008000000A200C0000050000-000000067F00008000000A200C0000054000__000000C601294000", +"000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C5FED35FC8", +"000000067F00008000000A200C0000054000-000000067F00008000000A200C0000058000__000000C6C7BD8140", +"000000067F00008000000A200C00000551B2-030000000000000000000000000000000002__000000C430961E71-000000C4C05DDB29", +"000000067F00008000000A200C0000055230-000000067F00008000000A200C000005E996__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C5FED35FC8", +"000000067F00008000000A200C0000058000-000000067F00008000000A200C000005C000__000000C6C7BD8140", +"000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C5FED35FC8", +"000000067F00008000000A200C000005C000-000000067F00008000000A200C0000060000__000000C6C7BD8140", +"000000067F00008000000A200C000005E996-000000067F00008000000A200C00000680FC__000000C4C05DDB29-000000C56021EB29", 
+"000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C5FED35FC8", +"000000067F00008000000A200C0000060000-000000067F00008000000A200C0000064000__000000C6C7BD8140", +"000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C5FED35FC8", +"000000067F00008000000A200C0000064000-000000067F00008000000A200C0000068000__000000C6C7BD8140", +"000000067F00008000000A200C00000677DB-000000067F00008000000A200C00000CF739__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C5FED35FC8", +"000000067F00008000000A200C0000068000-000000067F00008000000A200C000006C000__000000C6C7BD8140", +"000000067F00008000000A200C00000680FC-000000067F00008000000A200C000007180C__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C5FED35FC8", +"000000067F00008000000A200C000006C000-000000067F00008000000A200C0000070000__000000C6C7BD8140", +"000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C5FED35FC8", +"000000067F00008000000A200C0000070000-000000067F00008000000A200C0000074000__000000C6C7BD8140", +"000000067F00008000000A200C000007180C-000000067F00008000000A200C000007AF72__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C5FED35FC8", +"000000067F00008000000A200C0000074000-000000067F00008000000A200C0000078000__000000C6C7BD8140", +"000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C5FED35FC8", +"000000067F00008000000A200C0000078000-000000067F00008000000A200C000007C000__000000C6C7BD8140", +"000000067F00008000000A200C000007AF72-000000067F00008000000A200C00000846D8__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C5FED35FC8", +"000000067F00008000000A200C000007C000-000000067F00008000000A200C0000080000__000000C6C7BD8140", +"000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C5FED35FC8", +"000000067F00008000000A200C0000080000-000000067F00008000000A200C0000084000__000000C6C7BD8140", +"000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C5FED35FC8", +"000000067F00008000000A200C0000084000-000000067F00008000000A200C0000088000__000000C6C7BD8140", +"000000067F00008000000A200C00000846D8-000000067F00008000000A200C000008DE0B__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C5FED35FC8", +"000000067F00008000000A200C0000088000-000000067F00008000000A200C000008C000__000000C6C7BD8140", +"000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C5FED35FC8", +"000000067F00008000000A200C000008C000-000000067F00008000000A200C0000090000__000000C6C7BD8140", +"000000067F00008000000A200C000008DE0B-000000067F00008000000A200C000009752B__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C5FED35FC8", +"000000067F00008000000A200C0000090000-000000067F00008000000A200C0000094000__000000C6C7BD8140", +"000000067F00008000000A200C00000933F0-000000067F00008000000A200C0000110901__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C5FED35FC8", +"000000067F00008000000A200C0000094000-000000067F00008000000A200C0000098000__000000C6C7BD8140", 
+"000000067F00008000000A200C000009752B-000000067F00008000000A200C00000A0C91__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C5FED35FC8", +"000000067F00008000000A200C0000098000-000000067F00008000000A200C000009C000__000000C6C7BD8140", +"000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C5FED35FC8", +"000000067F00008000000A200C000009C000-000000067F00008000000A200C00000A0000__000000C6C7BD8140", +"000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C5FED35FC8", +"000000067F00008000000A200C00000A0000-000000067F00008000000A200C00000A4000__000000C6C7BD8140", +"000000067F00008000000A200C00000A0C91-000000067F00008000000A200C00000AA3F7__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C5FED35FC8", +"000000067F00008000000A200C00000A4000-000000067F00008000000A200C00000A8000__000000C6C7BD8140", +"000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C5FED35FC8", +"000000067F00008000000A200C00000A8000-000000067F00008000000A200C00000AC000__000000C6C7BD8140", +"000000067F00008000000A200C00000AA3F7-000000067F00008000000A200C00000B3B0C__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C5FED35FC8", +"000000067F00008000000A200C00000AC000-000000067F00008000000A200C00000B0000__000000C6C7BD8140", +"000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C5FED35FC8", +"000000067F00008000000A200C00000B0000-000000067F00008000000A200C00000B4000__000000C6C7BD8140", +"000000067F00008000000A200C00000B3B0C-000000067F00008000000A200C0100000000__000000C4C05DDB29-000000C56021EB29", +"000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C5FED35FC8", +"000000067F00008000000A200C00000B4000-000000067F00008000000A200C00000B8000__000000C6C7BD8140", +"000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C5FED35FC8", +"000000067F00008000000A200C00000B8000-000000067F00008000000A200C00000BC000__000000C6C7BD8140", +"000000067F00008000000A200C00000BBC1F-000000067F00008000000A200C00000C5353__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C5FED35FC8", +"000000067F00008000000A200C00000BC000-000000067F00008000000A200C00000C0000__000000C6C7BD8140", +"000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C5FED35FC8", +"000000067F00008000000A200C00000C0000-000000067F00008000000A200C00000C4000__000000C6C7BD8140", +"000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C5FED35FC8", +"000000067F00008000000A200C00000C4000-000000067F00008000000A200C00000C8000__000000C6C7BD8140", +"000000067F00008000000A200C00000C5353-000000067F00008000000A200C00000CEAB9__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C5FED35FC8", +"000000067F00008000000A200C00000C8000-000000067F00008000000A200C00000CC000__000000C6C7BD8140", +"000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C5FED35FC8", +"000000067F00008000000A200C00000CC000-000000067F00008000000A200C00000D0000__000000C6C7BD8140", +"000000067F00008000000A200C00000CEAB9-000000067F00008000000A200C00000D81D2__000000C56021EB29-000000C600A8FFF9", 
+"000000067F00008000000A200C00000CF742-000000067F00008000000A2014000000B47B__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C5FED35FC8", +"000000067F00008000000A200C00000D0000-000000067F00008000000A200C00000D4000__000000C6C7BD8140", +"000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C5FED35FC8", +"000000067F00008000000A200C00000D4000-000000067F00008000000A200C00000D8000__000000C6C7BD8140", +"000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C5FED35FC8", +"000000067F00008000000A200C00000D8000-000000067F00008000000A200C00000DC000__000000C6C7BD8140", +"000000067F00008000000A200C00000D81D2-000000067F00008000000A200C00000E190B__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C5FED35FC8", +"000000067F00008000000A200C00000DC000-000000067F00008000000A200C00000E0000__000000C6C7BD8140", +"000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C5FED35FC8", +"000000067F00008000000A200C00000E0000-000000067F00008000000A200C00000E4000__000000C6C7BD8140", +"000000067F00008000000A200C00000E190B-000000067F00008000000A200C00000EB071__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C5FED35FC8", +"000000067F00008000000A200C00000E4000-000000067F00008000000A200C00000E8000__000000C6C7BD8140", +"000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C5FED35FC8", +"000000067F00008000000A200C00000E8000-000000067F00008000000A200C00000EC000__000000C6C7BD8140", +"000000067F00008000000A200C00000EB071-000000067F00008000000A200C00000F47AC__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C5FED35FC8", +"000000067F00008000000A200C00000EC000-000000067F00008000000A200C00000F0000__000000C6C7BD8140", +"000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C5FED35FC8", +"000000067F00008000000A200C00000F0000-000000067F00008000000A200C00000F4000__000000C6C7BD8140", +"000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C5FED35FC8", +"000000067F00008000000A200C00000F4000-000000067F00008000000A200C00000F8000__000000C6C7BD8140", +"000000067F00008000000A200C00000F47AC-000000067F00008000000A200C00000FDF0A__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C5FED35FC8", +"000000067F00008000000A200C00000F8000-000000067F00008000000A200C00000FC000__000000C6C7BD8140", +"000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C5FED35FC8", +"000000067F00008000000A200C00000FC000-000000067F00008000000A200C0000100000__000000C6C7BD8140", +"000000067F00008000000A200C00000FDF0A-000000067F00008000000A200C000010762B__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C5FED35FC8", +"000000067F00008000000A200C0000100000-000000067F00008000000A200C0000104000__000000C6C7BD8140", +"000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C5FED35FC8", +"000000067F00008000000A200C0000104000-000000067F00008000000A200C0000108000__000000C6C7BD8140", +"000000067F00008000000A200C000010762B-000000067F00008000000A200C0000110D88__000000C56021EB29-000000C600A8FFF9", 
+"000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C5FED35FC8", +"000000067F00008000000A200C0000108000-000000067F00008000000A200C000010C000__000000C6C7BD8140", +"000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C5FED35FC8", +"000000067F00008000000A200C000010C000-000000067F00008000000A200C0000110000__000000C6C7BD8140", +"000000067F00008000000A200C0000110000-000000067F00008000000A20120100000000__000000C6C7BD8140", +"000000067F00008000000A200C0000110000-030000000000000000000000000000000002__000000C5FED35FC8", +"000000067F00008000000A200C0000110901-000000067F00008000000A201400000047CD__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A200C0000110D88-01000000000000000100000005000000000A__000000C56021EB29-000000C600A8FFF9", +"000000067F00008000000A20140000000000-000000067F00008000000A20140000004000__000000C6C7BD8140", +"000000067F00008000000A20140000004000-000000067F00008000000A20140000008000__000000C6C7BD8140", +"000000067F00008000000A201400000047CD-000000067F00008000000A2014000000ADA8__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000008000-000000067F00008000000A2014000000C000__000000C6C7BD8140", +"000000067F00008000000A2014000000ADA8-000000067F00008000000A201400000113B8__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A2014000000B47C-010000000000000001000000050100000000__000000C689AF4AC1-000000C6C87B6329", +"000000067F00008000000A2014000000C000-000000067F00008000000A20140000010000__000000C6C7BD8140", +"000000067F00008000000A20140000010000-000000067F00008000000A20140000014000__000000C6C7BD8140", +"000000067F00008000000A201400000113B8-000000067F00008000000A20140000017969__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000014000-000000067F00008000000A20140000018000__000000C6C7BD8140", +"000000067F00008000000A20140000017969-000000067F00008000000A2014000001DF7E__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000018000-000000067F00008000000A2014000001C000__000000C6C7BD8140", +"000000067F00008000000A2014000001C000-000000067F00008000000A20140000020000__000000C6C7BD8140", +"000000067F00008000000A2014000001DF7E-000000067F00008000000A2014000002457D__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000020000-000000067F00008000000A20140000024000__000000C6C7BD8140", +"000000067F00008000000A20140000024000-000000067F00008000000A20140000028000__000000C6C7BD8140", +"000000067F00008000000A2014000002457D-000000067F00008000000A2014000002AB1D__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A20140000028000-000000067F00008000000A2014000002C000__000000C6C7BD8140", +"000000067F00008000000A2014000002AB1D-030000000000000000000000000000000002__000000C600A8FFF9-000000C689AF4AC1", +"000000067F00008000000A2014000002C000-030000000000000000000000000000000002__000000C6C7BD8140", +"000000067F00008000000A400C0000000000-000000067F00008000000A400C0000004000__000000C896B8DFD8", +"000000067F00008000000A400C0000004000-000000067F00008000000A400C0000008000__000000C896B8DFD8", +"000000067F00008000000A400C0000008000-000000067F00008000000A400C000000C000__000000C896B8DFD8", +"000000067F00008000000A400C0000009743-000000067F00008000000A400C0000012EA9__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000000C000-000000067F00008000000A400C0000010000__000000C896B8DFD8", +"000000067F00008000000A400C0000010000-000000067F00008000000A400C0000014000__000000C896B8DFD8", 
+"000000067F00008000000A400C0000012EA9-000000067F00008000000A400C000001C60A__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000014000-000000067F00008000000A400C0000018000__000000C896B8DFD8", +"000000067F00008000000A400C0000018000-000000067F00008000000A400C000001C000__000000C896B8DFD8", +"000000067F00008000000A400C000001C000-000000067F00008000000A400C0000020000__000000C896B8DFD8", +"000000067F00008000000A400C000001C60A-000000067F00008000000A400C0000025D38__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000020000-000000067F00008000000A400C0000024000__000000C896B8DFD8", +"000000067F00008000000A400C0000024000-000000067F00008000000A400C0000028000__000000C896B8DFD8", +"000000067F00008000000A400C0000025D38-000000067F00008000000A400C000002F49E__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000028000-000000067F00008000000A400C000002C000__000000C896B8DFD8", +"000000067F00008000000A400C000002C000-000000067F00008000000A400C0000030000__000000C896B8DFD8", +"000000067F00008000000A400C000002F49E-000000067F00008000000A400C0000038BB1__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000030000-000000067F00008000000A400C0000034000__000000C896B8DFD8", +"000000067F00008000000A400C0000034000-000000067F00008000000A400C0000038000__000000C896B8DFD8", +"000000067F00008000000A400C0000038000-000000067F00008000000A400C000003C000__000000C896B8DFD8", +"000000067F00008000000A400C0000038BB1-000000067F00008000000A400C0000042317__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000003C000-000000067F00008000000A400C0000040000__000000C896B8DFD8", +"000000067F00008000000A400C0000040000-000000067F00008000000A400C0000044000__000000C896B8DFD8", +"000000067F00008000000A400C0000042317-000000067F00008000000A400C000004BA7D__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C0000044000-000000067F00008000000A400C0000048000__000000C896B8DFD8", +"000000067F00008000000A400C0000048000-000000067F00008000000A400C000004C000__000000C896B8DFD8", +"000000067F00008000000A400C000004BA7D-030000000000000000000000000000000002__000000C6C87B6329-000000C74849FAE1", +"000000067F00008000000A400C000004C000-000000067F00008000000A400C0000050000__000000C896B8DFD8", +"000000067F00008000000A400C0000050000-000000067F00008000000A400C0000054000__000000C896B8DFD8", +"000000067F00008000000A400C0000054000-000000067F00008000000A400C0000058000__000000C896B8DFD8", +"000000067F00008000000A400C00000551FC-000000067F00008000000A400C000005E90B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000058000-000000067F00008000000A400C000005C000__000000C896B8DFD8", +"000000067F00008000000A400C000005C000-000000067F00008000000A400C0000060000__000000C896B8DFD8", +"000000067F00008000000A400C000005E90B-000000067F00008000000A400C000006802B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000060000-000000067F00008000000A400C0000064000__000000C896B8DFD8", +"000000067F00008000000A400C0000064000-000000067F00008000000A400C0000068000__000000C896B8DFD8", +"000000067F00008000000A400C0000068000-000000067F00008000000A400C000006C000__000000C896B8DFD8", +"000000067F00008000000A400C000006802B-000000067F00008000000A400C0000071782__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C000006C000-000000067F00008000000A400C0000070000__000000C896B8DFD8", +"000000067F00008000000A400C0000070000-000000067F00008000000A400C0000074000__000000C896B8DFD8", +"000000067F00008000000A400C0000071782-000000067F00008000000A400C000007AEE8__000000C74849FAE1-000000C80801E859", 
+"000000067F00008000000A400C0000074000-000000067F00008000000A400C0000078000__000000C896B8DFD8", +"000000067F00008000000A400C0000078000-000000067F00008000000A400C000007C000__000000C896B8DFD8", +"000000067F00008000000A400C000007AEE8-000000067F00008000000A400C000008460B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C000007C000-000000067F00008000000A400C0000080000__000000C896B8DFD8", +"000000067F00008000000A400C0000080000-000000067F00008000000A400C0000084000__000000C896B8DFD8", +"000000067F00008000000A400C0000084000-000000067F00008000000A400C0000088000__000000C896B8DFD8", +"000000067F00008000000A400C000008460B-000000067F00008000000A400C000008DD71__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000088000-000000067F00008000000A400C000008C000__000000C896B8DFD8", +"000000067F00008000000A400C000008C000-000000067F00008000000A400C0000090000__000000C896B8DFD8", +"000000067F00008000000A400C000008DD71-000000067F00008000000A400C00000974D7__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000090000-000000067F00008000000A400C0000094000__000000C896B8DFD8", +"000000067F00008000000A400C0000094000-000000067F00008000000A400C0000098000__000000C896B8DFD8", +"000000067F00008000000A400C00000974D7-000000067F00008000000A400C00000A0C0B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C0000098000-000000067F00008000000A400C000009C000__000000C896B8DFD8", +"000000067F00008000000A400C000009C000-000000067F00008000000A400C00000A0000__000000C896B8DFD8", +"000000067F00008000000A400C00000A0000-000000067F00008000000A400C00000A4000__000000C896B8DFD8", +"000000067F00008000000A400C00000A0C0B-000000067F00008000000A400C00000AA371__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000A4000-000000067F00008000000A400C00000A8000__000000C896B8DFD8", +"000000067F00008000000A400C00000A8000-000000067F00008000000A400C00000AC000__000000C896B8DFD8", +"000000067F00008000000A400C00000AA371-000000067F00008000000A400C00000B3AD7__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000AC000-000000067F00008000000A400C00000B0000__000000C896B8DFD8", +"000000067F00008000000A400C00000B0000-000000067F00008000000A400C00000B4000__000000C896B8DFD8", +"000000067F00008000000A400C00000B3AD7-000000067F00008000000A400C00000BD20B__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000B4000-000000067F00008000000A400C00000B8000__000000C896B8DFD8", +"000000067F00008000000A400C00000B8000-000000067F00008000000A400C00000BC000__000000C896B8DFD8", +"000000067F00008000000A400C00000BC000-000000067F00008000000A400C00000C0000__000000C896B8DFD8", +"000000067F00008000000A400C00000BD20B-000000067F00008000000A400C0100000000__000000C74849FAE1-000000C80801E859", +"000000067F00008000000A400C00000C0000-000000067F00008000000A400C00000C4000__000000C896B8DFD8", +"000000067F00008000000A400C00000C4000-000000067F00008000000A400C00000C8000__000000C896B8DFD8", +"000000067F00008000000A400C00000C4AE6-000000067F00008000000A400C00000CE20C__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000C8000-000000067F00008000000A400C00000CC000__000000C896B8DFD8", +"000000067F00008000000A400C00000CC000-000000067F00008000000A400C00000D0000__000000C896B8DFD8", +"000000067F00008000000A400C00000CE20C-000000067F00008000000A400C00000D7929__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000D0000-000000067F00008000000A400C00000D4000__000000C896B8DFD8", +"000000067F00008000000A400C00000D4000-000000067F00008000000A400C00000D8000__000000C896B8DFD8", 
+"000000067F00008000000A400C00000D7929-000000067F00008000000A400C00000E108F__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000D8000-000000067F00008000000A400C00000DC000__000000C896B8DFD8", +"000000067F00008000000A400C00000DC000-000000067F00008000000A400C00000E0000__000000C896B8DFD8", +"000000067F00008000000A400C00000E0000-000000067F00008000000A400C00000E4000__000000C896B8DFD8", +"000000067F00008000000A400C00000E108F-000000067F00008000000A400C00000EA7F5__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000E4000-000000067F00008000000A400C00000E8000__000000C896B8DFD8", +"000000067F00008000000A400C00000E8000-000000067F00008000000A400C00000EC000__000000C896B8DFD8", +"000000067F00008000000A400C00000EA7F5-000000067F00008000000A400C00000F3F0B__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000EC000-000000067F00008000000A400C00000F0000__000000C896B8DFD8", +"000000067F00008000000A400C00000F0000-000000067F00008000000A400C00000F4000__000000C896B8DFD8", +"000000067F00008000000A400C00000F3F0B-000000067F00008000000A400C00000FD671__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C00000F4000-000000067F00008000000A400C00000F8000__000000C896B8DFD8", +"000000067F00008000000A400C00000F8000-000000067F00008000000A400C00000FC000__000000C896B8DFD8", +"000000067F00008000000A400C00000FC000-000000067F00008000000A400C0000100000__000000C896B8DFD8", +"000000067F00008000000A400C00000FD671-000000067F00008000000A400C0000106D95__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C0000100000-000000067F00008000000A400C0000104000__000000C896B8DFD8", +"000000067F00008000000A400C0000104000-000000067F00008000000A400C0000108000__000000C896B8DFD8", +"000000067F00008000000A400C0000106D95-000000067F00008000000A400C00001104FB__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A400C0000107F8F-000000067F00008000000A40140000005626__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A400C0000108000-000000067F00008000000A400C000010C000__000000C896B8DFD8", +"000000067F00008000000A400C000010C000-000000067F00008000000A400C0000110000__000000C896B8DFD8", +"000000067F00008000000A400C0000110000-030000000000000000000000000000000002__000000C896B8DFD8", +"000000067F00008000000A400C00001104FB-01000000000000000100000005000000000D__000000C80801E859-000000C8993EBFF9", +"000000067F00008000000A40140000005626-000000067F00008000000A4014000000C7F9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A4014000000C7F9-000000067F00008000000A401400000139F8__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A401400000139F8-000000067F00008000000A4014000001ABE9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A4014000001ABE9-000000067F00008000000A40140000021DF4__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A40140000021DF4-000000067F00008000000A40140000028FA9__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A40140000028FA9-030000000000000000000000000000000002__000000C8993EBFF9-000000C90726D0D9", +"000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CA2C877DC8", +"000000067F00008000000A600C0000000000-000000067F00008000000A600C0000004000__000000CB82C2FF68", +"000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CA2C877DC8", +"000000067F00008000000A600C0000004000-000000067F00008000000A600C0000008000__000000CB82C2FF68", +"000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CA2C877DC8", 
+"000000067F00008000000A600C0000008000-000000067F00008000000A600C000000C000__000000CB82C2FF68", +"000000067F00008000000A600C0000009746-000000067F00008000000A600C0000012EAC__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CA2C877DC8", +"000000067F00008000000A600C000000C000-000000067F00008000000A600C0000010000__000000CB82C2FF68", +"000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CA2C877DC8", +"000000067F00008000000A600C0000010000-000000067F00008000000A600C0000014000__000000CB82C2FF68", +"000000067F00008000000A600C0000012EAC-000000067F00008000000A600C000001C60A__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CA2C877DC8", +"000000067F00008000000A600C0000014000-000000067F00008000000A600C0000018000__000000CB82C2FF68", +"000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CA2C877DC8", +"000000067F00008000000A600C0000018000-000000067F00008000000A600C000001C000__000000CB82C2FF68", +"000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CA2C877DC8", +"000000067F00008000000A600C000001C000-000000067F00008000000A600C0000020000__000000CB82C2FF68", +"000000067F00008000000A600C000001C60A-000000067F00008000000A600C0000025D38__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CA2C877DC8", +"000000067F00008000000A600C0000020000-000000067F00008000000A600C0000024000__000000CB82C2FF68", +"000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CA2C877DC8", +"000000067F00008000000A600C0000024000-000000067F00008000000A600C0000028000__000000CB82C2FF68", +"000000067F00008000000A600C0000025D38-000000067F00008000000A600C000002F49E__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CA2C877DC8", +"000000067F00008000000A600C0000028000-000000067F00008000000A600C000002C000__000000CB82C2FF68", +"000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CA2C877DC8", +"000000067F00008000000A600C000002C000-000000067F00008000000A600C0000030000__000000CB82C2FF68", +"000000067F00008000000A600C000002F49E-000000067F00008000000A600C0000038BB1__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CA2C877DC8", +"000000067F00008000000A600C0000030000-000000067F00008000000A600C0000034000__000000CB82C2FF68", +"000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CA2C877DC8", +"000000067F00008000000A600C0000034000-000000067F00008000000A600C0000038000__000000CB82C2FF68", +"000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CA2C877DC8", +"000000067F00008000000A600C0000038000-000000067F00008000000A600C000003C000__000000CB82C2FF68", +"000000067F00008000000A600C0000038BB1-000000067F00008000000A600C0000042317__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CA2C877DC8", +"000000067F00008000000A600C000003C000-000000067F00008000000A600C0000040000__000000CB82C2FF68", +"000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CA2C877DC8", +"000000067F00008000000A600C0000040000-000000067F00008000000A600C0000044000__000000CB82C2FF68", 
+"000000067F00008000000A600C0000042317-000000067F00008000000A600C000004BA7D__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CA2C877DC8", +"000000067F00008000000A600C0000044000-000000067F00008000000A600C0000048000__000000CB82C2FF68", +"000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CA2C877DC8", +"000000067F00008000000A600C0000048000-000000067F00008000000A600C000004C000__000000CB82C2FF68", +"000000067F00008000000A600C000004BA7D-030000000000000000000000000000000002__000000C90726D0D9-000000C986F5F0D9", +"000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CA2C877DC8", +"000000067F00008000000A600C000004C000-000000067F00008000000A600C0000050000__000000CB82C2FF68", +"000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CA2C877DC8", +"000000067F00008000000A600C0000050000-000000067F00008000000A600C0000054000__000000CB82C2FF68", +"000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CA2C877DC8", +"000000067F00008000000A600C0000054000-000000067F00008000000A600C0000058000__000000CB82C2FF68", +"000000067F00008000000A600C0000054BFB-000000067F00008000000A600C000005E30C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CA2C877DC8", +"000000067F00008000000A600C0000058000-000000067F00008000000A600C000005C000__000000CB82C2FF68", +"000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CA2C877DC8", +"000000067F00008000000A600C000005C000-000000067F00008000000A600C0000060000__000000CB82C2FF68", +"000000067F00008000000A600C000005E30C-000000067F00008000000A600C0000067A2B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CA2C877DC8", +"000000067F00008000000A600C0000060000-000000067F00008000000A600C0000064000__000000CB82C2FF68", +"000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CA2C877DC8", +"000000067F00008000000A600C0000064000-000000067F00008000000A600C0000068000__000000CB82C2FF68", +"000000067F00008000000A600C0000067A2B-000000067F00008000000A600C0000071186__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CA2C877DC8", +"000000067F00008000000A600C0000068000-000000067F00008000000A600C000006C000__000000CB82C2FF68", +"000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CA2C877DC8", +"000000067F00008000000A600C000006C000-000000067F00008000000A600C0000070000__000000CB82C2FF68", +"000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CA2C877DC8", +"000000067F00008000000A600C0000070000-000000067F00008000000A600C0000074000__000000CB82C2FF68", +"000000067F00008000000A600C0000071186-000000067F00008000000A600C000007A8EC__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CA2C877DC8", +"000000067F00008000000A600C0000074000-000000067F00008000000A600C0000078000__000000CB82C2FF68", +"000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CA2C877DC8", +"000000067F00008000000A600C0000078000-000000067F00008000000A600C000007C000__000000CB82C2FF68", +"000000067F00008000000A600C000007A149-000000067F00008000000A600C00000F5F42__000000CB40C16489-000000CB82C37859", 
+"000000067F00008000000A600C000007A8EC-000000067F00008000000A600C000008400A__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CA2C877DC8", +"000000067F00008000000A600C000007C000-000000067F00008000000A600C0000080000__000000CB82C2FF68", +"000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CA2C877DC8", +"000000067F00008000000A600C0000080000-000000067F00008000000A600C0000084000__000000CB82C2FF68", +"000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CA2C877DC8", +"000000067F00008000000A600C0000084000-000000067F00008000000A600C0000088000__000000CB82C2FF68", +"000000067F00008000000A600C000008400A-000000067F00008000000A600C000008D770__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CA2C877DC8", +"000000067F00008000000A600C0000088000-000000067F00008000000A600C000008C000__000000CB82C2FF68", +"000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CA2C877DC8", +"000000067F00008000000A600C000008C000-000000067F00008000000A600C0000090000__000000CB82C2FF68", +"000000067F00008000000A600C000008D770-000000067F00008000000A600C0000096ED6__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CA2C877DC8", +"000000067F00008000000A600C0000090000-000000067F00008000000A600C0000094000__000000CB82C2FF68", +"000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CA2C877DC8", +"000000067F00008000000A600C0000094000-000000067F00008000000A600C0000098000__000000CB82C2FF68", +"000000067F00008000000A600C0000096ED6-000000067F00008000000A600C00000A060B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CA2C877DC8", +"000000067F00008000000A600C0000098000-000000067F00008000000A600C000009C000__000000CB82C2FF68", +"000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CA2C877DC8", +"000000067F00008000000A600C000009C000-000000067F00008000000A600C00000A0000__000000CB82C2FF68", +"000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CA2C877DC8", +"000000067F00008000000A600C00000A0000-000000067F00008000000A600C00000A4000__000000CB82C2FF68", +"000000067F00008000000A600C00000A060B-000000067F00008000000A600C00000A9D71__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CA2C877DC8", +"000000067F00008000000A600C00000A4000-000000067F00008000000A600C00000A8000__000000CB82C2FF68", +"000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CA2C877DC8", +"000000067F00008000000A600C00000A8000-000000067F00008000000A600C00000AC000__000000CB82C2FF68", +"000000067F00008000000A600C00000A9D71-000000067F00008000000A600C00000B34D7__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000AC000-000000067F00008000000A600C00000B0000__000000CB82C2FF68", +"000000067F00008000000A600C00000AC000-030000000000000000000000000000000002__000000CA2C877DC8", +"000000067F00008000000A600C00000B0000-000000067F00008000000A600C00000B4000__000000CB82C2FF68", +"000000067F00008000000A600C00000B34D7-000000067F00008000000A600C00000BCC0C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000B4000-000000067F00008000000A600C00000B8000__000000CB82C2FF68", 
+"000000067F00008000000A600C00000B8000-000000067F00008000000A600C00000BC000__000000CB82C2FF68", +"000000067F00008000000A600C00000BC000-000000067F00008000000A600C00000C0000__000000CB82C2FF68", +"000000067F00008000000A600C00000BCC0C-000000067F00008000000A600C00000C6336__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000C0000-000000067F00008000000A600C00000C4000__000000CB82C2FF68", +"000000067F00008000000A600C00000C4000-000000067F00008000000A600C00000C8000__000000CB82C2FF68", +"000000067F00008000000A600C00000C6336-000000067F00008000000A600C00000CFA9C__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000C8000-000000067F00008000000A600C00000CC000__000000CB82C2FF68", +"000000067F00008000000A600C00000CC000-000000067F00008000000A600C00000D0000__000000CB82C2FF68", +"000000067F00008000000A600C00000CFA9C-000000067F00008000000A600C00000D91AB__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000D0000-000000067F00008000000A600C00000D4000__000000CB82C2FF68", +"000000067F00008000000A600C00000D4000-000000067F00008000000A600C00000D8000__000000CB82C2FF68", +"000000067F00008000000A600C00000D8000-000000067F00008000000A600C00000DC000__000000CB82C2FF68", +"000000067F00008000000A600C00000D91AB-000000067F00008000000A600C00000E2911__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000DC000-000000067F00008000000A600C00000E0000__000000CB82C2FF68", +"000000067F00008000000A600C00000E0000-000000067F00008000000A600C00000E4000__000000CB82C2FF68", +"000000067F00008000000A600C00000E2911-000000067F00008000000A600C00000EC077__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000E4000-000000067F00008000000A600C00000E8000__000000CB82C2FF68", +"000000067F00008000000A600C00000E8000-000000067F00008000000A600C00000EC000__000000CB82C2FF68", +"000000067F00008000000A600C00000EC000-000000067F00008000000A600C00000F0000__000000CB82C2FF68", +"000000067F00008000000A600C00000EC077-000000067F00008000000A600C00000F57A8__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000F0000-000000067F00008000000A600C00000F4000__000000CB82C2FF68", +"000000067F00008000000A600C00000F4000-000000067F00008000000A600C00000F8000__000000CB82C2FF68", +"000000067F00008000000A600C00000F57A8-000000067F00008000000A600C00000FEF0A__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C00000F5F4F-000000067F00008000000A60140000011158__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A600C00000F8000-000000067F00008000000A600C00000FC000__000000CB82C2FF68", +"000000067F00008000000A600C00000FC000-000000067F00008000000A600C0000100000__000000CB82C2FF68", +"000000067F00008000000A600C00000FEF0A-000000067F00008000000A600C000010862B__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C0000100000-000000067F00008000000A600C0000104000__000000CB82C2FF68", +"000000067F00008000000A600C0000104000-000000067F00008000000A600C0000108000__000000CB82C2FF68", +"000000067F00008000000A600C0000108000-000000067F00008000000A600C000010C000__000000CB82C2FF68", +"000000067F00008000000A600C000010862B-000000067F00008000000A600C0000111C20__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A600C000010C000-000000067F00008000000A600C0000110000__000000CB82C2FF68", +"000000067F00008000000A600C0000110000-000000067F00008000000A60120100000000__000000CB82C2FF68", +"000000067F00008000000A600C00001117CB-000000067F00008000000A6014000000499B__000000CAD5D7FFF1-000000CB40C16489", 
+"000000067F00008000000A600C00FFFFFFFF-01000000000000000100000005000000000E__000000C986F5F0D9-000000CAD5D7FFF1", +"000000067F00008000000A60140000000000-000000067F00008000000A60140000004000__000000CB82C2FF68", +"000000067F00008000000A60140000004000-000000067F00008000000A60140000008000__000000CB82C2FF68", +"000000067F00008000000A6014000000499B-000000067F00008000000A6014000000BD4E__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000008000-000000067F00008000000A6014000000C000__000000CB82C2FF68", +"000000067F00008000000A6014000000BD4E-000000067F00008000000A601400000130ED__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A6014000000C000-000000067F00008000000A60140000010000__000000CB82C2FF68", +"000000067F00008000000A60140000010000-000000067F00008000000A60140000014000__000000CB82C2FF68", +"000000067F00008000000A60140000011159-000000067F00008000000A60140000029BB2__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A601400000130ED-000000067F00008000000A6014000001A4BD__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000014000-000000067F00008000000A60140000018000__000000CB82C2FF68", +"000000067F00008000000A60140000018000-000000067F00008000000A6014000001C000__000000CB82C2FF68", +"000000067F00008000000A6014000001A4BD-000000067F00008000000A60140000021886__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A6014000001C000-000000067F00008000000A60140000020000__000000CB82C2FF68", +"000000067F00008000000A60140000020000-000000067F00008000000A60140000024000__000000CB82C2FF68", +"000000067F00008000000A60140000021886-000000067F00008000000A60140000028C0A__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000024000-000000067F00008000000A60140000028000__000000CB82C2FF68", +"000000067F00008000000A60140000028000-000000067F00008000000A6014000002C000__000000CB82C2FF68", +"000000067F00008000000A60140000028C0A-030000000000000000000000000000000002__000000CAD5D7FFF1-000000CB40C16489", +"000000067F00008000000A60140000029BB2-030000000000000000000000000000000002__000000CB40C16489-000000CB82C37859", +"000000067F00008000000A6014000002C000-030000000000000000000000000000000002__000000CB82C2FF68", +"000000067F00008000000A800C0000000000-000000067F00008000000A800C0000004000__000000CD51009FE8", +"000000067F00008000000A800C0000004000-000000067F00008000000A800C0000008000__000000CD51009FE8", +"000000067F00008000000A800C0000008000-000000067F00008000000A800C000000C000__000000CD51009FE8", +"000000067F00008000000A800C0000009748-000000067F00008000000A800C0000012EAE__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000000C000-000000067F00008000000A800C0000010000__000000CD51009FE8", +"000000067F00008000000A800C0000010000-000000067F00008000000A800C0000014000__000000CD51009FE8", +"000000067F00008000000A800C0000012EAE-000000067F00008000000A800C000001C60A__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000014000-000000067F00008000000A800C0000018000__000000CD51009FE8", +"000000067F00008000000A800C0000018000-000000067F00008000000A800C000001C000__000000CD51009FE8", +"000000067F00008000000A800C000001C000-000000067F00008000000A800C0000020000__000000CD51009FE8", +"000000067F00008000000A800C000001C60A-000000067F00008000000A800C0000025D38__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000020000-000000067F00008000000A800C0000024000__000000CD51009FE8", +"000000067F00008000000A800C0000024000-000000067F00008000000A800C0000028000__000000CD51009FE8", 
+"000000067F00008000000A800C0000025D38-000000067F00008000000A800C000002F49E__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000028000-000000067F00008000000A800C000002C000__000000CD51009FE8", +"000000067F00008000000A800C000002C000-000000067F00008000000A800C0000030000__000000CD51009FE8", +"000000067F00008000000A800C000002F49E-000000067F00008000000A800C0000038BB1__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000030000-000000067F00008000000A800C0000034000__000000CD51009FE8", +"000000067F00008000000A800C0000034000-000000067F00008000000A800C0000038000__000000CD51009FE8", +"000000067F00008000000A800C0000038000-000000067F00008000000A800C000003C000__000000CD51009FE8", +"000000067F00008000000A800C0000038BB1-000000067F00008000000A800C0000042317__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000003C000-000000067F00008000000A800C0000040000__000000CD51009FE8", +"000000067F00008000000A800C0000040000-000000067F00008000000A800C0000044000__000000CD51009FE8", +"000000067F00008000000A800C0000042317-000000067F00008000000A800C000004BA7D__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C0000044000-000000067F00008000000A800C0000048000__000000CD51009FE8", +"000000067F00008000000A800C0000048000-000000067F00008000000A800C000004C000__000000CD51009FE8", +"000000067F00008000000A800C000004BA7D-000000067F00008000000A800C0000054CA0__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800C000004C000-000000067F00008000000A800C0000050000__000000CD51009FE8", +"000000067F00008000000A800C0000050000-000000067F00008000000A800C0000054000__000000CD51009FE8", +"000000067F00008000000A800C0000054000-000000067F00008000000A800C0000058000__000000CD51009FE8", +"000000067F00008000000A800C0000054C9F-000000067F00008000000A800C000005E405__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000058000-000000067F00008000000A800C000005C000__000000CD51009FE8", +"000000067F00008000000A800C000005C000-000000067F00008000000A800C0000060000__000000CD51009FE8", +"000000067F00008000000A800C000005E405-000000067F00008000000A800C0000067B10__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000060000-000000067F00008000000A800C0000064000__000000CD51009FE8", +"000000067F00008000000A800C0000064000-000000067F00008000000A800C0000068000__000000CD51009FE8", +"000000067F00008000000A800C0000067B10-000000067F00008000000A800C0000071276__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000068000-000000067F00008000000A800C000006C000__000000CD51009FE8", +"000000067F00008000000A800C000006C000-000000067F00008000000A800C0000070000__000000CD51009FE8", +"000000067F00008000000A800C0000070000-000000067F00008000000A800C0000074000__000000CD51009FE8", +"000000067F00008000000A800C0000071276-000000067F00008000000A800C000007A9DC__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000074000-000000067F00008000000A800C0000078000__000000CD51009FE8", +"000000067F00008000000A800C0000078000-000000067F00008000000A800C000007C000__000000CD51009FE8", +"000000067F00008000000A800C000007A9DC-000000067F00008000000A800C000008410B__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C000007C000-000000067F00008000000A800C0000080000__000000CD51009FE8", +"000000067F00008000000A800C0000080000-000000067F00008000000A800C0000084000__000000CD51009FE8", +"000000067F00008000000A800C0000084000-000000067F00008000000A800C0000088000__000000CD51009FE8", +"000000067F00008000000A800C000008410B-000000067F00008000000A800C000008D871__000000CC11F5EDC9-000000CCB1B9E181", 
+"000000067F00008000000A800C0000088000-000000067F00008000000A800C000008C000__000000CD51009FE8", +"000000067F00008000000A800C000008C000-000000067F00008000000A800C0000090000__000000CD51009FE8", +"000000067F00008000000A800C000008D871-000000067F00008000000A800C0000096F94__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000090000-000000067F00008000000A800C0000094000__000000CD51009FE8", +"000000067F00008000000A800C0000094000-000000067F00008000000A800C0000098000__000000CD51009FE8", +"000000067F00008000000A800C0000096F94-000000067F00008000000A800C00000A06FA__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C0000098000-000000067F00008000000A800C000009C000__000000CD51009FE8", +"000000067F00008000000A800C000009C000-000000067F00008000000A800C00000A0000__000000CD51009FE8", +"000000067F00008000000A800C00000A0000-000000067F00008000000A800C00000A4000__000000CD51009FE8", +"000000067F00008000000A800C00000A06FA-000000067F00008000000A800C00000A9E0D__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000A4000-000000067F00008000000A800C00000A8000__000000CD51009FE8", +"000000067F00008000000A800C00000A8000-000000067F00008000000A800C00000AC000__000000CD51009FE8", +"000000067F00008000000A800C00000A9E0D-000000067F00008000000A800C00000B3553__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000AC000-000000067F00008000000A800C00000B0000__000000CD51009FE8", +"000000067F00008000000A800C00000B0000-000000067F00008000000A800C00000B4000__000000CD51009FE8", +"000000067F00008000000A800C00000B3553-000000067F00008000000A800C0100000000__000000CC11F5EDC9-000000CCB1B9E181", +"000000067F00008000000A800C00000B4000-000000067F00008000000A800C00000B8000__000000CD51009FE8", +"000000067F00008000000A800C00000B8000-000000067F00008000000A800C00000BC000__000000CD51009FE8", +"000000067F00008000000A800C00000BC000-000000067F00008000000A800C00000C0000__000000CD51009FE8", +"000000067F00008000000A800C00000BCB46-000000067F00008000000A800C00000C62AC__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000C0000-000000067F00008000000A800C00000C4000__000000CD51009FE8", +"000000067F00008000000A800C00000C4000-000000067F00008000000A800C00000C8000__000000CD51009FE8", +"000000067F00008000000A800C00000C62AC-000000067F00008000000A800C00000CFA09__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000C8000-000000067F00008000000A800C00000CC000__000000CD51009FE8", +"000000067F00008000000A800C00000CC000-000000067F00008000000A800C00000D0000__000000CD51009FE8", +"000000067F00008000000A800C00000CFA09-000000067F00008000000A800C00000D9118__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000D0000-000000067F00008000000A800C00000D4000__000000CD51009FE8", +"000000067F00008000000A800C00000D4000-000000067F00008000000A800C00000D8000__000000CD51009FE8", +"000000067F00008000000A800C00000D8000-000000067F00008000000A800C00000DC000__000000CD51009FE8", +"000000067F00008000000A800C00000D9118-000000067F00008000000A800C00000E287E__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000DC000-000000067F00008000000A800C00000E0000__000000CD51009FE8", +"000000067F00008000000A800C00000E0000-000000067F00008000000A800C00000E4000__000000CD51009FE8", +"000000067F00008000000A800C00000E287E-000000067F00008000000A800C00000EBFE4__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000E4000-000000067F00008000000A800C00000E8000__000000CD51009FE8", +"000000067F00008000000A800C00000E8000-000000067F00008000000A800C00000EC000__000000CD51009FE8", 
+"000000067F00008000000A800C00000EBFE4-000000067F00008000000A800C00000F570B__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000EC000-000000067F00008000000A800C00000F0000__000000CD51009FE8", +"000000067F00008000000A800C00000F0000-000000067F00008000000A800C00000F4000__000000CD51009FE8", +"000000067F00008000000A800C00000F4000-000000067F00008000000A800C00000F8000__000000CD51009FE8", +"000000067F00008000000A800C00000F570B-000000067F00008000000A800C00000FEE71__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00000F8000-000000067F00008000000A800C00000FC000__000000CD51009FE8", +"000000067F00008000000A800C00000FC000-000000067F00008000000A800C0000100000__000000CD51009FE8", +"000000067F00008000000A800C00000FEE71-000000067F00008000000A800C0000108587__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C0000100000-000000067F00008000000A800C0000104000__000000CD51009FE8", +"000000067F00008000000A800C0000104000-000000067F00008000000A800C0000108000__000000CD51009FE8", +"000000067F00008000000A800C0000108000-000000067F00008000000A800C000010C000__000000CD51009FE8", +"000000067F00008000000A800C0000108587-000000067F00008000000A800C0000111C20__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C000010C000-000000067F00008000000A800C0000110000__000000CD51009FE8", +"000000067F00008000000A800C0000110000-030000000000000000000000000000000002__000000CD51009FE8", +"000000067F00008000000A800C00FFFFFFFF-010000000000000001000000050000000011__000000CCB1B9E181-000000CD51344F89", +"000000067F00008000000A800C00FFFFFFFF-030000000000000000000000000000000002__000000CB82C37859-000000CC11F5EDC9", +"000000067F00008000000A800F0200000000-000000067F00008000000A80140000007ADF__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A80140000007ADF-000000067F00008000000A8014000000F7D0__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A8014000000F7D0-000000067F00008000000A801400000176D0__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A801400000176D0-000000067F00008000000A8014000001F5D2__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A8014000001F5D2-000000067F00008000000A801400000274D5__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000A801400000274D5-000000067F00008000000AA00C0000001863__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C0000000000-000000067F00008000000AA00C0000004000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000001863-000000067F00008000000AA00C000000AFC9__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C0000004000-000000067F00008000000AA00C0000008000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000008000-000000067F00008000000AA00C000000C000__000000CF7E08BFD0", +"000000067F00008000000AA00C000000AFC9-030000000000000000000000000000000002__000000CD51344F89-000000CDCC7BF889", +"000000067F00008000000AA00C000000C000-000000067F00008000000AA00C0000010000__000000CF7E08BFD0", +"000000067F00008000000AA00C0000010000-000000067F00008000000AA00C0000014000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000126EC-000000067F00008000000AA00C000001BE0C__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000014000-000000067F00008000000AA00C0000018000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000018000-000000067F00008000000AA00C000001C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000001BE0C-000000067F00008000000AA00C000002553F__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000001C000-000000067F00008000000AA00C0000020000__000000CF7B8D3FD0", 
+"000000067F00008000000AA00C0000020000-000000067F00008000000AA00C0000024000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000024000-000000067F00008000000AA00C0000028000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002553F-000000067F00008000000AA00C000002ECA5__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000028000-000000067F00008000000AA00C000002C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002C000-000000067F00008000000AA00C0000030000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000002ECA5-000000067F00008000000AA00C00000383BC__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000030000-000000067F00008000000AA00C0000034000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000034000-000000067F00008000000AA00C0000038000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000038000-000000067F00008000000AA00C000003C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000383BC-000000067F00008000000AA00C0000041B0A__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000003C000-000000067F00008000000AA00C0000040000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000040000-000000067F00008000000AA00C0000044000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000041B0A-000000067F00008000000AA00C000004B270__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000044000-000000067F00008000000AA00C0000048000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000048000-000000067F00008000000AA00C000004C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000004B270-000000067F00008000000AA00C00000549AA__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C000004C000-000000067F00008000000AA00C0000050000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000050000-000000067F00008000000AA00C0000054000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000054000-000000067F00008000000AA00C0000058000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000549AA-000000067F00008000000AA00C000005E10B__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000058000-000000067F00008000000AA00C000005C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000005C000-000000067F00008000000AA00C0000060000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000005E10B-000000067F00008000000AA00C000006782C__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000060000-000000067F00008000000AA00C0000064000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000064000-000000067F00008000000AA00C0000068000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000006782C-000000067F00008000000AA00C0000070F88__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000068000-000000067F00008000000AA00C000006C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000006C000-000000067F00008000000AA00C0000070000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000070000-000000067F00008000000AA00C0000074000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000070F88-000000067F00008000000AA00C0100000000__000000CDCC7BF889-000000CE6C3FED31", +"000000067F00008000000AA00C0000074000-000000067F00008000000AA00C0000078000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000078000-000000067F00008000000AA00C000007C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000078E97-000000067F00008000000AA00C00000823F9__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C000007C000-000000067F00008000000AA00C0000080000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000080000-000000067F00008000000AA00C0000084000__000000CF7B8D3FD0", 
+"000000067F00008000000AA00C00000823F9-000000067F00008000000AA00C000008BA8A__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000084000-000000067F00008000000AA00C0000088000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000088000-000000067F00008000000AA00C000008C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000008BA8A-000000067F00008000000AA00C00000951BF__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C000008C000-000000067F00008000000AA00C0000090000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000090000-000000067F00008000000AA00C0000094000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000094000-000000067F00008000000AA00C0000098000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000951BF-000000067F00008000000AA00C000009E90A__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000098000-000000067F00008000000AA00C000009C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000009C000-000000067F00008000000AA00C00000A0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000009E90A-000000067F00008000000AA00C00000A802B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000A0000-000000067F00008000000AA00C00000A4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A4000-000000067F00008000000AA00C00000A8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A8000-000000067F00008000000AA00C00000AC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000A802B-000000067F00008000000AA00C00000B1782__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000AC000-000000067F00008000000AA00C00000B0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B0000-000000067F00008000000AA00C00000B4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B1782-000000067F00008000000AA00C00000BAEE8__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000B4000-000000067F00008000000AA00C00000B8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000B8000-000000067F00008000000AA00C00000BC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000BAEE8-000000067F00008000000AA00C00000C460C__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000BC000-000000067F00008000000AA00C00000C0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C0000-000000067F00008000000AA00C00000C4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C4000-000000067F00008000000AA00C00000C8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000C460C-000000067F00008000000AA00C00000CDD72__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000C8000-000000067F00008000000AA00C00000CC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000CC000-000000067F00008000000AA00C00000D0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000CDD72-000000067F00008000000AA00C00000D74D8__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000D0000-000000067F00008000000AA00C00000D4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000D4000-000000067F00008000000AA00C00000D8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000D74D8-000000067F00008000000AA00C00000E0C0B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000D8000-000000067F00008000000AA00C00000DC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000DC000-000000067F00008000000AA00C00000E0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E0000-000000067F00008000000AA00C00000E4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E0C0B-000000067F00008000000AA00C00000EA371__000000CE6C3FED31-000000CF7DC97FD1", 
+"000000067F00008000000AA00C00000E4000-000000067F00008000000AA00C00000E8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000E8000-000000067F00008000000AA00C00000EC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000EA371-000000067F00008000000AA00C00000F3AD7__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000EC000-000000067F00008000000AA00C00000F0000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F0000-000000067F00008000000AA00C00000F4000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F3AD7-000000067F00008000000AA00C00000FD20B__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C00000F4000-000000067F00008000000AA00C00000F8000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000F8000-000000067F00008000000AA00C00000FC000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000FC000-000000067F00008000000AA00C0000100000__000000CF7B8D3FD0", +"000000067F00008000000AA00C00000FD20B-000000067F00008000000AA00C0000106932__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000100000-000000067F00008000000AA00C0000104000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000104000-000000067F00008000000AA00C0000108000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000106932-000000067F00008000000AA00C0000110098__000000CE6C3FED31-000000CF7DC97FD1", +"000000067F00008000000AA00C0000108000-000000067F00008000000AA00C000010C000__000000CF7B8D3FD0", +"000000067F00008000000AA00C000010C000-000000067F00008000000AA00C0000110000__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000110000-030000000000000000000000000000000002__000000CF7B8D3FD0", +"000000067F00008000000AA00C0000110098-010000000000000001000000050000000012__000000CE6C3FED31-000000CF7DC97FD1", +"010000000000000001000000000000000000-030000000000000000000000000000000002__000000A29F1D8950", +"030000000000000000000000000000000001-030000000000000000000000000000000002__000000C689AF4AC1-000000C6C87B6329", +]; diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 1ee48eb2fc..973c3cd3a6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,45 +10,63 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! -use anyhow::{Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; -use log::*; +use fail::fail_point; +use itertools::Itertools; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; use std::sync::Arc; use std::time::SystemTime; use tar::{Builder, EntryType, Header}; +use tracing::*; -use crate::relish::*; -use crate::repository::Timeline; -use postgres_ffi::xlog_utils::*; -use postgres_ffi::*; -use zenith_utils::lsn::Lsn; +use crate::tenant::Timeline; +use pageserver_api::reltag::{RelTag, SlruKind}; + +use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; +use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::TransactionId; +use postgres_ffi::XLogFileName; +use postgres_ffi::PG_TLI; +use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; +use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. 
-pub struct Basebackup<'a> { - ar: Builder<&'a mut dyn Write>, - timeline: &'a Arc, +pub struct Basebackup<'a, W> +where + W: Write, +{ + ar: Builder>, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, + full_backup: bool, + finished: bool, } -// Create basebackup with non-rel data in it. Omit relational data. +// Create basebackup with non-rel data in it. +// Only include relational data if 'full_backup' is true. // // Currently we use empty lsn in two cases: // * During the basebackup right after timeline creation // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, W> Basebackup<'a, W> +where + W: Write, +{ pub fn new( - write: &'a mut dyn Write, - timeline: &'a Arc, + write: W, + timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + prev_lsn: Option, + full_backup: bool, + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -63,8 +81,8 @@ impl<'a> Basebackup<'a> { // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { - // Backup was requested at a particular LSN. Wait for it to arrive. - timeline.wait_lsn(req_lsn)?; + // Backup was requested at a particular LSN. The caller should've + // already checked that it's a valid LSN. // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as @@ -82,30 +100,44 @@ impl<'a> Basebackup<'a> { (end_of_timeline.prev, end_of_timeline.last) }; + // Consolidate the derived and the provided prev_lsn values + let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + if backup_prev != Lsn(0) { + ensure!(backup_prev == provided_prev_lsn) + } + provided_prev_lsn + } else { + backup_prev + }; + info!( - "taking basebackup lsn={}, prev_lsn={}", - backup_lsn, backup_prev + "taking basebackup lsn={}, prev_lsn={} (full_backup={})", + backup_lsn, prev_lsn, full_backup ); Ok(Basebackup { - ar: Builder::new(write), + ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, - prev_record_lsn: backup_prev, + prev_record_lsn: prev_lsn, + full_backup, + finished: false, }) } - pub fn send_tarball(&mut self) -> anyhow::Result<()> { + pub fn send_tarball(mut self) -> anyhow::Result<()> { + // TODO include checksum + // Create pgdata subdirs structure - for dir in pg_constants::PGDATA_SUBDIRS.iter() { + for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; self.ar.append(&header, &mut io::empty())?; } // Send empty config files. - for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() { + for filepath in PGDATA_SPECIAL_FILES.iter() { if *filepath == "pg_hba.conf" { - let data = pg_constants::PG_HBA.as_bytes(); + let data = PG_HBA.as_bytes(); let header = new_tar_header(filepath, data.len() as u64)?; self.ar.append(&header, data)?; } else { @@ -115,56 +147,96 @@ impl<'a> Basebackup<'a> { } // Gather non-relational files from object storage pages. - for obj in self.timeline.list_nonrels(self.lsn)? 
{ - match obj { - RelishTag::Slru { slru, segno } => { - self.add_slru_segment(slru, segno)?; - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - self.add_relmap_file(spcnode, dbnode)?; - } - RelishTag::TwoPhase { xid } => { - self.add_twophase_file(xid)?; - } - _ => {} + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactOffsets, + SlruKind::MultiXactMembers, + ] { + for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + self.add_slru_segment(kind, segno)?; } } + // Create tablespace directories + for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + + // Gather and send relational files in each database if full backup is requested. + if self.full_backup { + for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + self.add_rel(rel)?; + } + } + } + for xid in self.timeline.list_twophase_files(self.lsn)? { + self.add_twophase_file(xid)?; + } + + fail_point!("basebackup-before-control-file", |_| { + bail!("failpoint basebackup-before-control-file") + }); + // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; self.ar.finish()?; + self.finished = true; debug!("all tarred up!"); Ok(()) } + fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; + + // Function that adds relation segment data to archive + let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { + let file_name = tag.to_segfile_name(segment_index as u32); + let header = new_tar_header(&file_name, data.len() as u64)?; + self.ar.append(&header, data.as_slice())?; + Ok(()) + }; + + // If the relation is empty, create an empty file + if nblocks == 0 { + add_file(0, &vec![])?; + return Ok(()); + } + + // Add a file for each chunk of blocks (aka segment) + let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize); + for (seg, blocks) in chunks.into_iter().enumerate() { + let mut segment_data: Vec = vec![]; + for blknum in blocks { + let img = self + .timeline + .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + segment_data.extend_from_slice(&img[..]); + } + + add_file(seg, &segment_data)?; + } + + Ok(()) + } + // // Generate SLRU segment files from repository. 
// fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let seg_size = self - .timeline - .get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?; + let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; - if seg_size == None { - trace!( - "SLRU segment {}/{:>04X} was truncated", - slru.to_str(), - segno - ); - return Ok(()); - } - - let nblocks = seg_size.unwrap(); - - let mut slru_buf: Vec = - Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); + let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = - self.timeline - .get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?; - assert!(img.len() == pg_constants::BLCKSZ as usize); + let img = self + .timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; - slru_buf.extend_from_slice(&img); + if slru == SlruKind::Clog { + ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); + } else { + ensure!(img.len() == BLCKSZ as usize); + } + + slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); } let segname = format!("{}/{:>04X}", slru.to_str(), segno); @@ -176,43 +248,78 @@ impl<'a> Basebackup<'a> { } // - // Extract pg_filenode.map files from repository - // Along with them also send PG_VERSION for each database. + // Include database/tablespace directories. // - fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> { - let img = self.timeline.get_page_at_lsn( - RelishTag::FileNodeMap { spcnode, dbnode }, - 0, - self.lsn, - )?; - let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID { - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); - let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; - - String::from("global/pg_filenode.map") // filenode map for global tablespace + // Each directory contains a PG_VERSION file, and the default database + // directories also contain pg_filenode.map files. + // + fn add_dbdir( + &mut self, + spcnode: u32, + dbnode: u32, + has_relmap_file: bool, + ) -> anyhow::Result<()> { + let relmap_img = if has_relmap_file { + let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + ensure!(img.len() == 512); + Some(img) } else { + None + }; + + if spcnode == GLOBALTABLESPACE_OID { + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; + + info!("timeline.pg_version {}", self.timeline.pg_version); + + if let Some(img) = relmap_img { + // filenode map for global tablespace + let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } else { + warn!("global/pg_filenode.map is missing"); + } + } else { + // User defined tablespaces are not supported. However, as + // a special case, if a tablespace/db directory is + // completely empty, we can leave it out altogether. This + // makes taking a base backup after the 'tablespace' + // regression test pass, because the test drops the + // created tablespaces after the tests. + // + // FIXME: this wouldn't be necessary, if we handled + // XLOG_TBLSPC_DROP records. But we probably should just + // throw an error on CREATE TABLESPACE in the first place. 
+ if !has_relmap_file + && self + .timeline + .list_rels(spcnode, dbnode, self.lsn)? + .is_empty() + { + return Ok(()); + } // User defined tablespaces are not supported - assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); + ensure!(spcnode == DEFAULTTABLESPACE_OID); // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; self.ar.append(&header, &mut io::empty())?; - let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + if let Some(img) = relmap_img { + let dst_path = format!("base/{}/PG_VERSION", dbnode); - format!("base/{}/pg_filenode.map", dbnode) + let pg_version_str = self.timeline.pg_version.to_string(); + let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; + self.ar.append(&header, pg_version_str.as_bytes())?; + + let relmap_path = format!("base/{}/pg_filenode.map", dbnode); + let header = new_tar_header(&relmap_path, img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } }; - assert!(img.len() == 512); - let header = new_tar_header(&path, img.len() as u64)?; - self.ar.append(&header, &img[..])?; Ok(()) } @@ -220,9 +327,7 @@ impl<'a> Basebackup<'a> { // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self - .timeline - .get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?; + let img = self.timeline.get_twophase_file(xid, self.lsn)?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -240,30 +345,6 @@ impl<'a> Basebackup<'a> { // Also send zenith.signal file with extra bootstrap data. // fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { - let checkpoint_bytes = self - .timeline - .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn) - .context("failed get control bytes")?; - let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; - let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; - - // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; - - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. 
- checkpoint.oldestActiveXid = 0; - - //save new values in pg_control - pg_control.checkPoint = 0; - pg_control.checkPointCopy = checkpoint; - pg_control.state = pg_constants::DB_SHUTDOWNED; - // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -280,23 +361,54 @@ impl<'a> Basebackup<'a> { zenith_signal.as_bytes(), )?; + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn) + .context("failed get control bytes")?; + + let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; + //send pg_control - let pg_control_bytes = pg_control.encode(); let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar.append(&header, &pg_control_bytes[..])?; //send wal segment - let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE); + let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); + let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); let wal_file_path = format!("pg_wal/{}", wal_file_name); - let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?; - let wal_seg = generate_wal_segment(segno, pg_control.system_identifier); - assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE); + let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; + + let wal_seg = + postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version) + .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; + ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); self.ar.append(&header, &wal_seg[..])?; Ok(()) } } +impl<'a, W> Drop for Basebackup<'a, W> +where + W: Write, +{ + /// If the basebackup was not finished, prevent the Archive::drop() from + /// writing the end-of-archive marker. + fn drop(&mut self) { + if !self.finished { + self.ar.get_mut().abort(); + } + } +} + // // Create new tarball entry header // @@ -332,3 +444,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } + +/// A wrapper that passes through all data to the underlying Write, +/// until abort() is called. +/// +/// tar::Builder has an annoying habit of finishing the archive with +/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), +/// even if an error occurs and we don't finish building the archive. +/// We'd rather abort writing the tarball immediately than construct +/// a seemingly valid but incomplete archive. This wrapper allows us +/// to swallow the end-of-archive marker that Builder::drop() emits, +/// without writing it to the underlying sink. +/// +struct AbortableWrite { + w: W, + aborted: bool, +} + +impl AbortableWrite { + pub fn new(w: W) -> Self { + AbortableWrite { w, aborted: false } + } + + pub fn abort(&mut self) { + self.aborted = true; + } +} + +impl Write for AbortableWrite +where + W: Write, +{ + fn write(&mut self, data: &[u8]) -> io::Result { + if self.aborted { + Ok(data.len()) + } else { + self.w.write(data) + } + } + fn flush(&mut self) -> io::Result<()> { + if self.aborted { + Ok(()) + } else { + self.w.flush() + } + } +} diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs new file mode 100644 index 0000000000..ea1ff7f3c7 --- /dev/null +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -0,0 +1,150 @@ +//! A tool for visualizing the arrangement of layerfiles within a timeline. +//! +//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in +//! page-lsn space, where every delta layer is a rectangle and every image layer is a +//! thick line. Legend: +//! - The x axis (left to right) represents page index. +//! - The y axis represents LSN, growing upwards. +//! +//! Coordinates in both axis are compressed for better readability. +//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! +//! Example use: +//! ``` +//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE +//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ firefox out.svg +//! ``` +//! +//! This API was chosen so that we can easily work with filenames extracted from ssh, +//! or from pageserver log files. +//! +//! TODO Consider shipping this as a grafana panel plugin: +//! https://grafana.com/tutorials/build-a-panel-plugin/ +use anyhow::Result; +use pageserver::repository::Key; +use std::cmp::Ordering; +use std::io::{self, BufRead}; +use std::{ + collections::{BTreeMap, BTreeSet}, + ops::Range, +}; +use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
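As a quick illustration of the coordinate compression mentioned above (toy `u64` values instead of real keys and LSNs): every distinct value is replaced by its rank in the sorted, deduplicated set, so sparse coordinates become small consecutive indices. The helper defined just below does the same thing for the parsed key and LSN ranges.

```rust
use std::collections::{BTreeMap, BTreeSet};

// Toy coordinate compression: map each distinct value to its index in the
// sorted, deduplicated list of all values, so widely spread coordinates
// become small consecutive integers that are convenient to plot.
fn compress(values: &[u64]) -> BTreeMap<u64, usize> {
    let sorted: BTreeSet<u64> = values.iter().copied().collect();
    sorted.into_iter().enumerate().map(|(i, v)| (v, i)).collect()
}

fn main() {
    let map = compress(&[0x5000, 0x10, 0x5000, 0x42]);
    assert_eq!(map[&0x10], 0);
    assert_eq!(map[&0x42], 1);
    assert_eq!(map[&0x5000], 2);
    println!("{map:?}");
}
```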
+fn build_coordinate_compression_map(coords: Vec) -> BTreeMap { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + map +} + +fn parse_filename(name: &str) -> (Range, Range) { + let split: Vec<&str> = name.split("__").collect(); + let keys: Vec<&str> = split[0].split('-').collect(); + let mut lsns: Vec<&str> = split[1].split('-').collect(); + if lsns.len() == 1 { + lsns.push(lsns[0]); + } + + let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); + let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + (keys, lsns) +} + +fn main() -> Result<()> { + // Parse layer filenames from stdin + let mut ranges: Vec<(Range, Range)> = vec![]; + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let range = parse_filename(&line.unwrap()); + ranges.push(range); + } + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for (keyr, lsnr) in &ranges { + keys.push(keyr.start); + keys.push(keyr.end); + lsns.push(lsnr.start); + lsns.push(lsnr.end); + } + + // Analyze + let key_map = build_coordinate_compression_map(keys); + let lsn_map = build_coordinate_compression_map(lsns); + + // Initialize stats + let mut num_deltas = 0; + let mut num_images = 0; + + // Draw + let stretch = 3.0; // Stretch out vertically for better visibility + println!( + "{}", + BeginSvg { + w: key_map.len() as f32, + h: stretch * lsn_map.len() as f32 + } + ); + for (keyr, lsnr) in &ranges { + let key_start = *key_map.get(&keyr.start).unwrap(); + let key_end = *key_map.get(&keyr.end).unwrap(); + let key_diff = key_end - key_start; + let lsn_max = lsn_map.len(); + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); + let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); + + let mut lsn_diff = (lsn_end - lsn_start) as f32; + let mut fill = Fill::None; + let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut lsn_offset = 0.0; + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. + match lsn_start.cmp(&lsn_end) { + Ordering::Less => num_deltas += 1, + Ordering::Equal => { + num_images += 1; + lsn_diff = 0.3; + lsn_offset = -lsn_diff / 2.0; + margin = 0.05; + fill = Fill::Color(rgb(0, 0, 0)); + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + + println!( + " {}", + rectangle( + key_start as f32 + stretch * margin, + stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * margin, + stretch * (lsn_diff - 2.0 * margin) + ) + .fill(fill) + .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) + .border_radius(0.4) + ); + } + println!("{}", EndSvg); + + eprintln!("num_images: {}", num_images); + eprintln!("num_deltas: {}", num_deltas); + + Ok(()) +} diff --git a/pageserver/src/bin/dump_layerfile.rs b/pageserver/src/bin/dump_layerfile.rs deleted file mode 100644 index b954ad5a15..0000000000 --- a/pageserver/src/bin/dump_layerfile.rs +++ /dev/null @@ -1,31 +0,0 @@ -//! Main entry point for the dump_layerfile executable -//! -//! A handy tool for debugging, that's all. 
-use anyhow::Result; -use clap::{App, Arg}; -use pageserver::layered_repository::dump_layerfile_from_path; -use pageserver::virtual_file; -use std::path::PathBuf; -use zenith_utils::GIT_VERSION; - -fn main() -> Result<()> { - let arg_matches = App::new("Zenith dump_layerfile utility") - .about("Dump contents of one layer file, for debugging") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Path to file to dump") - .required(true) - .index(1), - ) - .get_matches(); - - let path = PathBuf::from(arg_matches.value_of("path").unwrap()); - - // Basic initialization of things that don't change after startup - virtual_file::init(10); - - dump_layerfile_from_path(&path)?; - - Ok(()) -} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index fb8baa28f6..62119b51c6 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,78 +1,71 @@ //! Main entry point for the Page Server executable. -use std::{env, path::Path, str::FromStr}; +use std::{env, ops::ControlFlow, path::Path, str::FromStr}; + +use anyhow::{anyhow, Context}; +use clap::{Arg, ArgAction, Command}; +use fail::FailScenario; +use nix::unistd::Pid; use tracing::*; -use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION}; - -use anyhow::{bail, Context, Result}; - -use clap::{App, Arg}; -use daemonize::Daemonize; +use metrics::set_build_info_metric; use pageserver::{ - branches, config::{defaults::*, PageServerConf}, - http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr, - thread_mgr::ThreadKind, - virtual_file, LOG_FILE_NAME, + http, page_cache, page_service, profiling, task_mgr, + task_mgr::TaskKind, + task_mgr::{ + BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, + }, + tenant_mgr, virtual_file, +}; +use remote_storage::GenericRemoteStorage; +use utils::{ + auth::JwtAuth, + lock_file, logging, + postgres_backend::AuthType, + project_git_version, + signals::{self, Signal}, + tcp_listener, }; -use zenith_utils::http::endpoint; -use zenith_utils::postgres_backend; -use zenith_utils::shutdown::exit_now; -use zenith_utils::signals::{self, Signal}; -fn main() -> Result<()> { - zenith_metrics::set_common_metrics_prefix("pageserver"); - let arg_matches = App::new("Zenith page server") - .about("Materializes WAL stream to pages and serves them to the postgres") - .version(GIT_VERSION) - .arg( - Arg::new("daemonize") - .short('d') - .long("daemonize") - .takes_value(false) - .help("Run in the background"), - ) - .arg( - Arg::new("init") - .long("init") - .takes_value(false) - .help("Initialize pageserver repo"), - ) - .arg( - Arg::new("workdir") - .short('D') - .long("workdir") - .takes_value(true) - .help("Working directory for the pageserver"), - ) - .arg( - Arg::new("create-tenant") - .long("create-tenant") - .takes_value(true) - .help("Create tenant during init") - .requires("init"), - ) - // See `settings.md` for more details on the extra configuration patameters pageserver can process - .arg( - Arg::new("config-override") - .short('c') - .takes_value(true) - .number_of_values(1) - .multiple_occurrences(true) - .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). 
- Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"), - ) - .get_matches(); +project_git_version!(GIT_VERSION); - let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); +const PID_FILE_NAME: &str = "pageserver.pid"; + +const FEATURES: &[&str] = &[ + #[cfg(feature = "testing")] + "testing", + #[cfg(feature = "fail/failpoints")] + "fail/failpoints", + #[cfg(feature = "profiling")] + "profiling", +]; + +fn version() -> String { + format!( + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, + ) +} + +fn main() -> anyhow::Result<()> { + let arg_matches = cli().get_matches(); + + if arg_matches.get_flag("enabled-features") { + println!("{{\"features\": {FEATURES:?} }}"); + return Ok(()); + } + + let workdir = arg_matches + .get_one::("workdir") + .map(Path::new) + .unwrap_or_else(|| Path::new(".neon")); let workdir = workdir .canonicalize() .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; - let cfg_file_path = workdir.join("pageserver.toml"); - let init = arg_matches.is_present("init"); - let create_tenant = arg_matches.value_of("create-tenant"); + let cfg_file_path = workdir.join("pageserver.toml"); // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { @@ -82,79 +75,153 @@ fn main() -> Result<()> { ) })?; - let daemonize = arg_matches.is_present("daemonize"); - if init && daemonize { - bail!("--daemonize cannot be used with --init") - } - - let mut toml = if init { - // We're initializing the repo, so there's no config file yet - DEFAULT_CONFIG_FILE - .parse::() - .expect("could not parse built-in config file") - } else { - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path) - .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?; - cfg_file_contents - .parse::() - .with_context(|| { - format!( - "Failed to read '{}' as pageserver config", - cfg_file_path.display() - ) - })? + let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? 
{ + ControlFlow::Continue(conf) => conf, + ControlFlow::Break(()) => { + info!("Pageserver config init successful"); + return Ok(()); + } }; - // Process any extra options given with -c - if let Some(values) = arg_matches.values_of("config-override") { + let tenants_path = conf.tenants_path(); + if !tenants_path.exists() { + utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { + format!( + "Failed to create tenants root dir at '{}'", + tenants_path.display() + ) + })?; + } + + // Initialize up failpoints support + let scenario = FailScenario::setup(); + + // Basic initialization of things that don't change after startup + virtual_file::init(conf.max_file_descriptors); + page_cache::init(conf.page_cache_size); + + start_pageserver(conf).context("Failed to start pageserver")?; + + scenario.teardown(); + Ok(()) +} + +fn initialize_config( + cfg_file_path: &Path, + arg_matches: clap::ArgMatches, + workdir: &Path, +) -> anyhow::Result> { + let init = arg_matches.get_flag("init"); + let update_config = init || arg_matches.get_flag("update-config"); + + let (mut toml, config_file_exists) = if cfg_file_path.is_file() { + if init { + anyhow::bail!( + "Config file '{}' already exists, cannot init it, use --update-config to update it", + cfg_file_path.display() + ); + } + // Supplement the CLI arguments with the config file + let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + format!( + "Failed to read pageserver config at '{}'", + cfg_file_path.display() + ) + })?; + ( + cfg_file_contents + .parse::() + .with_context(|| { + format!( + "Failed to parse '{}' as pageserver config", + cfg_file_path.display() + ) + })?, + true, + ) + } else if cfg_file_path.exists() { + anyhow::bail!( + "Config file '{}' exists but is not a regular file", + cfg_file_path.display() + ); + } else { + // We're initializing the tenant, so there's no config file yet + ( + DEFAULT_CONFIG_FILE + .parse::() + .context("could not parse built-in config file")?, + false, + ) + }; + + if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { - format!( - "Option '{}' could not be parsed as a toml document", - option_line - ) + format!("Option '{option_line}' could not be parsed as a toml document") })?; + for (key, item) in doc.iter() { + if config_file_exists && update_config && key == "id" && toml.contains_key(key) { + anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display()); + } toml.insert(key, item.clone()); } } } - trace!("Resulting toml: {}", toml); - let conf = PageServerConf::parse_and_validate(&toml, &workdir) + + debug!("Resulting toml: {toml}"); + let conf = PageServerConf::parse_and_validate(&toml, workdir) .context("Failed to parse pageserver configuration")?; - // The configuration is all set up now. Turn it into a 'static - // that can be freely stored in structs and passed across threads - // as a ref. 
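The comment above refers to the usual `Box::leak` idiom for turning a value built at startup into a `&'static` reference that every thread can hold for the life of the process. A minimal sketch of that idiom, with an invented `AppConf` type standing in for the real configuration struct:

```rust
// Leak a one-time, process-lifetime allocation to obtain a `&'static`
// reference. The memory is never freed, which is acceptable for a
// configuration that must outlive every thread anyway.
struct AppConf {
    listen_addr: String,
}

fn leak_conf(conf: AppConf) -> &'static AppConf {
    Box::leak(Box::new(conf))
}

fn main() {
    let conf = leak_conf(AppConf {
        listen_addr: "127.0.0.1:6400".to_string(),
    });
    // The leaked reference is `'static`, so it can move into spawned threads.
    std::thread::spawn(move || println!("serving on {}", conf.listen_addr))
        .join()
        .unwrap();
}
```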
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + if update_config { + info!("Writing pageserver config to '{}'", cfg_file_path.display()); - // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); - - page_cache::init(conf); - - // Create repo and exit if init was requested - if init { - branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; - // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( - "Failed to initialize pageserver config at '{}'", + "Failed to write pageserver config to '{}'", cfg_file_path.display() ) })?; - Ok(()) - } else { - start_pageserver(conf, daemonize).context("Failed to start pageserver") + info!( + "Config successfully written to '{}'", + cfg_file_path.display() + ) } + + Ok(if init { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(Box::leak(Box::new(conf))) + }) } -fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> { - // Initialize logger - let log_file = logging::init(LOG_FILE_NAME, daemonize)?; +fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { + logging::init(conf.log_format)?; + info!("version: {}", version()); - info!("version: {}", GIT_VERSION); + let lock_file_path = conf.workdir.join(PID_FILE_NAME); + let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { + lock_file::LockCreationResult::Created { + new_lock_contents, + file, + } => { + info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); + file + } + lock_file::LockCreationResult::AlreadyLocked { + existing_lock_contents, + } => anyhow::bail!( + "Could not lock pid file; pageserver is already running in {:?} with PID {}", + conf.workdir, + existing_lock_contents + ), + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) + } + }; + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + let _ = Box::leak(Box::new(lock_file)); // TODO: Check that it looks like a valid repository before going further @@ -171,42 +238,17 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() ); let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?; - // NB: Don't spawn any threads before daemonizing! - if daemonize { - info!("daemonizing..."); - - // There shouldn't be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fprintf's or backtraces. - let stdout = log_file.try_clone().unwrap(); - let stderr = log_file; - - let daemonize = Daemonize::new() - .pid_file("pageserver.pid") - .working_directory(".") - .stdout(stdout) - .stderr(stderr); - - // XXX: The parent process should exit abruptly right after - // it has spawned a child to prevent coverage machinery from - // dumping stats into a `profraw` file now owned by the child. - // Otherwise, the coverage data will be damaged. 
- match daemonize.exit_action(|| exit_now(0)).start() { - Ok(_) => info!("Success, daemonized"), - Err(err) => error!(%err, "could not daemonize"), - } - } - let signals = signals::install_shutdown_handlers()?; - let sync_startup = remote_storage::start_local_timeline_sync(conf) - .context("Failed to set up local files sync with external storage")?; - // Initialize tenant manager. - tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states); + // start profiler (if enabled) + let profiler_guard = profiling::init_profiler(conf); + + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; // initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, - AuthType::ZenithJWT => { + AuthType::NeonJWT => { // unwrap is ok because check is performed when creating config, so path is set and file exists let key_path = conf.auth_validation_public_key_path.as_ref().unwrap(); Some(JwtAuth::from_key_path(key_path)?.into()) @@ -214,36 +256,71 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() }; info!("Using auth: {:#?}", conf.auth_type); - // Spawn a new thread for the http endpoint + let remote_storage = conf + .remote_storage_config + .as_ref() + .map(|storage_config| { + GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config) + }) + .transpose() + .context("Failed to init generic remote storage")?; + let remote_index = { + let _rt_guard = BACKGROUND_RUNTIME.enter(); + tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? + }; + + // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. // bind before launching separate thread so the error reported before startup exits - let auth_cloned = auth.clone(); - thread_mgr::spawn( - ThreadKind::HttpEndpointListener, - None, - None, - "http_endpoint_thread", - move || { - let router = http::make_router(conf, auth_cloned); - endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher()) - }, - )?; - // Spawn a thread to listen for libpq connections. It will spawn further threads + // Create a Service from the router above to handle incoming requests. + { + let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); + + let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?; + let service = + utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(service) + .with_graceful_shutdown(task_mgr::shutdown_watcher()); + + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::HttpEndpointListener, + None, + None, + "http endpoint listener", + true, + async { + server.await?; + Ok(()) + }, + ); + } + + // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. - thread_mgr::spawn( - ThreadKind::LibpqEndpointListener, + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, None, None, - "libpq endpoint thread", - move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type), - )?; + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await + }, + ); + set_build_info_metric(GIT_VERSION); + + // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { info!( "Got {}. 
Terminating in immediate shutdown mode", signal.name() ); + profiling::exit_profiler(conf, &profiler_guard); std::process::exit(111); } @@ -252,38 +329,53 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); - shutdown_pageserver(); + profiling::exit_profiler(conf, &profiler_guard); + BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } }) } -fn shutdown_pageserver() { - // Shut down the libpq endpoint thread. This prevents new connections from - // being accepted. - thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); - - // Shut down any page service threads. - postgres_backend::set_pgbackend_shutdown_requested(); - thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); - - // Shut down all the tenants. This flushes everything to disk and kills - // the checkpoint and GC threads. - tenant_mgr::shutdown_all_tenants(); - - // Stop syncing with remote storage. - // - // FIXME: Does this wait for the sync thread to finish syncing what's queued up? - // Should it? - thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None); - - // Shut down the HTTP endpoint last, so that you can still check the server's - // status while it's shutting down. - thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None); - - // There should be nothing left, but let's be sure - thread_mgr::shutdown_threads(None, None, None); - - info!("Shut down successfully completed"); - std::process::exit(0); +fn cli() -> Command { + Command::new("Neon page server") + .about("Materializes WAL stream to pages and serves them to the postgres") + .version(version()) + .arg( + Arg::new("init") + .long("init") + .action(ArgAction::SetTrue) + .help("Initialize pageserver with all given config overrides"), + ) + .arg( + Arg::new("workdir") + .short('D') + .long("workdir") + .help("Working directory for the pageserver"), + ) + // See `settings.md` for more details on the extra configuration patameters pageserver can process + .arg( + Arg::new("config-override") + .short('c') + .num_args(1) + .action(ArgAction::Append) + .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ + Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), + ) + .arg( + Arg::new("update-config") + .long("update-config") + .action(ArgAction::SetTrue) + .help("Update the config file when started"), + ) + .arg( + Arg::new("enabled-features") + .long("enabled-features") + .action(ArgAction::SetTrue) + .help("Show enabled compile time features"), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); } diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs new file mode 100644 index 0000000000..b1484ac45a --- /dev/null +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -0,0 +1,154 @@ +//! A helper tool to manage pageserver binary files. +//! Accepts a file as an argument, attempts to parse it with all ways possible +//! and prints its interpreted context. +//! +//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. 
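The "attempts to parse it with all ways possible" behaviour described above amounts to a try-then-fall-back pattern over the supported formats. Below is a stand-alone sketch of that pattern with invented parsers; the real tool tries `ControlFileData::decode` first and only then the layer-file dumper.

```rust
use anyhow::{anyhow, Context};

// Stand-ins for the real decoders; only the error-handling shape matters here.
fn parse_as_control_file(bytes: &[u8]) -> anyhow::Result<String> {
    if bytes.starts_with(b"CTRL") {
        Ok("pg control file".to_string())
    } else {
        Err(anyhow!("not a control file"))
    }
}

fn parse_as_layer_file(bytes: &[u8]) -> anyhow::Result<String> {
    if bytes.starts_with(b"LAYR") {
        Ok("layer file".to_string())
    } else {
        Err(anyhow!("not a layer file"))
    }
}

// Try the first format; only if that fails, fall back to the second one,
// keeping the first error in the context chain for diagnostics.
fn guess_format(bytes: &[u8]) -> anyhow::Result<String> {
    parse_as_control_file(bytes).or_else(|control_err| {
        parse_as_layer_file(bytes)
            .with_context(|| format!("also failed to parse as a control file: {control_err:#}"))
    })
}

fn main() {
    println!("{:?}", guess_format(b"LAYR..."));
}
```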
+use std::{ + path::{Path, PathBuf}, + str::FromStr, +}; + +use anyhow::Context; +use clap::{value_parser, Arg, Command}; + +use pageserver::{ + page_cache, + tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, + virtual_file, +}; +use postgres_ffi::ControlFileData; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +const METADATA_SUBCOMMAND: &str = "metadata"; + +fn main() -> anyhow::Result<()> { + let arg_matches = cli().get_matches(); + + match arg_matches.subcommand() { + Some((subcommand_name, subcommand_matches)) => { + let path = subcommand_matches + .get_one::("metadata_path") + .context("'metadata_path' argument is missing")? + .to_path_buf(); + anyhow::ensure!( + subcommand_name == METADATA_SUBCOMMAND, + "Unknown subcommand {subcommand_name}" + ); + handle_metadata(&path, subcommand_matches)?; + } + None => { + let path = arg_matches + .get_one::("path") + .context("'path' argument is missing")? + .to_path_buf(); + println!( + "No subcommand specified, attempting to guess the format for file {}", + path.display() + ); + if let Err(e) = read_pg_control_file(&path) { + println!( + "Failed to read input file as a pg control one: {e:#}\n\ + Attempting to read it as layer file" + ); + print_layerfile(&path)?; + } + } + }; + Ok(()) +} + +fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { + let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?; + println!("{control_file:?}"); + let control_file_initdb = Lsn(control_file.checkPoint); + println!( + "pg_initdb_lsn: {}, aligned: {}", + control_file_initdb, + control_file_initdb.align() + ); + Ok(()) +} + +fn print_layerfile(path: &Path) -> anyhow::Result<()> { + // Basic initialization of things that don't change after startup + virtual_file::init(10); + page_cache::init(100); + dump_layerfile_from_path(path, true) +} + +fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { + let metadata_bytes = std::fs::read(&path)?; + let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; + println!("Current metadata:\n{meta:?}"); + let mut update_meta = false; + if let Some(disk_consistent_lsn) = arg_matches.get_one::("disk_consistent_lsn") { + meta = TimelineMetadata::new( + Lsn::from_str(disk_consistent_lsn)?, + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(prev_record_lsn) = arg_matches.get_one::("prev_record_lsn") { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + Some(Lsn::from_str(prev_record_lsn)?), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + + if update_meta { + let metadata_bytes = meta.to_bytes()?; + std::fs::write(&path, &metadata_bytes)?; + } + + Ok(()) +} + +fn cli() -> Command { + Command::new("Neon Pageserver binutils") + .about("Reads pageserver (and related) binary files management utility") + .version(GIT_VERSION) + .arg( + Arg::new("path") + .help("Input file path") + .value_parser(value_parser!(PathBuf)) + .required(false), + ) + .subcommand( + Command::new(METADATA_SUBCOMMAND) + .about("Read and update pageserver metadata file") + .arg( + Arg::new("metadata_path") + .help("Input metadata file path") + .value_parser(value_parser!(PathBuf)) + .required(false), + ) + .arg( + Arg::new("disk_consistent_lsn") + 
.long("disk_consistent_lsn") + .help("Replace disk consistent Lsn"), + ) + .arg( + Arg::new("prev_record_lsn") + .long("prev_record_lsn") + .help("Replace previous record Lsn"), + ), + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/pageserver/src/bin/pageserver_zst.rs b/pageserver/src/bin/pageserver_zst.rs deleted file mode 100644 index 5b8f8cc3c6..0000000000 --- a/pageserver/src/bin/pageserver_zst.rs +++ /dev/null @@ -1,334 +0,0 @@ -//! A CLI helper to deal with remote storage (S3, usually) blobs as archives. -//! See [`compression`] for more details about the archives. - -use std::{collections::BTreeSet, path::Path}; - -use anyhow::{bail, ensure, Context}; -use clap::{App, Arg}; -use pageserver::{ - layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, - remote_storage::compression, -}; -use tokio::{fs, io}; -use zenith_utils::GIT_VERSION; - -const LIST_SUBCOMMAND: &str = "list"; -const ARCHIVE_ARG_NAME: &str = "archive"; - -const EXTRACT_SUBCOMMAND: &str = "extract"; -const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory"; - -const CREATE_SUBCOMMAND: &str = "create"; -const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory"; - -#[tokio::main(flavor = "current_thread")] -async fn main() -> anyhow::Result<()> { - let arg_matches = App::new("pageserver zst blob [un]compressor utility") - .version(GIT_VERSION) - .subcommands(vec![ - App::new(LIST_SUBCOMMAND) - .about("List the archive contents") - .arg( - Arg::new(ARCHIVE_ARG_NAME) - .required(true) - .takes_value(true) - .help("An archive to list the contents of"), - ), - App::new(EXTRACT_SUBCOMMAND) - .about("Extracts the archive into the directory") - .arg( - Arg::new(ARCHIVE_ARG_NAME) - .required(true) - .takes_value(true) - .help("An archive to extract"), - ) - .arg( - Arg::new(TARGET_DIRECTORY_ARG_NAME) - .required(false) - .takes_value(true) - .help("A directory to extract the archive into. Optional, will use the current directory if not specified"), - ), - App::new(CREATE_SUBCOMMAND) - .about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)") - .arg( - Arg::new(SOURCE_DIRECTORY_ARG_NAME) - .required(true) - .takes_value(true) - .help("A directory to use for creating the archive"), - ) - .arg( - Arg::new(TARGET_DIRECTORY_ARG_NAME) - .required(false) - .takes_value(true) - .help("A directory to create the archive in. 
Optional, will use the current directory if not specified"), - ), - ]) - .get_matches(); - - let subcommand_name = match arg_matches.subcommand_name() { - Some(name) => name, - None => bail!("No subcommand specified"), - }; - - let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) { - Some(matches) => matches, - None => bail!( - "No subcommand arguments were recognized for subcommand '{}'", - subcommand_name - ), - }; - - let target_dir = Path::new( - subcommand_matches - .value_of(TARGET_DIRECTORY_ARG_NAME) - .unwrap_or("./"), - ); - - match subcommand_name { - LIST_SUBCOMMAND => { - let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) { - Some(archive) => Path::new(archive), - None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME), - }; - list_archive(archive).await - } - EXTRACT_SUBCOMMAND => { - let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) { - Some(archive) => Path::new(archive), - None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME), - }; - extract_archive(archive, target_dir).await - } - CREATE_SUBCOMMAND => { - let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) { - Some(source) => Path::new(source), - None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME), - }; - create_archive(source_dir, target_dir).await - } - unknown => bail!("Unknown subcommand {}", unknown), - } -} - -async fn list_archive(archive: &Path) -> anyhow::Result<()> { - let archive = archive.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the archive path '{}'", - archive.display() - ) - })?; - ensure!( - archive.is_file(), - "Path '{}' is not an archive file", - archive.display() - ); - println!("Listing an archive at path '{}'", archive.display()); - let archive_name = match archive.file_name().and_then(|name| name.to_str()) { - Some(name) => name, - None => bail!( - "Failed to get the archive name from the path '{}'", - archive.display() - ), - }; - - let archive_bytes = fs::read(&archive) - .await - .context("Failed to read the archive bytes")?; - - let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice()) - .await - .context("Failed to read the archive header")?; - - let empty_path = Path::new(""); - println!("-------------------------------"); - - let longest_path_in_archive = header - .files - .iter() - .filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len())) - .max() - .unwrap_or_default() - .max(METADATA_FILE_NAME.len()); - - for regular_file in &header.files { - println!( - "File: {:width$} uncompressed size: {} bytes", - regular_file.subpath.as_path(empty_path).display(), - regular_file.size, - width = longest_path_in_archive, - ) - } - println!( - "File: {:width$} uncompressed size: {} bytes", - METADATA_FILE_NAME, - header.metadata_file_size, - width = longest_path_in_archive, - ); - println!("-------------------------------"); - - Ok(()) -} - -async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> { - let archive = archive.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the archive path '{}'", - archive.display() - ) - })?; - ensure!( - archive.is_file(), - "Path '{}' is not an archive file", - archive.display() - ); - let archive_name = match archive.file_name().and_then(|name| name.to_str()) { - Some(name) => name, - None => bail!( - "Failed to get the archive name from the path '{}'", - archive.display() - ), - }; - - if 
!target_dir.exists() { - fs::create_dir_all(target_dir).await.with_context(|| { - format!( - "Failed to create the target dir at path '{}'", - target_dir.display() - ) - })?; - } - let target_dir = target_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the target dir path '{}'", - target_dir.display() - ) - })?; - ensure!( - target_dir.is_dir(), - "Path '{}' is not a directory", - target_dir.display() - ); - let mut dir_contents = fs::read_dir(&target_dir) - .await - .context("Failed to list the target directory contents")?; - let dir_entry = dir_contents - .next_entry() - .await - .context("Failed to list the target directory contents")?; - ensure!( - dir_entry.is_none(), - "Target directory '{}' is not empty", - target_dir.display() - ); - - println!( - "Extracting an archive at path '{}' into directory '{}'", - archive.display(), - target_dir.display() - ); - - let mut archive_file = fs::File::open(&archive).await.with_context(|| { - format!( - "Failed to get the archive name from the path '{}'", - archive.display() - ) - })?; - let header = compression::read_archive_header(archive_name, &mut archive_file) - .await - .context("Failed to read the archive header")?; - compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file) - .await - .context("Failed to extract the archive") -} - -async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> { - let source_dir = source_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the source dir path '{}'", - source_dir.display() - ) - })?; - ensure!( - source_dir.is_dir(), - "Path '{}' is not a directory", - source_dir.display() - ); - - if !target_dir.exists() { - fs::create_dir_all(target_dir).await.with_context(|| { - format!( - "Failed to create the target dir at path '{}'", - target_dir.display() - ) - })?; - } - let target_dir = target_dir.canonicalize().with_context(|| { - format!( - "Failed to get the absolute path for the target dir path '{}'", - target_dir.display() - ) - })?; - ensure!( - target_dir.is_dir(), - "Path '{}' is not a directory", - target_dir.display() - ); - - println!( - "Compressing directory '{}' and creating resulting archive in directory '{}'", - source_dir.display(), - target_dir.display() - ); - - let mut metadata_file_contents = None; - let mut files_co_archive = Vec::new(); - - let mut source_dir_contents = fs::read_dir(&source_dir) - .await - .context("Failed to read the source directory contents")?; - - while let Some(source_dir_entry) = source_dir_contents - .next_entry() - .await - .context("Failed to read a source dir entry")? 
- { - let entry_path = source_dir_entry.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) { - let metadata_bytes = fs::read(entry_path) - .await - .context("Failed to read metata file bytes in the source dir")?; - metadata_file_contents = Some( - TimelineMetadata::from_bytes(&metadata_bytes) - .context("Failed to parse metata file contents in the source dir")?, - ); - } else { - files_co_archive.push(entry_path); - } - } - } - - let metadata = match metadata_file_contents { - Some(metadata) => metadata, - None => bail!( - "No metadata file found in the source dir '{}', cannot create the archive", - source_dir.display() - ), - }; - - let _ = compression::archive_files_as_stream( - &source_dir, - files_co_archive.iter(), - &metadata, - move |mut archive_streamer, archive_name| async move { - let archive_target = target_dir.join(&archive_name); - let mut archive_file = fs::File::create(&archive_target).await?; - io::copy(&mut archive_streamer, &mut archive_file).await?; - Ok(archive_target) - }, - ) - .await - .context("Failed to create an archive")?; - - Ok(()) -} diff --git a/pageserver/src/bin/update_metadata.rs b/pageserver/src/bin/update_metadata.rs deleted file mode 100644 index bfbb6179c5..0000000000 --- a/pageserver/src/bin/update_metadata.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Main entry point for the edit_metadata executable -//! -//! A handy tool for debugging, that's all. -use anyhow::Result; -use clap::{App, Arg}; -use pageserver::layered_repository::metadata::TimelineMetadata; -use std::path::PathBuf; -use std::str::FromStr; -use zenith_utils::lsn::Lsn; -use zenith_utils::GIT_VERSION; - -fn main() -> Result<()> { - let arg_matches = App::new("Zenith update metadata utility") - .about("Dump or update metadata file") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Path to metadata file") - .required(true), - ) - .arg( - Arg::new("disk_lsn") - .short('d') - .long("disk_lsn") - .takes_value(true) - .help("Replace disk constistent lsn"), - ) - .arg( - Arg::new("prev_lsn") - .short('p') - .long("prev_lsn") - .takes_value(true) - .help("Previous record LSN"), - ) - .get_matches(); - - let path = PathBuf::from(arg_matches.value_of("path").unwrap()); - let metadata_bytes = std::fs::read(&path)?; - let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; - println!("Current metadata:\n{:?}", &meta); - - let mut update_meta = false; - - if let Some(disk_lsn) = arg_matches.value_of("disk_lsn") { - meta = TimelineMetadata::new( - Lsn::from_str(disk_lsn)?, - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - ); - update_meta = true; - } - - if let Some(prev_lsn) = arg_matches.value_of("prev_lsn") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - Some(Lsn::from_str(prev_lsn)?), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - ); - update_meta = true; - } - if update_meta { - let metadata_bytes = meta.to_bytes()?; - std::fs::write(&path, &metadata_bytes)?; - } - Ok(()) -} diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs deleted file mode 100644 index 8a411060de..0000000000 --- a/pageserver/src/branches.rs +++ /dev/null @@ -1,428 +0,0 @@ -//! -//! Branch management code -//! 
-// TODO: move all paths construction to conf impl -// - -use anyhow::{bail, Context, Result}; -use postgres_ffi::ControlFileData; -use serde::{Deserialize, Serialize}; -use std::{ - fs, - path::Path, - process::{Command, Stdio}, - str::FromStr, - sync::Arc, -}; -use tracing::*; - -use zenith_utils::crashsafe_dir; -use zenith_utils::logging; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; -use crate::{config::PageServerConf, repository::Repository}; -use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{repository::RepositoryTimeline, tenant_mgr}; - -#[derive(Serialize, Deserialize, Clone)] -pub struct BranchInfo { - pub name: String, - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, - pub latest_valid_lsn: Lsn, - pub ancestor_id: Option, - pub ancestor_lsn: Option, - pub current_logical_size: usize, - pub current_logical_size_non_incremental: Option, -} - -impl BranchInfo { - pub fn from_path>( - path: T, - repo: &Arc, - include_non_incremental_logical_size: bool, - ) -> Result { - let path = path.as_ref(); - let name = path.file_name().unwrap().to_string_lossy().to_string(); - let timeline_id = std::fs::read_to_string(path) - .with_context(|| { - format!( - "Failed to read branch file contents at path '{}'", - path.display() - ) - })? - .parse::()?; - - let timeline = match repo.get_timeline(timeline_id)? { - RepositoryTimeline::Local(local_entry) => local_entry, - RepositoryTimeline::Remote { .. } => { - bail!("Timeline {} is remote, no branches to display", timeline_id) - } - }; - - // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id - let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { - Some(ancestor_id) => ( - Some(ancestor_id.to_string()), - Some(timeline.get_ancestor_lsn().to_string()), - ), - None => (None, None), - }; - - // non incremental size calculation can be heavy, so let it be optional - // needed for tests to check size calculation - let current_logical_size_non_incremental = include_non_incremental_logical_size - .then(|| { - timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) - }) - .transpose()?; - - Ok(BranchInfo { - name, - timeline_id, - latest_valid_lsn: timeline.get_last_record_lsn(), - ancestor_id, - ancestor_lsn, - current_logical_size: timeline.get_current_logical_size(), - current_logical_size_non_incremental, - }) - } -} - -#[derive(Debug, Clone, Copy)] -pub struct PointInTime { - pub timelineid: ZTimelineId, - pub lsn: Lsn, -} - -pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> { - // Initialize logger - // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages - let _log_file = logging::init(LOG_FILE_NAME, true)?; - - // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo - // process during repository initialization. - // - // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched - // initdb in the background, and it kept running even after the "zenith init" had exited. - // In tests, we started the page server immediately after that, so that initdb was still - // running in the background, and we failed to run initdb again in the same directory. 
This - // has been solved for the rapid init+start case now, but the general race condition remains - // if you restart the server quickly. The WAL redo manager doesn't use a separate thread - // anymore, but I think that could still happen. - let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {}); - - if let Some(tenantid) = create_tenant { - let tenantid = ZTenantId::from_str(tenantid)?; - println!("initializing tenantid {}", tenantid); - create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?; - } - crashsafe_dir::create_dir_all(conf.tenants_path())?; - - println!("pageserver init succeeded"); - Ok(()) -} - -pub fn create_repo( - conf: &'static PageServerConf, - tenantid: ZTenantId, - wal_redo_manager: Arc, -) -> Result> { - let repo_dir = conf.tenant_path(&tenantid); - if repo_dir.exists() { - bail!("repo for {} already exists", tenantid) - } - - // top-level dir may exist if we are creating it through CLI - crashsafe_dir::create_dir_all(&repo_dir) - .with_context(|| format!("could not create directory {}", repo_dir.display()))?; - - crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?; - crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?; - crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?; - - info!("created directory structure in {}", repo_dir.display()); - - // create a new timeline directory - let timeline_id = ZTimelineId::generate(); - let timelinedir = conf.timeline_path(&timeline_id, &tenantid); - - crashsafe_dir::create_dir(&timelinedir)?; - - let repo = Arc::new(crate::layered_repository::LayeredRepository::new( - conf, - wal_redo_manager, - tenantid, - conf.remote_storage_config.is_some(), - )); - - // Load data into pageserver - // TODO To implement zenith import we need to - // move data loading out of create_repo() - bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?; - - Ok(repo) -} - -// Returns checkpoint LSN from controlfile -fn get_lsn_from_controlfile(path: &Path) -> Result { - // Read control file to extract the LSN - let controlfile_path = path.join("global").join("pg_control"); - let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?; - let lsn = controlfile.checkPoint; - - Ok(Lsn(lsn)) -} - -// Create the cluster temporarily in 'initdbpath' directory inside the repository -// to get bootstrap data for timeline initialization. -// -fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { - info!("running initdb in {}... ", initdbpath.display()); - - let initdb_path = conf.pg_bin_dir().join("initdb"); - let initdb_output = Command::new(initdb_path) - .args(&["-D", initdbpath.to_str().unwrap()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) - .arg("--no-instructions") - // This is only used for a temporary installation that is deleted shortly after, - // so no need to fsync it - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .stdout(Stdio::null()) - .output() - .context("failed to execute initdb")?; - if !initdb_output.status.success() { - anyhow::bail!( - "initdb failed: '{}'", - String::from_utf8_lossy(&initdb_output.stderr) - ); - } - - Ok(()) -} - -// -// - run initdb to init temporary instance and get bootstrap data -// - after initialization complete, remove the temp dir. 
-// -fn bootstrap_timeline( - conf: &'static PageServerConf, - tenantid: ZTenantId, - tli: ZTimelineId, - repo: &dyn Repository, -) -> Result<()> { - let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); - - // Init temporarily repo to get bootstrap data - run_initdb(conf, &initdb_path)?; - let pgdata_path = initdb_path; - - let lsn = get_lsn_from_controlfile(&pgdata_path)?.align(); - - // Import the contents of the data directory at the initial checkpoint - // LSN, and any WAL after that. - // Initdb lsn will be equal to last_record_lsn which will be set after import. - // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. - let timeline = repo.create_empty_timeline(tli, lsn)?; - import_datadir::import_timeline_from_postgres_datadir( - &pgdata_path, - timeline.writer().as_ref(), - lsn, - )?; - timeline.checkpoint(CheckpointConfig::Forced)?; - - println!( - "created initial timeline {} timeline.lsn {}", - tli, - timeline.get_last_record_lsn() - ); - - let data = tli.to_string(); - fs::write(conf.branch_path("main", &tenantid), data)?; - println!("created main branch"); - - // Remove temp dir. We don't need it anymore - fs::remove_dir_all(pgdata_path)?; - - Ok(()) -} - -pub(crate) fn get_branches( - conf: &PageServerConf, - tenantid: &ZTenantId, - include_non_incremental_logical_size: bool, -) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; - - // Each branch has a corresponding record (text file) in the refs/branches - // with timeline_id. - let branches_dir = conf.branches_path(tenantid); - - std::fs::read_dir(&branches_dir) - .with_context(|| { - format!( - "Found no branches directory '{}' for tenant {}", - branches_dir.display(), - tenantid - ) - })? - .map(|dir_entry_res| { - let dir_entry = dir_entry_res.with_context(|| { - format!( - "Failed to list branches directory '{}' content for tenant {}", - branches_dir.display(), - tenantid - ) - })?; - BranchInfo::from_path( - dir_entry.path(), - &repo, - include_non_incremental_logical_size, - ) - }) - .collect() -} - -pub(crate) fn create_branch( - conf: &PageServerConf, - branchname: &str, - startpoint_str: &str, - tenantid: &ZTenantId, -) -> Result { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; - - if conf.branch_path(branchname, tenantid).exists() { - anyhow::bail!("branch {} already exists", branchname); - } - - let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?; - let timeline = repo - .get_timeline(startpoint.timelineid)? - .local_timeline() - .context("Cannot branch off the timeline that's not present locally")?; - if startpoint.lsn == Lsn(0) { - // Find end of WAL on the old timeline - let end_of_wal = timeline.get_last_record_lsn(); - info!("branching at end of WAL: {}", end_of_wal); - startpoint.lsn = end_of_wal; - } else { - // Wait for the WAL to arrive and be processed on the parent branch up - // to the requested branch point. The repository code itself doesn't - // require it, but if we start to receive WAL on the new timeline, - // decoding the new WAL might need to look up previous pages, relation - // sizes etc. and that would get confused if the previous page versions - // are not in the repository yet. - timeline.wait_lsn(startpoint.lsn)?; - } - startpoint.lsn = startpoint.lsn.align(); - if timeline.get_ancestor_lsn() > startpoint.lsn { - // can we safely just branch from the ancestor instead? 
- anyhow::bail!( - "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}", - startpoint.lsn, - branchname, - timeline.get_ancestor_lsn() - ); - } - - let new_timeline_id = ZTimelineId::generate(); - - // Forward entire timeline creation routine to repository - // backend, so it can do all needed initialization - repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?; - - // Remember the human-readable branch name for the new timeline. - // FIXME: there's a race condition, if you create a branch with the same - // name concurrently. - let data = new_timeline_id.to_string(); - fs::write(conf.branch_path(branchname, tenantid), data)?; - - Ok(BranchInfo { - name: branchname.to_string(), - timeline_id: new_timeline_id, - latest_valid_lsn: startpoint.lsn, - ancestor_id: Some(startpoint.timelineid.to_string()), - ancestor_lsn: Some(startpoint.lsn.to_string()), - current_logical_size: 0, - current_logical_size_non_incremental: Some(0), - }) -} - -// -// Parse user-given string that represents a point-in-time. -// -// We support multiple variants: -// -// Raw timeline id in hex, meaning the end of that timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d -// -// A specific LSN on a timeline: -// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8 -// -// Same, with a human-friendly branch name: -// main -// main@2/15D3DD8 -// -// Human-friendly tag name: -// mytag -// -// -fn parse_point_in_time( - conf: &PageServerConf, - s: &str, - tenantid: &ZTenantId, -) -> Result { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Check if it's a tag - if lsn.is_none() { - let tagpath = conf.tag_path(name, tenantid); - if tagpath.exists() { - let pointstr = fs::read_to_string(tagpath)?; - - return parse_point_in_time(conf, &pointstr, tenantid); - } - } - - // Check if it's a branch - // Check if it's branch @ LSN - let branchpath = conf.branch_path(name, tenantid); - if branchpath.exists() { - let pointstr = fs::read_to_string(branchpath)?; - - let mut result = parse_point_in_time(conf, &pointstr, tenantid)?; - - result.lsn = lsn.unwrap_or(Lsn(0)); - return Ok(result); - } - - // Check if it's a timelineid - // Check if it's timelineid @ LSN - if let Ok(timelineid) = ZTimelineId::from_str(name) { - let tlipath = conf.timeline_path(&timelineid, tenantid); - if tlipath.exists() { - return Ok(PointInTime { - timelineid, - lsn: lsn.unwrap_or(Lsn(0)), - }); - } - } - - bail!("could not parse point-in-time {}", s); -} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8b65e7e2e6..f40b608da1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,48 +4,55 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. 
-use anyhow::{bail, ensure, Context, Result}; -use toml_edit; -use toml_edit::{Document, Item}; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -use std::convert::TryInto; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use remote_storage::RemoteStorageConfig; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::ConnectionId; + +use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; +use toml_edit; +use toml_edit::{Document, Item}; +use url::Url; +use utils::{ + id::{NodeId, TenantId, TimelineId}, + logging::LogFormat, + postgres_backend::AuthType, +}; -use crate::layered_repository::TIMELINES_SEGMENT_NAME; +use crate::tenant::TIMELINES_SEGMENT_NAME; +use crate::tenant_config::{TenantConf, TenantConfOpt}; + +/// The name of the metadata file pageserver creates per timeline. +pub const METADATA_FILE_NAME: &str = "metadata"; +pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +const TENANT_CONFIG_NAME: &str = "config"; pub mod defaults { + use crate::tenant_config::defaults::*; use const_format::formatcp; - pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); - - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s"; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + pub use pageserver_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; - pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100; - pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = + super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + /// /// Default built-in configuration file. 
/// @@ -56,12 +63,6 @@ pub mod defaults { #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}' - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} - #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' @@ -70,6 +71,22 @@ pub mod defaults { # initial superuser role name to use when creating a new tenant #initial_superuser_name = '{DEFAULT_SUPERUSER}' +#log_format = '{DEFAULT_LOG_FORMAT}' + +#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' + +# [tenant_config] +#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes +#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} +#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes +#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' +#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}' + +#gc_period = '{DEFAULT_GC_PERIOD}' +#gc_horizon = {DEFAULT_GC_HORIZON} +#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} +#pitr_interval = '{DEFAULT_PITR_INTERVAL}' + # [remote_storage] "### @@ -78,20 +95,15 @@ pub mod defaults { #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { + // Identifier of that particular pageserver so e g safekeepers + // can safely distinguish different pageservers + pub id: NodeId, + /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, /// Example (default): 127.0.0.1:9898 pub listen_http_addr: String, - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - pub checkpoint_distance: u64, - pub checkpoint_period: Duration, - - pub gc_horizon: u64, - pub gc_period: Duration, - // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. pub wait_lsn_timeout: Duration, // How long to wait for WAL redo to complete. @@ -116,62 +128,252 @@ pub struct PageServerConf { pub auth_validation_public_key_path: Option, pub remote_storage_config: Option, + + pub profiling: ProfilingConfig, + pub default_tenant_conf: TenantConf, + + /// A prefix to add in etcd brokers before every key. + /// Can be used for isolating different pageserver groups within the same etcd cluster. + pub broker_etcd_prefix: String, + + /// Etcd broker endpoints to connect to. + pub broker_endpoints: Vec, + + pub log_format: LogFormat, + + /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. + pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, } -/// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct RemoteStorageConfig { - /// Max allowed number of concurrent sync operations between pageserver and the remote storage. - pub max_concurrent_sync: NonZeroUsize, - /// Max allowed errors before the sync task is considered failed and evicted. - pub max_sync_errors: NonZeroU32, - /// The storage connection configuration. - pub storage: RemoteStorageKind, +pub enum ProfilingConfig { + Disabled, + PageRequests, } -/// A kind of a remote storage to connect to, with its connection configuration. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum RemoteStorageKind { - /// Storage based on local file system. - /// Specify a root folder to place all stored relish data into. 
- LocalFs(PathBuf), - /// AWS S3 based storage, storing all relishes into the root - /// of the S3 bucket from the config. - AwsS3(S3Config), +impl FromStr for ProfilingConfig { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + let result = match s { + "disabled" => ProfilingConfig::Disabled, + "page_requests" => ProfilingConfig::PageRequests, + _ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""), + }; + Ok(result) + } } -/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). -#[derive(Clone, PartialEq, Eq)] -pub struct S3Config { - /// Name of the bucket to connect to. - pub bucket_name: String, - /// The region where the bucket is located at. - pub bucket_region: String, - /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once. - pub prefix_in_bucket: Option, - /// "Login" to use when connecting to bucket. - /// Can be empty for cases like AWS k8s IAM - /// where we can allow certain pods to connect - /// to the bucket directly without any credentials. - pub access_key_id: Option, - /// "Password" to use when connecting to bucket. - pub secret_access_key: Option, - /// A base URL to send S3 requests to. - /// By default, the endpoint is derived from a region name, assuming it's - /// an AWS S3 region name, erroring on wrong region name. - /// Endpoint provides a way to support other S3 flavors and their regions. - /// - /// Example: `http://127.0.0.1:5000` - pub endpoint: Option, +// use dedicated enum for builder to better indicate the intention +// and avoid possible confusion with nested options +pub enum BuilderValue { + Set(T), + NotSet, } -impl std::fmt::Debug for S3Config { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("S3Config") - .field("bucket_name", &self.bucket_name) - .field("bucket_region", &self.bucket_region) - .field("prefix_in_bucket", &self.prefix_in_bucket) - .finish() +impl BuilderValue { + pub fn ok_or(self, err: E) -> Result { + match self { + Self::Set(v) => Ok(v), + Self::NotSet => Err(err), + } + } +} + +// needed to simplify config construction +struct PageServerConfigBuilder { + listen_pg_addr: BuilderValue, + + listen_http_addr: BuilderValue, + + wait_lsn_timeout: BuilderValue, + wal_redo_timeout: BuilderValue, + + superuser: BuilderValue, + + page_cache_size: BuilderValue, + max_file_descriptors: BuilderValue, + + workdir: BuilderValue, + + pg_distrib_dir: BuilderValue, + + auth_type: BuilderValue, + + // + auth_validation_public_key_path: BuilderValue>, + remote_storage_config: BuilderValue>, + + id: BuilderValue, + + profiling: BuilderValue, + broker_etcd_prefix: BuilderValue, + broker_endpoints: BuilderValue>, + + log_format: BuilderValue, + + concurrent_tenant_size_logical_size_queries: BuilderValue, +} + +impl Default for PageServerConfigBuilder { + fn default() -> Self { + use self::BuilderValue::*; + use defaults::*; + Self { + listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), + wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: Set(DEFAULT_SUPERUSER.to_string()), + page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: 
Set(DEFAULT_MAX_FILE_DESCRIPTORS), + workdir: Set(PathBuf::new()), + pg_distrib_dir: Set(env::current_dir() + .expect("cannot access current directory") + .join("pg_install")), + auth_type: Set(AuthType::Trust), + auth_validation_public_key_path: Set(None), + remote_storage_config: Set(None), + id: NotSet, + profiling: Set(ProfilingConfig::Disabled), + broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), + broker_endpoints: Set(Vec::new()), + log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + } + } +} + +impl PageServerConfigBuilder { + pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { + self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) + } + + pub fn listen_http_addr(&mut self, listen_http_addr: String) { + self.listen_http_addr = BuilderValue::Set(listen_http_addr) + } + + pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { + self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) + } + + pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { + self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) + } + + pub fn superuser(&mut self, superuser: String) { + self.superuser = BuilderValue::Set(superuser) + } + + pub fn page_cache_size(&mut self, page_cache_size: usize) { + self.page_cache_size = BuilderValue::Set(page_cache_size) + } + + pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { + self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) + } + + pub fn workdir(&mut self, workdir: PathBuf) { + self.workdir = BuilderValue::Set(workdir) + } + + pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) { + self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) + } + + pub fn auth_type(&mut self, auth_type: AuthType) { + self.auth_type = BuilderValue::Set(auth_type) + } + + pub fn auth_validation_public_key_path( + &mut self, + auth_validation_public_key_path: Option, + ) { + self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) + } + + pub fn remote_storage_config(&mut self, remote_storage_config: Option) { + self.remote_storage_config = BuilderValue::Set(remote_storage_config) + } + + pub fn broker_endpoints(&mut self, broker_endpoints: Vec) { + self.broker_endpoints = BuilderValue::Set(broker_endpoints) + } + + pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) { + self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) + } + + pub fn id(&mut self, node_id: NodeId) { + self.id = BuilderValue::Set(node_id) + } + + pub fn profiling(&mut self, profiling: ProfilingConfig) { + self.profiling = BuilderValue::Set(profiling) + } + + pub fn log_format(&mut self, log_format: LogFormat) { + self.log_format = BuilderValue::Set(log_format) + } + + pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) { + self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); + } + + pub fn build(self) -> anyhow::Result { + let broker_endpoints = self + .broker_endpoints + .ok_or(anyhow!("No broker endpoints provided"))?; + + Ok(PageServerConf { + listen_pg_addr: self + .listen_pg_addr + .ok_or(anyhow!("missing listen_pg_addr"))?, + listen_http_addr: self + .listen_http_addr + .ok_or(anyhow!("missing listen_http_addr"))?, + wait_lsn_timeout: self + .wait_lsn_timeout + .ok_or(anyhow!("missing wait_lsn_timeout"))?, + wal_redo_timeout: self + .wal_redo_timeout + .ok_or(anyhow!("missing 
wal_redo_timeout"))?, + superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, + page_cache_size: self + .page_cache_size + .ok_or(anyhow!("missing page_cache_size"))?, + max_file_descriptors: self + .max_file_descriptors + .ok_or(anyhow!("missing max_file_descriptors"))?, + workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, + pg_distrib_dir: self + .pg_distrib_dir + .ok_or(anyhow!("missing pg_distrib_dir"))?, + auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?, + auth_validation_public_key_path: self + .auth_validation_public_key_path + .ok_or(anyhow!("missing auth_validation_public_key_path"))?, + remote_storage_config: self + .remote_storage_config + .ok_or(anyhow!("missing remote_storage_config"))?, + id: self.id.ok_or(anyhow!("missing id"))?, + profiling: self.profiling.ok_or(anyhow!("missing profiling"))?, + // TenantConf is handled separately + default_tenant_conf: TenantConf::default(), + broker_endpoints, + broker_etcd_prefix: self + .broker_etcd_prefix + .ok_or(anyhow!("missing broker_etcd_prefix"))?, + log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, + concurrent_tenant_size_logical_size_queries: self + .concurrent_tenant_size_logical_size_queries + .ok_or(anyhow!( + "missing concurrent_tenant_size_logical_size_queries" + ))?, + }) } } @@ -184,111 +386,149 @@ impl PageServerConf { self.workdir.join("tenants") } - pub fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenants_path().join(tenantid.to_string()) + pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenants_path().join(tenant_id.to_string()) } - pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("tags") + /// Points to a place in pageserver's local directory, + /// where certain tenant's tenantconf file should be located. 
+ pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf { + self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME) } - pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.tags_path(tenantid).join(tag_name) + pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME) } - pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join("refs").join("branches") + pub fn timeline_path(&self, timeline_id: &TimelineId, tenant_id: &TenantId) -> PathBuf { + self.timelines_path(tenant_id).join(timeline_id.to_string()) } - pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf { - self.branches_path(tenantid).join(branch_name) + pub fn timeline_uninit_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&timeline_id, &tenant_id), + TIMELINE_UNINIT_MARK_SUFFIX, + ) } - pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf { - self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME) + pub fn traces_path(&self) -> PathBuf { + self.workdir.join("traces") } - pub fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timelines_path(tenantid).join(timelineid.to_string()) + pub fn trace_path( + &self, + tenant_id: &TenantId, + timeline_id: &TimelineId, + connection_id: &ConnectionId, + ) -> PathBuf { + self.traces_path() + .join(tenant_id.to_string()) + .join(timeline_id.to_string()) + .join(connection_id.to_string()) } - pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf { - self.timeline_path(timelineid, tenantid).join("ancestor") + /// Points to a place in pageserver's local directory, + /// where certain timeline's metadata file should be located. + pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf { + self.timeline_path(&timeline_id, &tenant_id) + .join(METADATA_FILE_NAME) } // // Postgres distribution paths // + pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { + let path = self.pg_distrib_dir.clone(); - pub fn pg_bin_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("bin") + match pg_version { + 14 => Ok(path.join(format!("v{pg_version}"))), + 15 => Ok(path.join(format!("v{pg_version}"))), + _ => bail!("Unsupported postgres version: {}", pg_version), + } } - pub fn pg_lib_dir(&self) -> PathBuf { - self.pg_distrib_dir.join("lib") + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { + match pg_version { + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")), + _ => bail!("Unsupported postgres version: {}", pg_version), + } + } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> { + match pg_version { + 14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + 15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")), + _ => bail!("Unsupported postgres version: {}", pg_version), + } } /// Parse a configuration file (pageserver.toml) into a PageServerConf struct, /// validating the input and failing on errors. /// /// This leaves any options not present in the file in the built-in defaults. 
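+ /// The tests below exercise a minimal configuration of roughly this shape
+ /// (values are illustrative):
+ ///
+ /// ```text
+ /// id = 10
+ /// broker_endpoints = ['http://127.0.0.1:7777']
+ /// pg_distrib_dir = '/path/to/pg_install'
+ /// ```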
- pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result { - use defaults::*; + pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result { + let mut builder = PageServerConfigBuilder::default(); + builder.workdir(workdir.to_owned()); - let mut conf = PageServerConf { - workdir: workdir.to_path_buf(), - - listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(), - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?, - wait_lsn_timeout: humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)?, - page_cache_size: DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS, - - pg_distrib_dir: PathBuf::new(), - auth_validation_public_key_path: None, - auth_type: AuthType::Trust, - - remote_storage_config: None, - - superuser: DEFAULT_SUPERUSER.to_string(), - }; + let mut t_conf: TenantConfOpt = Default::default(); for (key, item) in toml.iter() { match key { - "listen_pg_addr" => conf.listen_pg_addr = parse_toml_string(key, item)?, - "listen_http_addr" => conf.listen_http_addr = parse_toml_string(key, item)?, - "checkpoint_distance" => conf.checkpoint_distance = parse_toml_u64(key, item)?, - "checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?, - "gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?, - "gc_period" => conf.gc_period = parse_toml_duration(key, item)?, - "wait_lsn_timeout" => conf.wait_lsn_timeout = parse_toml_duration(key, item)?, - "wal_redo_timeout" => conf.wal_redo_timeout = parse_toml_duration(key, item)?, - "initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?, - "page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? as usize, + "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), + "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), + "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), + "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), + "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), + "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), "max_file_descriptors" => { - conf.max_file_descriptors = parse_toml_u64(key, item)? as usize + builder.max_file_descriptors(parse_toml_u64(key, item)? as usize) } "pg_distrib_dir" => { - conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?) + builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?)) } - "auth_validation_public_key_path" => { - conf.auth_validation_public_key_path = - Some(PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_type" => conf.auth_type = parse_toml_auth_type(key, item)?, + "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( + PathBuf::from(parse_toml_string(key, item)?), + )), + "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - conf.remote_storage_config = Some(Self::parse_remote_storage_config(item)?) 
+ builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) } - _ => bail!("unrecognized pageserver option '{}'", key), + "tenant_config" => { + t_conf = Self::parse_toml_tenant_conf(item)?; + } + "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), + "profiling" => builder.profiling(parse_toml_from_str(key, item)?), + "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), + "broker_endpoints" => builder.broker_endpoints( + parse_toml_array(key, item)? + .into_iter() + .map(|endpoint_str| { + endpoint_str.parse::().with_context(|| { + format!("Array item {endpoint_str} for key {key} is not a valid url endpoint") + }) + }) + .collect::>()?, + ), + "log_format" => builder.log_format( + LogFormat::from_config(&parse_toml_string(key, item)?)? + ), + "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ + let input = parse_toml_string(key, item)?; + let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; + let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; + ConfigurableSemaphore::new(permits) + }), + _ => bail!("unrecognized pageserver option '{key}'"), } } - if conf.auth_type == AuthType::ZenithJWT { + let mut conf = builder.build().context("invalid config")?; + + if conf.auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf .auth_validation_public_key_path .get_or_insert_with(|| workdir.join("auth_public_key.pem")); @@ -301,119 +541,101 @@ impl PageServerConf { ); } - if conf.pg_distrib_dir == PathBuf::new() { - conf.pg_distrib_dir = env::current_dir()?.join("tmp_install") - }; - if !conf.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - conf.pg_distrib_dir.display() - ); - } + conf.default_tenant_conf = t_conf.merge(TenantConf::default()); Ok(conf) } - /// subroutine of parse_config(), to parse the `[remote_storage]` table. - fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); + // subroutine of parse_and_validate to parse `[tenant_conf]` section - let max_concurrent_sync: NonZeroUsize = if let Some(s) = toml.get("max_concurrent_sync") { - parse_toml_u64("max_concurrent_sync", s) - .and_then(|toml_u64| { - toml_u64.try_into().with_context(|| { - format!("'max_concurrent_sync' value {} is too large", toml_u64) - }) - }) - .ok() - .and_then(NonZeroUsize::new) - .context("'max_concurrent_sync' must be a non-zero positive integer")? - } else { - NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap() - }; - let max_sync_errors: NonZeroU32 = if let Some(s) = toml.get("max_sync_errors") { - parse_toml_u64("max_sync_errors", s) - .and_then(|toml_u64| { - toml_u64.try_into().with_context(|| { - format!("'max_sync_errors' value {} is too large", toml_u64) - }) - }) - .ok() - .and_then(NonZeroU32::new) - .context("'max_sync_errors' must be a non-zero positive integer")? 
- } else { - NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap() - }; + pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result { + let mut t_conf: TenantConfOpt = Default::default(); + if let Some(checkpoint_distance) = item.get("checkpoint_distance") { + t_conf.checkpoint_distance = + Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?); + } - let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), - (_, Some(_), None) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_)) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - access_key_id: toml - .get("access_key_id") - .map(|access_key_id| parse_toml_string("access_key_id", access_key_id)) - .transpose()?, - secret_access_key: toml - .get("secret_access_key") - .map(|secret_access_key| { - parse_toml_string("secret_access_key", secret_access_key) - }) - .transpose()?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), - }; + if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") { + t_conf.checkpoint_timeout = Some(parse_toml_duration( + "checkpoint_timeout", + checkpoint_timeout, + )?); + } - Ok(RemoteStorageConfig { - max_concurrent_sync, - max_sync_errors, - storage, - }) + if let Some(compaction_target_size) = item.get("compaction_target_size") { + t_conf.compaction_target_size = Some(parse_toml_u64( + "compaction_target_size", + compaction_target_size, + )?); + } + + if let Some(compaction_period) = item.get("compaction_period") { + t_conf.compaction_period = + Some(parse_toml_duration("compaction_period", compaction_period)?); + } + + if let Some(compaction_threshold) = item.get("compaction_threshold") { + t_conf.compaction_threshold = + Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?); + } + + if let Some(gc_horizon) = item.get("gc_horizon") { + t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?); + } + + if let Some(gc_period) = item.get("gc_period") { + t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?); + } + + if let Some(pitr_interval) = item.get("pitr_interval") { + t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); + } + if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") { + t_conf.walreceiver_connect_timeout = Some(parse_toml_duration( + "walreceiver_connect_timeout", + walreceiver_connect_timeout, + )?); + } + if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") { + t_conf.lagging_wal_timeout = Some(parse_toml_duration( + "lagging_wal_timeout", + lagging_wal_timeout, + )?); + } + if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { + t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", 
max_lsn_wal_lag)?); + } + + Ok(t_conf) } #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { - PathBuf::from(format!("../tmp_check/test_{}", test_name)) + PathBuf::from(format!("../tmp_check/test_{test_name}")) } #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: Duration::from_secs(10), - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), + id: NodeId(0), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - superuser: "zenith_admin".to_string(), + superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir: PathBuf::new(), auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::dummy_conf(), + broker_endpoints: Vec::new(), + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), } } } @@ -423,7 +645,7 @@ impl PageServerConf { fn parse_toml_string(name: &str, item: &Item) -> Result { let s = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; + .with_context(|| format!("configure option {name} is not a string"))?; Ok(s.to_string()) } @@ -432,9 +654,9 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { // for our use, though. let i: i64 = item .as_integer() - .with_context(|| format!("configure option {} is not an integer", name))?; + .with_context(|| format!("configure option {name} is not an integer"))?; if i < 0 { - bail!("configure option {} cannot be negative", name); + bail!("configure option {name} cannot be negative"); } Ok(i as u64) } @@ -442,25 +664,107 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; + .with_context(|| format!("configure option {name} is not a string"))?; Ok(humantime::parse_duration(s)?) } -fn parse_toml_auth_type(name: &str, item: &Item) -> Result { +fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result +where + T: FromStr, + ::Err: std::fmt::Display, +{ let v = item .as_str() - .with_context(|| format!("configure option {} is not a string", name))?; - AuthType::from_str(v) + .with_context(|| format!("configure option {name} is not a string"))?; + T::from_str(v).map_err(|e| { + anyhow!( + "Failed to parse string as {parse_type} for configure option {name}: {e}", + parse_type = stringify!(T) + ) + }) +} + +fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { + let array = item + .as_array() + .with_context(|| format!("configure option {name} is not an array"))?; + + array + .iter() + .map(|value| { + value + .as_str() + .map(str::to_string) + .with_context(|| format!("Array item {value:?} for key {name} is not a string")) + }) + .collect() +} + +/// Configurable semaphore permits setting. 
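+///
+/// A rough usage sketch (illustrative only; `new` and `inner` are defined below,
+/// `acquire_owned` is tokio's own semaphore API):
+///
+/// ```ignore
+/// let semaphore = ConfigurableSemaphore::new(NonZeroUsize::new(4).unwrap());
+/// let _permit = semaphore.inner().clone().acquire_owned().await?;
+/// ```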
+/// +/// Does not allow semaphore permits to be zero, because at runtime a semaphore with initially +/// zero permits cannot be distinguished from an empty one, leading any feature using it to await +/// forever (or until new permits are added). +#[derive(Debug, Clone)] +pub struct ConfigurableSemaphore { + initial_permits: NonZeroUsize, + inner: std::sync::Arc<tokio::sync::Semaphore>, +} + +impl ConfigurableSemaphore { + pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) { + Some(x) => x, + None => panic!("const unwrap is not yet stable"), + }; + + /// Initialize using a non-zero amount of permits. + /// + /// Requires a non-zero number of initial permits, because using permits == 0 is a crude way to disable a + /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore-using future would + /// behave like [`futures::future::pending`], just waiting until new permits are added. + pub fn new(initial_permits: NonZeroUsize) -> Self { + ConfigurableSemaphore { + initial_permits, + inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())), + } + } +} + +impl Default for ConfigurableSemaphore { + fn default() -> Self { + Self::new(Self::DEFAULT_INITIAL) + } +} + +impl PartialEq for ConfigurableSemaphore { + fn eq(&self, other: &Self) -> bool { + // the number of permits can be increased at runtime, so we cannot really fulfill the + // PartialEq value equality otherwise + self.initial_permits == other.initial_permits + } +} + +impl Eq for ConfigurableSemaphore {} + +impl ConfigurableSemaphore { + pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> { + &self.inner + } } #[cfg(test)] mod tests { - use std::fs; + use std::{ + fs, + num::{NonZeroU32, NonZeroUsize}, + }; + use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; use super::*; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -468,12 +772,6 @@ mod tests { listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' -checkpoint_distance = 111 # in bytes -checkpoint_period = '111 s' - -gc_period = '222 s' -gc_horizon = 222 - wait_lsn_timeout = '111 s' wal_redo_timeout = '111 s' @@ -482,31 +780,33 @@ max_file_descriptors = 333 # initial superuser role name to use when creating a new tenant initial_superuser_name = 'zzzz' +id = 10 - "#; +log_format = 'json' + +"#; #[test] fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - // we have to create dummy pathes to overcome the validation errors - let config_string = format!("pg_distrib_dir='{}'", pg_distrib_dir.display()); + let broker_endpoint = "http://127.0.0.1:7777"; + // we have to create dummy values to overcome the validation errors + let config_string = format!( + "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", + pg_distrib_dir.display() + ); let toml = config_string.parse()?; - let parsed_config = - PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }); + let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, PageServerConf { + id: NodeId(10), listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - 
checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, superuser: defaults::DEFAULT_SUPERUSER.to_string(), @@ -517,6 +817,14 @@ initial_superuser_name = 'zzzz' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -528,28 +836,23 @@ initial_superuser_name = 'zzzz' fn parse_basic_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let config_string = format!( - "{}pg_distrib_dir='{}'", - ALL_BASE_VALUES_TOML, + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", pg_distrib_dir.display() ); let toml = config_string.parse()?; - let parsed_config = - PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) - }); + let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir) + .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); assert_eq!( parsed_config, PageServerConf { + id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), - checkpoint_distance: 111, - checkpoint_period: Duration::from_secs(111), - gc_horizon: 222, - gc_period: Duration::from_secs(222), wait_lsn_timeout: Duration::from_secs(111), wal_redo_timeout: Duration::from_secs(111), superuser: "zzzz".to_string(), @@ -560,6 +863,14 @@ initial_superuser_name = 'zzzz' auth_type: AuthType::Trust, auth_validation_public_key_path: None, remote_storage_config: None, + profiling: ProfilingConfig::Disabled, + default_tenant_conf: TenantConf::default(), + broker_endpoints: vec![broker_endpoint + .parse() + .expect("Failed to parse a valid broker endpoint URL")], + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + log_format: LogFormat::Json, + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), }, "Should be able to parse all basic config values correctly" ); @@ -571,6 +882,7 @@ initial_superuser_name = 'zzzz' fn parse_remote_fs_storage_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; + let broker_endpoint = "http://127.0.0.1:7777"; let local_storage_path = tempdir.path().join("local_remote_storage"); @@ -588,37 +900,36 @@ local_path = '{}'"#, for remote_storage_config_str in identical_toml_declarations { let config_string = format!( - r#"{} + r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] -{}"#, - ALL_BASE_VALUES_TOML, +{remote_storage_config_str}"#, pg_distrib_dir.display(), - remote_storage_config_str, ); let 
toml = config_string.parse()?; let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) .unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) + panic!("Failed to parse config '{config_string}', reason: {e:?}") }) .remote_storage_config .expect("Should have remote storage config for the local FS"); assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - max_concurrent_sync: NonZeroUsize::new( - defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC - ) - .unwrap(), - max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) - .unwrap(), - storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); + parsed_remote_storage_config, + RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new( + remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS + ) + .unwrap(), + max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS) + .unwrap(), + storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + }, + "Remote storage config should correctly parse the local FS config and fill other storage defaults" + ); } Ok(()) } @@ -631,47 +942,44 @@ pg_distrib_dir='{}' let bucket_name = "some-sample-bucket".to_string(); let bucket_region = "eu-north-1".to_string(); let prefix_in_bucket = "test_prefix".to_string(); - let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string(); - let secret_access_key = "SOMEsEcReTsd292v".to_string(); let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_sync = NonZeroUsize::new(111).unwrap(); + let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); let max_sync_errors = NonZeroU32::new(222).unwrap(); + let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); + let broker_endpoint = "http://127.0.0.1:7777"; let identical_toml_declarations = &[ format!( r#"[remote_storage] -max_concurrent_sync = {} -max_sync_errors = {} -bucket_name = '{}' -bucket_region = '{}' -prefix_in_bucket = '{}' -access_key_id = '{}' -secret_access_key = '{}' -endpoint = '{}'"#, - max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint +max_concurrent_syncs = {max_concurrent_syncs} +max_sync_errors = {max_sync_errors} +bucket_name = '{bucket_name}' +bucket_region = '{bucket_region}' +prefix_in_bucket = '{prefix_in_bucket}' +endpoint = '{endpoint}' +concurrency_limit = {s3_concurrency_limit}"# ), format!( - "remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}", - max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint + "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ + bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", ), ]; for remote_storage_config_str in identical_toml_declarations { let config_string = format!( - r#"{} + r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' +broker_endpoints = ['{broker_endpoint}'] -{}"#, - ALL_BASE_VALUES_TOML, +{remote_storage_config_str}"#, pg_distrib_dir.display(), - remote_storage_config_str, ); let toml = config_string.parse()?; let 
parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir) .unwrap_or_else(|e| { - panic!("Failed to parse config '{}', reason: {}", config_string, e) + panic!("Failed to parse config '{config_string}', reason: {e:?}") }) .remote_storage_config .expect("Should have remote storage config for S3"); @@ -679,15 +987,14 @@ pg_distrib_dir='{}' assert_eq!( parsed_remote_storage_config, RemoteStorageConfig { - max_concurrent_sync, + max_concurrent_syncs, max_sync_errors, storage: RemoteStorageKind::AwsS3(S3Config { bucket_name: bucket_name.clone(), bucket_region: bucket_region.clone(), - access_key_id: Some(access_key_id.clone()), - secret_access_key: Some(secret_access_key.clone()), prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()) + endpoint: Some(endpoint.clone()), + concurrency_limit: s3_concurrency_limit, }), }, "Remote storage config should correctly parse the S3 config" @@ -703,8 +1010,9 @@ pg_distrib_dir='{}' fs::create_dir_all(&workdir)?; let pg_distrib_dir = tempdir_path.join("pg_distrib"); - fs::create_dir_all(&pg_distrib_dir)?; - let postgres_bin_dir = pg_distrib_dir.join("bin"); + let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); + fs::create_dir_all(&pg_distrib_dir_versioned)?; + let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); fs::create_dir_all(&postgres_bin_dir)?; fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; diff --git a/pageserver/src/http/mod.rs b/pageserver/src/http/mod.rs index 4c0be17ecd..1c083bd382 100644 --- a/pageserver/src/http/mod.rs +++ b/pageserver/src/http/mod.rs @@ -1,3 +1,4 @@ -pub mod models; pub mod routes; pub use routes::make_router; + +pub use pageserver_api::models; diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs deleted file mode 100644 index 6ce377c535..0000000000 --- a/pageserver/src/http/models.rs +++ /dev/null @@ -1,17 +0,0 @@ -use serde::{Deserialize, Serialize}; - -use crate::ZTenantId; - -#[derive(Serialize, Deserialize)] -pub struct BranchCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - pub name: String, - pub start_point: String, -} - -#[derive(Serialize, Deserialize)] -pub struct TenantCreateRequest { - #[serde(with = "hex")] - pub tenant_id: ZTenantId, -} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index dcb81849e0..1bb5f94f4e 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1,7 +1,11 @@ openapi: "3.0.2" info: title: Page Server API + description: Neon Pageserver API version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: - url: "" paths: @@ -17,7 +21,13 @@ paths: application/json: schema: type: object - /v1/timeline/{tenant_id}: + required: + - id + properties: + id: + type: integer + + /v1/tenant/{tenant_id}: parameters: - name: tenant_id in: path @@ -26,18 +36,68 @@ paths: type: string format: hex get: - description: List tenant timelines + description: Get tenant status responses: "200": - description: array of brief timeline descriptions + description: Currently returns the flag whether the tenant has inprogress timeline downloads + content: + application/json: + schema: + $ref: "#/components/schemas/TenantInfo" + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + 
content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental + get: + description: Get timelines for tenant + responses: + "200": + description: TimelineInfo content: application/json: schema: type: array items: - # currently, just a timeline id string, but when remote index gets to be accessed - # remote/local timeline field would be added at least - type: string + $ref: "#/components/schemas/TimelineInfo" "400": description: Error when no tenant id found in path content: @@ -62,7 +122,8 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/timeline/{tenant_id}/{timeline_id}: + + /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id in: path @@ -77,7 +138,18 @@ paths: type: string format: hex get: - description: Get timeline info for tenant's remote timeline + description: Get info about the timeline + parameters: + - name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental + - name: include-non-incremental-physical-size + in: query + schema: + type: string + description: Controls calculation of current_physical_size_non_incremental responses: "200": description: TimelineInfo @@ -86,7 +158,7 @@ paths: schema: $ref: "#/components/schemas/TimelineInfo" "400": - description: Error when no tenant id found in path or no branch name + description: Error when no tenant id found in path or no timeline id content: application/json: schema: @@ -109,7 +181,37 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}: + delete: + description: "Attempts to delete specified timeline. 
On 500 errors should be retried" + responses: + "200": + description: Ok + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: - name: tenant_id in: path @@ -117,24 +219,31 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query + - name: timeline_id + in: path + required: true schema: type: string - description: Controls calculation of current_logical_size_non_incremental + format: hex get: - description: Get branches for tenant + description: Get LSN by a timestamp + parameters: + - name: timestamp + in: query + required: true + schema: + type: string + format: date-time + description: A timestamp to get the LSN responses: "200": - description: BranchInfo + description: OK content: application/json: schema: - type: array - items: - $ref: "#/components/schemas/BranchInfo" + type: string "400": - description: Error when no tenant id found in path + description: Error when no tenant id found in path, no timeline id or invalid timestamp content: application/json: schema: @@ -157,7 +266,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}/{branch_name}: + /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id in: path @@ -165,78 +274,173 @@ paths: schema: type: string format: hex - - name: branch_name - in: path - required: true - schema: - type: string - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path or no branch name - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/: post: - description: Create branch + description: Schedules attach operation to happen in the background for given tenant + responses: + "202": + description: Tenant attaching scheduled + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Timeline not found + content: + application/json: + schema: + $ref: 
"#/components/schemas/NotFoundError" + "409": + description: Tenant download is already in progress + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/detach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Detach local tenant + responses: + "200": + description: Tenant detached + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/size: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: | + Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). + responses: + "200": + description: OK, + content: + application/json: + schema: + type: object + required: + - id + - size + properties: + id: + type: string + format: hex + size: + type: integer + description: | + Size metric in bytes. + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/timeline/: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: | + Create a timeline. Returns new timeline id on success.\ + If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. + If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. requestBody: content: application/json: schema: type: object - required: - - "tenant_id" - - "name" - - "start_point" properties: - tenant_id: + new_timeline_id: type: string format: hex - name: + ancestor_timeline_id: type: string - start_point: + format: hex + ancestor_start_lsn: type: string + format: hex + pg_version: + type: integer responses: "201": - description: BranchInfo + description: TimelineInfo content: application/json: schema: - $ref: "#/components/schemas/BranchInfo" + $ref: "#/components/schemas/TimelineInfo" "400": - description: Malformed branch create request + description: Malformed timeline create request content: application/json: schema: @@ -253,6 +457,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Timeline already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" "500": description: Generic operation error content: @@ -290,29 +500,72 @@ paths: schema: $ref: "#/components/schemas/Error" post: - description: Create tenant + description: | + Create a tenant. 
Returns new tenant id on success.\ + If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. requestBody: content: application/json: schema: - type: object - required: - - "tenant_id" - properties: - tenant_id: - type: string - format: hex + $ref: "#/components/schemas/TenantCreateInfo" responses: "201": - description: CREATED + description: New tenant created successfully + content: + application/json: + schema: + type: string + format: hex + "400": + description: Malformed tenant create request + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "409": + description: Tenant already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/config: + put: + description: | + Update tenant's config. + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TenantConfigInfo" + responses: + "200": + description: OK content: application/json: schema: type: array items: - type: string + $ref: "#/components/schemas/TenantInfo" "400": - description: Malformed tenant create request + description: Malformed tenant config request content: application/json: schema: @@ -335,7 +588,6 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" - components: securitySchemes: JWT: @@ -352,40 +604,69 @@ components: id: type: string state: - type: string - BranchInfo: + oneOf: + - type: string + - type: object + properties: + background_jobs_running: + type: boolean + + current_physical_size: + type: integer + has_in_progress_downloads: + type: boolean + TenantCreateInfo: type: object - required: - - name - - timeline_id - - latest_valid_lsn - - current_logical_size properties: - name: - type: string - timeline_id: + new_tenant_id: type: string format: hex - ancestor_id: + tenant_id: type: string format: hex - ancestor_lsn: + gc_period: type: string - current_logical_size: + gc_horizon: type: integer - current_logical_size_non_incremental: + pitr_interval: + type: string + checkpoint_distance: type: integer - latest_valid_lsn: + checkpoint_timeout: + type: string + compaction_period: + type: string + compaction_threshold: + type: string + TenantConfigInfo: + type: object + properties: + tenant_id: + type: string + format: hex + gc_period: + type: string + gc_horizon: type: integer + pitr_interval: + type: string + checkpoint_distance: + type: integer + checkpoint_timeout: + type: string + compaction_period: + type: string + compaction_threshold: + type: string TimelineInfo: type: object required: - timeline_id - tenant_id - last_record_lsn - - prev_record_lsn - - start_lsn - disk_consistent_lsn + - awaits_download + - state properties: timeline_id: type: string @@ -393,20 +674,74 @@ components: tenant_id: type: string format: hex + last_record_lsn: + type: string + format: hex + disk_consistent_lsn: + type: string + format: hex + remote_consistent_lsn: + type: string + format: hex ancestor_timeline_id: type: string format: hex - last_record_lsn: + ancestor_lsn: type: string + format: hex prev_record_lsn: type: string - start_lsn: + format: 
hex + current_logical_size: + type: integer + current_physical_size: + type: integer + current_logical_size_non_incremental: + type: integer + current_physical_size_non_incremental: + type: integer + wal_source_connstr: type: string - disk_consistent_lsn: + last_received_msg_lsn: type: string - timeline_state: + format: hex + last_received_msg_ts: + type: integer + awaits_download: + type: boolean + state: type: string + # These 'local' and 'remote' fields just duplicate some of the fields + # above. They are kept for backwards-compatibility. They can be removed, + # when the control plane has been updated to look at the above fields + # directly. + local: + $ref: "#/components/schemas/LocalTimelineInfo" + remote: + $ref: "#/components/schemas/RemoteTimelineInfo" + + LocalTimelineInfo: + type: object + properties: + ancestor_timeline_id: + type: string + format: hex + ancestor_lsn: + type: string + format: hex + current_logical_size: + type: integer + current_physical_size: + type: integer + RemoteTimelineInfo: + type: object + required: + - remote_consistent_lsn + properties: + remote_consistent_lsn: + type: string + format: hex Error: type: object required: @@ -428,6 +763,20 @@ components: properties: msg: type: string + NotFoundError: + type: object + required: + - msg + properties: + msg: + type: string + ConflictError: + type: object + required: + - msg + properties: + msg: + type: string security: - JWT: [] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b13a45750e..db581efc7d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,52 +1,68 @@ use std::sync::Arc; -use anyhow::{Context, Result}; -use hyper::header; +use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use serde::Serialize; +use remote_storage::GenericRemoteStorage; +use tokio::task::JoinError; use tracing::*; -use zenith_utils::auth::JwtAuth; -use zenith_utils::http::endpoint::attach_openapi_ui; -use zenith_utils::http::endpoint::auth_middleware; -use zenith_utils::http::endpoint::check_permission; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::{ - endpoint, - error::HttpErrorBody, - json::{json_request, json_response}, - request::get_request_param, - request::parse_request_param, + +use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; +use super::models::{ + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, +}; +use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::storage_sync; +use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; +use crate::tenant::{TenantState, Timeline}; +use crate::tenant_config::TenantConfOpt; +use crate::{config::PageServerConf, tenant_mgr}; +use utils::{ + auth::JwtAuth, + http::{ + endpoint::{self, attach_openapi_ui, auth_middleware, check_permission}, + error::{ApiError, HttpErrorBody}, + json::{json_request, json_response}, + request::parse_request_param, + RequestExt, RouterBuilder, + }, + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, }; -use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::{opt_display_serde, ZTimelineId}; -use super::models::BranchCreateRequest; -use super::models::TenantCreateRequest; -use crate::branches::BranchInfo; -use crate::repository::RepositoryTimeline; -use crate::repository::TimelineSyncState; -use crate::{branches, config::PageServerConf, tenant_mgr, 
ZTenantId}; +// Imports only used for testing APIs +#[cfg(feature = "testing")] +use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +#[cfg(feature = "testing")] +use crate::CheckpointConfig; -#[derive(Debug)] struct State { conf: &'static PageServerConf, auth: Option>, + remote_index: RemoteIndex, allowlist_routes: Vec, + remote_storage: Option, } impl State { - fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + fn new( + conf: &'static PageServerConf, + auth: Option>, + remote_index: RemoteIndex, + remote_storage: Option, + ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() .map(|v| v.parse().unwrap()) .collect::>(); - Self { + Ok(Self { conf, auth, allowlist_routes, - } + remote_index, + remote_storage, + }) } } @@ -63,268 +79,806 @@ fn get_config(request: &Request) -> &'static PageServerConf { get_state(request).conf } -// healthcheck handler -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(Response::builder() - .status(StatusCode::OK) - .header(header::CONTENT_TYPE, "application/json") - .body(Body::from("{}")) - .map_err(ApiError::from_err)?) -} +// Helper function to construct a TimelineInfo struct for a timeline +async fn build_timeline_info( + state: &State, + timeline: &Arc, + include_non_incremental_logical_size: bool, + include_non_incremental_physical_size: bool, +) -> anyhow::Result { + let last_record_lsn = timeline.get_last_record_lsn(); + let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { + let guard = timeline.last_received_wal.lock().unwrap(); + if let Some(info) = guard.as_ref() { + ( + Some(info.wal_source_connstr.clone()), + Some(info.last_received_msg_lsn), + Some(info.last_received_msg_ts), + ) + } else { + (None, None, None) + } + }; -async fn branch_create_handler(mut request: Request) -> Result, ApiError> { - let request_data: BranchCreateRequest = json_request(&mut request).await?; - - check_permission(&request, Some(request_data.tenant_id))?; - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered(); - branches::create_branch( - get_config(&request), - &request_data.name, - &request_data.start_point, - &request_data.tenant_id, + let (remote_consistent_lsn, awaits_download) = if let Some(remote_entry) = state + .remote_index + .read() + .await + .timeline_entry(&TenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }) { + ( + Some(remote_entry.metadata.disk_consistent_lsn()), + remote_entry.awaits_download, ) - }) - .await - .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::CREATED, response_data)?) 
+ } else { + (None, false) + }; + + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = match timeline.get_ancestor_lsn() { + Lsn(0) => None, + lsn @ Lsn(_) => Some(lsn), + }; + let current_logical_size = match timeline.get_current_logical_size() { + Ok(size) => Some(size), + Err(err) => { + error!("Timeline info creation failed to get current logical size: {err:?}"); + None + } + }; + let current_physical_size = Some(timeline.get_physical_size()); + let state = timeline.current_state(); + + let info = TimelineInfo { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + last_record_lsn, + prev_record_lsn: Some(timeline.get_prev_record_lsn()), + latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), + current_logical_size, + current_physical_size, + current_logical_size_non_incremental: if include_non_incremental_logical_size { + Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?) + } else { + None + }, + current_physical_size_non_incremental: if include_non_incremental_physical_size { + Some(timeline.get_physical_size_non_incremental()?) + } else { + None + }, + wal_source_connstr, + last_received_msg_lsn, + last_received_msg_ts, + pg_version: timeline.pg_version, + + remote_consistent_lsn, + awaits_download, + state, + + // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility + // with the control plane. + local: LocalTimelineInfo { + ancestor_timeline_id, + ancestor_lsn, + current_logical_size, + current_physical_size, + }, + remote: RemoteTimelineInfo { + remote_consistent_lsn, + }, + }; + Ok(info) } -// Gate non incremental logical size calculation behind a flag -// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines -// and tenants it can take noticeable amount of time. Also the value currently used only in tests -fn get_include_non_incremental_logical_size(request: &Request) -> bool { +// healthcheck handler +async fn status_handler(request: Request) -> Result, ApiError> { + let config = get_config(&request); + json_response(StatusCode::OK, StatusResponse { id: config.id }) +} + +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + let new_timeline_info = async { + match tenant.create_timeline( + request_data.new_timeline_id.map(TimelineId::from), + request_data.ancestor_timeline_id.map(TimelineId::from), + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION) + ).await { + Ok(Some(new_timeline)) => { + // Created. Construct a TimelineInfo for it. 
+ let timeline_info = build_timeline_info(state, &new_timeline, false, false) + .await + .map_err(ApiError::InternalServerError)?; + Ok(Some(timeline_info)) + } + Ok(None) => Ok(None), // timeline already exists + Err(err) => Err(ApiError::InternalServerError(err)), + } + } + .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .await?; + + Ok(match new_timeline_info { + Some(info) => json_response(StatusCode::CREATED, info)?, + None => json_response(StatusCode::CONFLICT, ())?, + }) +} + +async fn timeline_list_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let include_non_incremental_logical_size = + query_param_present(&request, "include-non-incremental-logical-size"); + let include_non_incremental_physical_size = + query_param_present(&request, "include-non-incremental-physical-size"); + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| { + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + Ok(tenant.list_timelines()) + })?; + + let mut response_data = Vec::with_capacity(timelines.len()); + for timeline in timelines { + let timeline_info = build_timeline_info( + state, + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + .await + .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .map_err(ApiError::InternalServerError)?; + + response_data.push(timeline_info); + } + + json_response(StatusCode::OK, response_data) +} + +/// Checks if a query param is present in the request's URL +fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() .map(|v| { url::form_urlencoded::parse(v.as_bytes()) .into_owned() - .any(|(param, _)| param == "include-non-incremental-logical-size") + .any(|(p, _)| p == param) }) .unwrap_or(false) } -async fn branch_list_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - check_permission(&request, Some(tenantid))?; - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_list", tenant = %tenantid).entered(); - crate::branches::get_branches( - get_config(&request), - &tenantid, - include_non_incremental_logical_size, - ) - }) - .await - .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::OK, response_data)?) 
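The include-non-incremental-* flags are detected purely by query-string inspection; a small self-contained sketch (not from this patch) of the `url::form_urlencoded` handling that `query_param_present` and `get_query_param` rely on:

// Standalone illustration of the query-string parsing used by the handlers.
fn main() {
    let query = "include-non-incremental-logical-size=1&timestamp=2022-07-01T12:34:56Z";

    // Presence check, as in query_param_present().
    let present = url::form_urlencoded::parse(query.as_bytes())
        .into_owned()
        .any(|(key, _)| key == "include-non-incremental-logical-size");
    assert!(present);

    // Value lookup, as in get_query_param().
    let timestamp = url::form_urlencoded::parse(query.as_bytes())
        .into_owned()
        .find(|(key, _)| key == "timestamp")
        .map(|(_, value)| value);
    assert_eq!(timestamp.as_deref(), Some("2022-07-01T12:34:56Z"));
}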
-} - -async fn branch_detail_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - let branch_name: String = get_request_param(&request, "branch_name")?.to_string(); - let conf = get_state(&request).conf; - let path = conf.branch_path(&branch_name, &tenantid); - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, &repo, include_non_incremental_logical_size) - }) - .await - .map_err(ApiError::from_err)??; - - Ok(json_response(StatusCode::OK, response_data)?) -} - -async fn timeline_list_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let conf = get_state(&request).conf; - let timelines_dir = conf.timelines_path(&tenant_id); - - let mut timelines_dir_contents = - tokio::fs::read_dir(&timelines_dir).await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })?; - - let mut local_timelines = Vec::new(); - while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })? { - let entry_path = entry.path(); - let entry_type = entry.file_type().await.with_context(|| { - format!( - "Failed to get file type of timeline dirs' entry '{}'", - entry_path.display() - ) - })?; - - if entry_type.is_dir() { - match entry.file_name().to_string_lossy().parse::() { - Ok(timeline_id) => local_timelines.push(timeline_id.to_string()), - Err(e) => error!( - "Failed to get parse timeline id from timeline dirs' entry '{}': {}", - entry_path.display(), - e - ), - } - } - } - - Ok(json_response(StatusCode::OK, local_timelines)?) 
-} - -#[derive(Debug, Serialize)] -#[serde(tag = "type")] -enum TimelineInfo { - Local { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - #[serde(with = "opt_display_serde")] - ancestor_timeline_id: Option, - last_record_lsn: Lsn, - prev_record_lsn: Lsn, - disk_consistent_lsn: Lsn, - timeline_state: Option, - }, - Remote { - #[serde(with = "hex")] - timeline_id: ZTimelineId, - #[serde(with = "hex")] - tenant_id: ZTenantId, - }, +fn get_query_param(request: &Request, param_name: &str) -> Result { + request.uri().query().map_or( + Err(ApiError::BadRequest(anyhow!("empty query in request"))), + |v| { + url::form_urlencoded::parse(v.as_bytes()) + .into_owned() + .find(|(k, _)| k == param_name) + .map_or( + Err(ApiError::BadRequest(anyhow!( + "no {param_name} specified in query parameters" + ))), + |(_, v)| Ok(v), + ) + }, + ) } async fn timeline_detail_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let include_non_incremental_logical_size = + query_param_present(&request, "include-non-incremental-logical-size"); + let include_non_incremental_physical_size = + query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); - let response_data = tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() { - None => TimelineInfo::Remote { - timeline_id, - tenant_id, - }, - Some(timeline) => TimelineInfo::Local { - timeline_id, - tenant_id, - ancestor_timeline_id: timeline.get_ancestor_timeline_id(), - disk_consistent_lsn: timeline.get_disk_consistent_lsn(), - last_record_lsn: timeline.get_last_record_lsn(), - prev_record_lsn: timeline.get_prev_record_lsn(), - timeline_state: repo.get_timeline_state(timeline_id), - }, + let timeline_info = async { + let timeline = tokio::task::spawn_blocking(move || { + tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id, false) }) - }) - .await - .map_err(ApiError::from_err)??; + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; - Ok(json_response(StatusCode::OK, response_data)?) 
+ let timeline = timeline.map_err(ApiError::NotFound)?; + + let timeline_info = build_timeline_info( + state, + &timeline, + include_non_incremental_logical_size, + include_non_incremental_physical_size, + ) + .await + .context("Failed to get local timeline info: {e:#}") + .map_err(ApiError::InternalServerError)?; + + Ok::<_, ApiError>(timeline_info) + } + .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id)) + .await?; + + json_response(StatusCode::OK, timeline_info) } -async fn timeline_attach_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; +async fn get_lsn_by_timestamp_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let timestamp_raw = get_query_param(&request, "timestamp")?; + let timestamp = humantime::parse_rfc3339(timestamp_raw.as_str()) + .with_context(|| format!("Invalid time: {:?}", timestamp_raw)) + .map_err(ApiError::BadRequest)?; + let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); - tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - match repo.get_timeline(timeline_id)? { - RepositoryTimeline::Local(_) => { - anyhow::bail!("Timeline with id {} is already local", timeline_id) - } - RepositoryTimeline::Remote { - id: _, - disk_consistent_lsn: _, - } => { - // FIXME (rodionov) get timeline already schedules timeline for download, and duplicate tasks can cause errors - // first should be fixed in https://github.com/zenithdb/zenith/issues/997 - // TODO (rodionov) change timeline state to awaits download (incapsulate it somewhere in the repo) - // TODO (rodionov) can we safely request replication on the timeline before sync is completed? (can be implemented on top of the #997) + let timeline = tenant_mgr::get_tenant(tenant_id, true) + .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + .map_err(ApiError::NotFound)?; + let result = match timeline + .find_lsn_for_timestamp(timestamp_pg) + .map_err(ApiError::InternalServerError)? + { + LsnForTimestamp::Present(lsn) => format!("{lsn}"), + LsnForTimestamp::Future(_lsn) => "future".into(), + LsnForTimestamp::Past(_lsn) => "past".into(), + LsnForTimestamp::NoData(_lsn) => "nodata".into(), + }; + json_response(StatusCode::OK, result) +} + +// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create +async fn tenant_attach_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + info!("Handling tenant attach {tenant_id}"); + + tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => { + if tenant.list_timelines().is_empty() { + info!("Attaching to tenant {tenant_id} with zero timelines"); Ok(()) + } else { + Err(ApiError::Conflict( + "Tenant is already present locally".to_owned(), + )) } } + Err(_) => Ok(()), }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; - Ok(json_response(StatusCode::ACCEPTED, ())?) 
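For get_lsn_by_timestamp, the `timestamp` query parameter is parsed with `humantime::parse_rfc3339` before being converted to a Postgres timestamp; a minimal sketch (not part of the patch) of the accepted format:

use std::time::{SystemTime, UNIX_EPOCH};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // RFC 3339 UTC ("Z") timestamps are what the handler expects.
    let raw = "2022-07-01T12:34:56Z";
    let parsed: SystemTime = humantime::parse_rfc3339(raw)?;
    let secs = parsed.duration_since(UNIX_EPOCH)?.as_secs();
    println!("{raw} is {secs} seconds after the Unix epoch");
    Ok(())
}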
+ let state = get_state(&request); + let remote_index = &state.remote_index; + + let mut index_accessor = remote_index.write().await; + if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) { + if tenant_entry.has_in_progress_downloads() { + return Err(ApiError::Conflict( + "Tenant download is already in progress".to_string(), + )); + } + + for (timeline_id, remote_timeline) in tenant_entry.iter_mut() { + storage_sync::schedule_layer_download(tenant_id, *timeline_id); + remote_timeline.awaits_download = true; + } + return json_response(StatusCode::ACCEPTED, ()); + } + // no tenant in the index, release the lock to make the potentially lengthy download operation + drop(index_accessor); + + // download index parts for every tenant timeline + let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { + Ok(Some(remote_timelines)) => remote_timelines, + Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))), + Err(e) => { + error!("Failed to retrieve remote tenant data: {:?}", e); + return Err(ApiError::NotFound(anyhow!( + "Failed to retrieve remote tenant" + ))); + } + }; + + // recheck that download is not in progress because + // we've released the lock to avoid holding it during the download + let mut index_accessor = remote_index.write().await; + let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) { + Some(tenant_entry) => { + if tenant_entry.has_in_progress_downloads() { + return Err(ApiError::Conflict( + "Tenant download is already in progress".to_string(), + )); + } + tenant_entry + } + None => index_accessor.add_tenant_entry(tenant_id), + }; + + // populate remote index with the data from index part and create directories on the local filesystem + for (timeline_id, mut remote_timeline) in remote_timelines { + tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) + .await + .context("Failed to create new timeline directory") + .map_err(ApiError::InternalServerError)?; + + remote_timeline.awaits_download = true; + tenant_entry.insert(timeline_id, remote_timeline); + // schedule actual download + storage_sync::schedule_layer_download(tenant_id, timeline_id); + } + + json_response(StatusCode::ACCEPTED, ()) } -async fn timeline_detach_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; +/// Note: is expensive from s3 access perspective, +/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` +async fn gather_tenant_timelines_index_parts( + state: &State, + tenant_id: TenantId, +) -> anyhow::Result>> { + let index_parts = match state.remote_storage.as_ref() { + Some(storage) => { + storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await + } + None => return Ok(None), + } + .with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?; + + let mut remote_timelines = Vec::with_capacity(index_parts.len()); + for (timeline_id, index_part) in index_parts { + let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) + .with_context(|| { + format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}") + })?; + remote_timelines.push((timeline_id, remote_timeline)); + } + Ok(Some(remote_timelines)) +} + +async fn timeline_delete_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = 
parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + tenant_mgr::delete_timeline(tenant_id, timeline_id) + .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id)) + .await + // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both + // user and internal errors. Replace this with better handling once the error type permits + // it. + .map_err(ApiError::InternalServerError)?; - tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; - repo.detach_timeline(timeline_id) - }) - .await - .map_err(ApiError::from_err)??; + let mut remote_index = state.remote_index.write().await; + remote_index.remove_timeline_entry(TenantTimelineId { + tenant_id, + timeline_id, + }); - Ok(json_response(StatusCode::OK, ())?) + json_response(StatusCode::OK, ()) +} + +async fn tenant_detach_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + let conf = state.conf; + tenant_mgr::detach_tenant(conf, tenant_id) + .instrument(info_span!("tenant_detach", tenant = %tenant_id)) + .await + // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. + // Replace this with better handling once the error type permits it. + .map_err(ApiError::InternalServerError)?; + + let mut remote_index = state.remote_index.write().await; + remote_index.remove_tenant_entry(&tenant_id); + + json_response(StatusCode::OK, ()) } async fn tenant_list_handler(request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; + let state = get_state(&request); + // clone to avoid holding the lock while awaiting for blocking task + let remote_index = state.remote_index.read().await.clone(); + let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - crate::tenant_mgr::list_tenants() + crate::tenant_mgr::list_tenant_info(&remote_index) }) .await - .map_err(ApiError::from_err)??; + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?; - Ok(json_response(StatusCode::OK, response_data)?) 
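These handlers repeatedly wrap blocking `tenant_mgr` calls in `tokio::task::spawn_blocking` and map the `JoinError` into an internal server error; a reduced sketch of that pattern (the function name, error type, and body are simplified stand-ins):

use tokio::task::JoinError;

// Simplified stand-in for a handler that must call blocking tenant_mgr code.
async fn list_names() -> Result<Vec<String>, String> {
    tokio::task::spawn_blocking(|| {
        // Pretend this takes std::sync locks or walks the filesystem.
        vec!["tenant-a".to_string(), "tenant-b".to_string()]
    })
    .await
    .map_err(|e: JoinError| format!("blocking task panicked or was cancelled: {e}"))
}

#[tokio::main]
async fn main() {
    println!("{:?}", list_names().await);
}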
+ json_response(StatusCode::OK, response_data) +} + +async fn tenant_status(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + // if tenant is in progress of downloading it can be absent in global tenant map + let tenant = tenant_mgr::get_tenant(tenant_id, false); + + let state = get_state(&request); + let remote_index = &state.remote_index; + + let index_accessor = remote_index.read().await; + let has_in_progress_downloads = index_accessor + .tenant_entry(&tenant_id) + .map(|t| t.has_in_progress_downloads()) + .unwrap_or_else(|| { + info!("Tenant {tenant_id} not found in remote index"); + false + }); + + let (tenant_state, current_physical_size) = match tenant { + Ok(tenant) => { + let timelines = tenant.list_timelines(); + // Calculate total physical size of all timelines + let mut current_physical_size = 0; + for timeline in timelines { + current_physical_size += timeline.get_physical_size(); + } + + (tenant.current_state(), Some(current_physical_size)) + } + Err(e) => { + error!("Failed to get local tenant state: {e:#}"); + if has_in_progress_downloads { + (TenantState::Paused, None) + } else { + (TenantState::Broken, None) + } + } + }; + + json_response( + StatusCode::OK, + TenantInfo { + id: tenant_id, + state: tenant_state, + current_physical_size, + has_in_progress_downloads: Some(has_in_progress_downloads), + }, + ) +} + +async fn tenant_size_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?; + + // this can be long operation, it currently is not backed by any request coalescing or similar + let inputs = tenant + .gather_size_inputs() + .await + .map_err(ApiError::InternalServerError)?; + + let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + + /// Private response type with the additional "unstable" `inputs` field. + /// + /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is + /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. + #[serde_with::serde_as] + #[derive(serde::Serialize)] + struct TenantHistorySize { + #[serde_as(as = "serde_with::DisplayFromStr")] + id: TenantId, + /// Size is a mixture of WAL and logical size, so the unit is bytes. 
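The private TenantHistorySize response relies on serde_with's DisplayFromStr so the id serializes as a JSON string while size stays numeric; a standalone sketch of that pattern (u64 stands in for TenantId here):

use serde::Serialize;

#[serde_with::serde_as]
#[derive(Serialize)]
struct SizeResponse {
    // Serialized via Display/FromStr, so it appears as a JSON string.
    #[serde_as(as = "serde_with::DisplayFromStr")]
    id: u64, // stand-in for TenantId, which serializes via its hex Display impl
    size: u64,
}

fn main() {
    let resp = SizeResponse { id: 42, size: 1024 };
    // Prints {"id":"42","size":1024}
    println!("{}", serde_json::to_string(&resp).unwrap());
}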
+ size: u64, + inputs: crate::tenant::size::ModelInputs, + } + + json_response( + StatusCode::OK, + TenantHistorySize { + id: tenant_id, + size, + inputs, + }, + ) +} + +// Helper function to standardize the error messages we produce on bad durations +// +// Intended to be used with anyhow's `with_context`, e.g.: +// +// let value = result.with_context(bad_duration("name", &value))?; +// +fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String { + move || format!("Cannot parse `{field_name}` duration {value:?}") } async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { - // check for management permission check_permission(&request, None)?; let request_data: TenantCreateRequest = json_request(&mut request).await?; + println!("tenant create: {:?}", request_data.trace_read_requests); + let remote_index = get_state(&request).remote_index.clone(); - tokio::task::spawn_blocking(move || { - let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered(); - tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id) + let mut tenant_conf = TenantConfOpt::default(); + if let Some(gc_period) = request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(&gc_period) + .with_context(bad_duration("gc_period", &gc_period)) + .map_err(ApiError::BadRequest)?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(&pitr_interval) + .with_context(bad_duration("pitr_interval", &pitr_interval)) + .map_err(ApiError::BadRequest)?, + ); + } + + if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(&walreceiver_connect_timeout) + .with_context(bad_duration( + "walreceiver_connect_timeout", + &walreceiver_connect_timeout, + )) + .map_err(ApiError::BadRequest)?, + ); + } + if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(&lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout)) + .map_err(ApiError::BadRequest)?, + ); + } + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + if let Some(trace_read_requests) = request_data.trace_read_requests { + tenant_conf.trace_read_requests = Some(trace_read_requests); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(&checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout)) + .map_err(ApiError::BadRequest)?, + ); + } + + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(&compaction_period) + .with_context(bad_duration("compaction_period", &compaction_period)) + .map_err(ApiError::BadRequest)?, + ); + } + + let target_tenant_id = request_data + .new_tenant_id + .map(TenantId::from) + 
.unwrap_or_else(TenantId::generate); + + let new_tenant_id = tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered(); + let conf = get_config(&request); + + tenant_mgr::create_tenant(conf, tenant_conf, target_tenant_id, remote_index) + // FIXME: `create_tenant` can fail from both user and internal errors. Replace this + // with better error handling once the type permits it + .map_err(ApiError::InternalServerError) }) .await - .map_err(ApiError::from_err)??; - Ok(json_response(StatusCode::CREATED, ())?) + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + + Ok(match new_tenant_id { + Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?, + None => json_response(StatusCode::CONFLICT, ())?, + }) +} + +async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { + let request_data: TenantConfigRequest = json_request(&mut request).await?; + let tenant_id = request_data.tenant_id; + check_permission(&request, Some(tenant_id))?; + + let mut tenant_conf: TenantConfOpt = Default::default(); + if let Some(gc_period) = request_data.gc_period { + tenant_conf.gc_period = Some( + humantime::parse_duration(&gc_period) + .with_context(bad_duration("gc_period", &gc_period)) + .map_err(ApiError::BadRequest)?, + ); + } + tenant_conf.gc_horizon = request_data.gc_horizon; + tenant_conf.image_creation_threshold = request_data.image_creation_threshold; + + if let Some(pitr_interval) = request_data.pitr_interval { + tenant_conf.pitr_interval = Some( + humantime::parse_duration(&pitr_interval) + .with_context(bad_duration("pitr_interval", &pitr_interval)) + .map_err(ApiError::BadRequest)?, + ); + } + if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout { + tenant_conf.walreceiver_connect_timeout = Some( + humantime::parse_duration(&walreceiver_connect_timeout) + .with_context(bad_duration( + "walreceiver_connect_timeout", + &walreceiver_connect_timeout, + )) + .map_err(ApiError::BadRequest)?, + ); + } + if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout { + tenant_conf.lagging_wal_timeout = Some( + humantime::parse_duration(&lagging_wal_timeout) + .with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout)) + .map_err(ApiError::BadRequest)?, + ); + } + if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag { + tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + if let Some(trace_read_requests) = request_data.trace_read_requests { + tenant_conf.trace_read_requests = Some(trace_read_requests); + } + + tenant_conf.checkpoint_distance = request_data.checkpoint_distance; + if let Some(checkpoint_timeout) = request_data.checkpoint_timeout { + tenant_conf.checkpoint_timeout = Some( + humantime::parse_duration(&checkpoint_timeout) + .with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout)) + .map_err(ApiError::BadRequest)?, + ); + } + tenant_conf.compaction_target_size = request_data.compaction_target_size; + tenant_conf.compaction_threshold = request_data.compaction_threshold; + + if let Some(compaction_period) = request_data.compaction_period { + tenant_conf.compaction_period = Some( + humantime::parse_duration(&compaction_period) + .with_context(bad_duration("compaction_period", &compaction_period)) + .map_err(ApiError::BadRequest)?, + ); + } + + tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered(); + + let state = get_state(&request); + 
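Every duration field in tenant_create_handler and tenant_config_handler goes through the same `humantime::parse_duration(..).with_context(bad_duration(..))` step; a self-contained sketch of that pattern (`parse_gc_period` is a made-up wrapper for illustration):

use anyhow::Context;

fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
    move || format!("Cannot parse `{field_name}` duration {value:?}")
}

fn parse_gc_period(raw: &str) -> anyhow::Result<std::time::Duration> {
    humantime::parse_duration(raw).with_context(bad_duration("gc_period", raw))
}

fn main() {
    assert_eq!(
        parse_gc_period("100 s").unwrap(),
        std::time::Duration::from_secs(100)
    );
    // The error message carries both the field name and the offending value.
    println!("{:#}", parse_gc_period("not a duration").unwrap_err());
}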
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id) + // FIXME: `update_tenant_config` can fail because of both user and internal errors. + // Replace this `map_err` with better error handling once the type permits it + .map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + + json_response(StatusCode::OK, ()) +} + +#[cfg(feature = "testing")] +async fn failpoints_handler(mut request: Request) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow!( + "Cannot manage failpoints because pageserver was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = if fp.actions == "exit" { + fail::cfg_callback(fp.name, || { + info!("Exit requested by failpoint"); + std::process::exit(1); + }) + } else { + fail::cfg(fp.name, &fp.actions) + }; + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} + +// Run GC immediately on given timeline. +#[cfg(feature = "testing")] +async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + // FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX + let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?; + let gc_req: TimelineGcRequest = json_request(&mut request).await?; + + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); + + // Use tenant's pitr setting + let pitr = tenant.get_pitr_interval(); + let result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) + .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) + .await + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, result) +} + +// Run compaction immediately on given timeline. +#[cfg(feature = "testing")] +async fn timeline_compact_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + timeline.compact().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +// Run checkpoint immediately on given timeline. 
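// A minimal, hedged sketch of the duration-parsing pattern that the tenant create/config
// handlers above repeat for every human-readable duration field: humantime::parse_duration,
// wrapped with the bad_duration() context helper and surfaced as ApiError::BadRequest.
// The helper name `parse_optional_duration` is hypothetical and not part of this change;
// `bad_duration`, `ApiError` and the humantime/anyhow::Context imports are the ones this
// file already uses.
fn parse_optional_duration(
    field_name: &'static str,
    value: Option<&str>,
) -> Result<Option<std::time::Duration>, ApiError> {
    value
        .map(|v| {
            humantime::parse_duration(v)
                .with_context(bad_duration(field_name, v))
                .map_err(ApiError::BadRequest)
        })
        .transpose()
}
// Example use, mirroring the handlers above (field names come from the request types):
//   tenant_conf.gc_period =
//       parse_optional_duration("gc_period", request_data.gc_period.as_deref())?;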
+#[cfg(feature = "testing")] +async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + timeline + .checkpoint(CheckpointConfig::Forced) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -337,7 +891,9 @@ async fn handler_404(_: Request) -> Result, ApiError> { pub fn make_router( conf: &'static PageServerConf, auth: Option>, -) -> RouterBuilder { + remote_index: RemoteIndex, + remote_storage: Option, +) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); if auth.is_some() { @@ -351,26 +907,67 @@ pub fn make_router( })) } - router - .data(Arc::new(State::new(conf, auth))) + macro_rules! testing_api { + ($handler_desc:literal, $handler:path $(,)?) => {{ + #[cfg(not(feature = "testing"))] + async fn cfg_disabled(_req: Request) -> Result, ApiError> { + Err(ApiError::BadRequest(anyhow!(concat!( + "Cannot ", + $handler_desc, + " because pageserver was compiled without testing APIs", + )))) + } + + #[cfg(feature = "testing")] + let handler = $handler; + #[cfg(not(feature = "testing"))] + let handler = cfg_disabled; + handler + }}; + } + + Ok(router + .data(Arc::new( + State::new(conf, auth, remote_index, remote_storage) + .context("Failed to initialize router state")?, + )) .get("/v1/status", status_handler) - .get("/v1/timeline/:tenant_id", timeline_list_handler) - .get( - "/v1/timeline/:tenant_id/:timeline_id", - timeline_detail_handler, + .put( + "/v1/failpoints", + testing_api!("manage failpoints", failpoints_handler), ) - .post( - "/v1/timeline/:tenant_id/:timeline_id/attach", - timeline_attach_handler, - ) - .post( - "/v1/timeline/:tenant_id/:timeline_id/detach", - timeline_detach_handler, - ) - .get("/v1/branch/:tenant_id", branch_list_handler) - .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler) - .post("/v1/branch", branch_create_handler) .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) - .any(handler_404) + .get("/v1/tenant/:tenant_id", tenant_status) + .get("/v1/tenant/:tenant_id/size", tenant_size_handler) + .put("/v1/tenant/config", tenant_config_handler) + .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) + .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) + .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) + .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_detail_handler, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + get_lsn_by_timestamp_handler, + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", + testing_api!("run timeline GC", timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", + testing_api!("run timeline compaction", timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + ) + .delete( + 
"/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_delete_handler, + ) + .any(handler_404)) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index e317118bb5..642e41765b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -1,8 +1,7 @@ //! //! Import data and WAL from a PostgreSQL data directory and WAL segments into -//! a zenith Timeline. +//! a neon Timeline. //! -use std::fs; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; @@ -10,16 +9,32 @@ use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use tracing::*; +use walkdir::WalkDir; -use crate::relish::*; -use crate::repository::*; +use crate::pgdatadir_mapping::*; +use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::DecodedWALRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; -use postgres_ffi::waldecoder::*; -use postgres_ffi::xlog_utils::*; +use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::ControlFileData; +use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; -use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; -use zenith_utils::lsn::Lsn; +use postgres_ffi::XLogFileName; +use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE}; +use utils::lsn::Lsn; + +// Returns checkpoint LSN from controlfile +pub fn get_lsn_from_controlfile(path: &Path) -> Result { + // Read control file to extract the LSN + let controlfile_path = path.join("global").join("pg_control"); + let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?; + let lsn = controlfile.checkPoint; + + Ok(Lsn(lsn)) +} /// /// Import all relation data pages from local disk into the repository. @@ -28,102 +43,39 @@ use zenith_utils::lsn::Lsn; /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. pub fn import_timeline_from_postgres_datadir( - path: &Path, - writer: &dyn TimelineWriter, - lsn: Lsn, + tline: &Timeline, + pgdata_path: &Path, + pgdata_lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; - // Scan 'global' - for direntry in fs::read_dir(path.join("global"))? { - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, + // TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn) + // Then fishing out pg_control would be unnecessary + let mut modification = tline.begin_modification(pgdata_lsn); + modification.init_empty()?; - Some("pg_control") => { - pg_control = Some(import_control_file(writer, lsn, &direntry.path())?); + // Import all but pg_wal + let all_but_wal = WalkDir::new(pgdata_path) + .into_iter() + .filter_entry(|entry| !entry.path().ends_with("pg_wal")); + for entry in all_but_wal { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + if metadata.is_file() { + let absolute_path = entry.path(); + let relative_path = absolute_path.strip_prefix(pgdata_path)?; + + let file = File::open(absolute_path)?; + let len = metadata.len() as usize; + if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? 
{ + pg_control = Some(control_file); } - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - }, - &direntry.path(), - )?, - - // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::GLOBALTABLESPACE_OID, - 0, - )?, + modification.flush()?; } } - // Scan 'base'. It contains database dirs, the database OID is the filename. - // E.g. 'base/12345', where 12345 is the database OID. - for direntry in fs::read_dir(path.join("base"))? { - let direntry = direntry?; - - //skip all temporary files - if direntry.file_name().to_str().unwrap() == "pgsql_tmp" { - continue; - } - - let dboid = direntry.file_name().to_str().unwrap().parse::()?; - - for direntry in fs::read_dir(direntry.path())? { - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, - - Some("PG_VERSION") => continue, - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode: dboid, - }, - &direntry.path(), - )?, - - // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - )?, - } - } - } - for entry in fs::read_dir(path.join("pg_xact"))? { - let entry = entry?; - import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { - let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { - let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_twophase"))? { - let entry = entry?; - let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?; - import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; - } - // TODO: Scan pg_tblspc - // We're done importing all the data files. - writer.advance_last_record_lsn(lsn); + modification.commit()?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -132,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir( "Postgres cluster was not shut down cleanly" ); ensure!( - pg_control.checkPointCopy.redo == lsn.0, + pg_control.checkPointCopy.redo == pgdata_lsn.0, "unexpected checkpoint REDO pointer" ); @@ -140,56 +92,74 @@ pub fn import_timeline_from_postgres_datadir( // this reads the checkpoint record itself, advancing the tip of the timeline to // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. import_wal( - &path.join("pg_wal"), - writer, + &pgdata_path.join("pg_wal"), + tline, Lsn(pg_control.checkPointCopy.redo), - lsn, + pgdata_lsn, )?; Ok(()) } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_relfile( +fn import_rel( + modification: &mut DatadirModification, path: &Path, - timeline: &dyn TimelineWriter, - lsn: Lsn, spcoid: Oid, dboid: Oid, -) -> Result<()> { + mut reader: Reader, + len: usize, +) -> anyhow::Result<()> { // Does it look like a relation file? 
trace!("importing rel file {}", path.display()); - let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap()); - if let Err(e) = p { + let filename = &path + .file_name() + .expect("missing rel filename") + .to_string_lossy(); + let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| { warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); - return Err(e.into()); - } - let (relnode, forknum, segno) = p.unwrap(); + e + })?; - let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + ensure!(len % BLCKSZ as usize == 0); + let nblocks = len / BLCKSZ as usize; + + let rel = RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode, + forknum, + }; + + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32); + + // Call put_rel_creation for every segment of the relation, + // because there is no guarantee about the order in which we are processing segments. + // ignore "relation already exists" error + if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) { + if e.to_string().contains("already exists") { + debug!("relation {} already exists. we must be extending it", rel); + } else { + return Err(e); + } + } + loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { - let rel = RelTag { - spcnode: spcoid, - dbnode: dboid, - relnode, - forknum, - }; - let tag = RelishTag::Relation(rel); - timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?; + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; } // TODO: UnexpectedEof is expected Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - // FIXME: maybe check that we read the full length of the file? + let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32); + ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -200,90 +170,49 @@ fn import_relfile( blknum += 1; } + // Update relation size + // + // If we process rel segments out of order, + // put_rel_extend will skip the update. + modification.put_rel_extend(rel, blknum)?; + Ok(()) } -/// -/// Import a "non-blocky" file into the repository -/// -/// This is used for small files like the control file, twophase files etc. that -/// are just slurped into the repository as one blob. -/// -fn import_nonrel_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, - tag: RelishTag, - path: &Path, -) -> Result<()> { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing non-rel file {}", path.display()); - - timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?; - Ok(()) -} - -/// -/// Import pg_control file into the repository. -/// -/// The control file is imported as is, but we also extract the checkpoint record -/// from it and store it separated. -fn import_control_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, - path: &Path, -) -> Result { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing control file {}", path.display()); - - // Import it as ControlFile - timeline.put_page_image( - RelishTag::ControlFile, - 0, - lsn, - Bytes::copy_from_slice(&buffer[..]), - )?; - - // Extract the checkpoint record and import it separately. 
- let pg_control = ControlFileData::decode(&buffer)?; - let checkpoint_bytes = pg_control.checkPointCopy.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?; - - Ok(pg_control) -} - -/// /// Import an SLRU segment file /// -fn import_slru_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, +fn import_slru( + modification: &mut DatadirModification, slru: SlruKind, path: &Path, + mut reader: Reader, + len: usize, ) -> Result<()> { - // Does it look like an SLRU file? - let mut file = File::open(path)?; - let mut buf: [u8; 8192] = [0u8; 8192]; - let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?; - trace!("importing slru file {}", path.display()); + let mut buf: [u8; 8192] = [0u8; 8192]; + let filename = &path + .file_name() + .expect("missing slru filename") + .to_string_lossy(); + let segno = u32::from_str_radix(filename, 16)?; + + ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / BLCKSZ as usize; + + ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); + + modification.put_slru_segment_creation(slru, segno, nblocks as u32)?; + let mut rpageno = 0; loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { - timeline.put_page_image( - RelishTag::Slru { slru, segno }, + modification.put_slru_page_image( + slru, + segno, rpageno, - lsn, Bytes::copy_from_slice(&buf), )?; } @@ -292,7 +221,7 @@ fn import_slru_file( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. - // FIXME: maybe check that we read the full length of the file? + ensure!(rpageno == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -301,8 +230,6 @@ fn import_slru_file( }, }; rpageno += 1; - - // TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages } Ok(()) @@ -310,23 +237,18 @@ fn import_slru_file( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( - walpath: &Path, - writer: &dyn TimelineWriter, - startpoint: Lsn, - endpoint: Lsn, -) -> Result<()> { - let mut waldecoder = WalStreamDecoder::new(startpoint); +fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> { + let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); - let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE); - let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); + let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(writer.deref(), startpoint)?; + let mut walingest = WalIngest::new(tline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now - let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); let mut buf = Vec::new(); // Read local file @@ -345,7 +267,7 @@ fn import_wal( } let nread = file.read_to_end(&mut buf)?; - if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize { + if nread != WAL_SEGMENT_SIZE - offset as usize { // Maybe allow this for .partial files? 
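// The import_rel and import_slru functions above share one low-level pattern: read the
// input one BLCKSZ-sized page at a time until UnexpectedEof, hand each page to the
// DatadirModification, and finally check that exactly len / BLCKSZ pages were seen.
// A minimal, hedged sketch of that loop in isolation; `for_each_block` is a hypothetical
// helper name, and the per-block work is left to a caller-supplied closure.
fn for_each_block<R: std::io::Read>(
    mut reader: R,
    nblocks: u32,
    mut on_block: impl FnMut(u32, &[u8]) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
    use std::io::ErrorKind;

    let mut buf = vec![0u8; postgres_ffi::BLCKSZ as usize];
    let mut blknum: u32 = 0;
    loop {
        match reader.read_exact(&mut buf) {
            Ok(()) => on_block(blknum, &buf)?,
            // EOF is the expected way out of the loop; any other error is fatal.
            Err(e) if e.kind() == ErrorKind::UnexpectedEof => break,
            Err(e) => return Err(e.into()),
        }
        blknum += 1;
    }
    // Mirrors the "unexpected EOF" checks above: the input must contain exactly the
    // number of whole pages implied by its length.
    anyhow::ensure!(blknum == nblocks, "unexpected EOF");
    Ok(())
}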
error!("read only {} bytes from WAL file", nread); } @@ -353,9 +275,11 @@ fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; + let mut modification = tline.begin_modification(endpoint); + let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(writer, recdata, lsn)?; + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; last_lsn = lsn; nrecords += 1; @@ -371,10 +295,283 @@ fn import_wal( } if last_lsn != startpoint { - debug!("reached end of WAL at {}", last_lsn); + info!("reached end of WAL at {}", last_lsn); } else { info!("no WAL to import at {}", last_lsn); } Ok(()) } + +pub fn import_basebackup_from_tar( + tline: &Timeline, + reader: Reader, + base_lsn: Lsn, +) -> Result<()> { + info!("importing base at {base_lsn}"); + let mut modification = tline.begin_modification(base_lsn); + modification.init_empty()?; + + let mut pg_control: Option = None; + + // Import base + for base_tar_entry in tar::Archive::new(reader).entries()? { + let entry = base_tar_entry?; + let header = entry.header(); + let len = header.entry_size()? as usize; + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? { + // We found the pg_control file. + pg_control = Some(res); + } + modification.flush()?; + } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + } + _ => { + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); + } + } + } + + // sanity check: ensure that pg_control is loaded + let _pg_control = pg_control.context("pg_control file not found")?; + + modification.commit()?; + Ok(()) +} + +pub fn import_wal_from_tar( + tline: &Timeline, + reader: Reader, + start_lsn: Lsn, + end_lsn: Lsn, +) -> Result<()> { + // Set up walingest mutable state + let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); + let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); + let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); + let mut last_lsn = start_lsn; + let mut walingest = WalIngest::new(tline, start_lsn)?; + + // Ingest wal until end_lsn + info!("importing wal until {}", end_lsn); + let mut pg_wal_tar = tar::Archive::new(reader); + let mut pg_wal_entries_iter = pg_wal_tar.entries()?; + while last_lsn <= end_lsn { + let bytes = { + let entry = pg_wal_entries_iter.next().expect("expected more wal")?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + // FIXME: assume postgresql tli 1 for now + let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); + let file_name = file_path + .file_name() + .expect("missing wal filename") + .to_string_lossy(); + ensure!(expected_filename == file_name); + + debug!("processing wal file {:?}", file_path); + read_all_bytes(entry)? 
+ } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + continue; + } + _ => { + bail!( + "entry {} in WAL tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); + } + } + }; + + waldecoder.feed_bytes(&bytes[offset..]); + + let mut modification = tline.begin_modification(end_lsn); + let mut decoded = DecodedWALRecord::default(); + while last_lsn <= end_lsn { + if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?; + last_lsn = lsn; + + debug!("imported record at {} (end {})", lsn, end_lsn); + } + } + + debug!("imported records up to {}", last_lsn); + segno += 1; + offset = 0; + } + + if last_lsn != start_lsn { + info!("reached end of WAL at {}", last_lsn); + } else { + info!("there was no WAL to import at {}", last_lsn); + } + + // Log any extra unused files + for e in &mut pg_wal_entries_iter { + let entry = e?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + info!("skipping {:?}", file_path); + } + + Ok(()) +} + +fn import_file( + modification: &mut DatadirModification, + file_path: &Path, + reader: Reader, + len: usize, +) -> Result> { + if file_path.starts_with("global") { + let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; + let dbnode = 0; + + match file_path + .file_name() + .expect("missing filename") + .to_string_lossy() + .as_ref() + { + "pg_control" => { + let bytes = read_all_bytes(reader)?; + + // Extract the checkpoint record and import it separately. + let pg_control = ControlFileData::decode(&bytes[..])?; + let checkpoint_bytes = pg_control.checkPointCopy.encode()?; + modification.put_checkpoint(checkpoint_bytes)?; + debug!("imported control file"); + + // Import it as ControlFile + modification.put_control_file(bytes)?; + return Ok(Some(pg_control)); + } + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored PG_VERSION file"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("base") { + let spcnode = pg_constants::DEFAULTTABLESPACE_OID; + let dbnode: u32 = file_path + .iter() + .nth(1) + .expect("invalid file path, expected dbnode") + .to_string_lossy() + .parse()?; + + match file_path + .file_name() + .expect("missing base filename") + .to_string_lossy() + .as_ref() + { + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored PG_VERSION file"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("pg_xact") { + let slru = SlruKind::Clog; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported clog slru"); + } else if file_path.starts_with("pg_multixact/offsets") { + let slru = SlruKind::MultiXactOffsets; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact offsets slru"); + } else if file_path.starts_with("pg_multixact/members") { + let slru = SlruKind::MultiXactMembers; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact members slru"); + } else if file_path.starts_with("pg_twophase") { + let file_name 
= &file_path + .file_name() + .expect("missing twophase filename") + .to_string_lossy(); + let xid = u32::from_str_radix(file_name, 16)?; + + let bytes = read_all_bytes(reader)?; + modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; + debug!("imported twophase file"); + } else if file_path.starts_with("pg_wal") { + debug!("found wal file in base section. ignore it"); + } else if file_path.starts_with("zenith.signal") { + // Parse zenith signal file to set correct previous LSN + let bytes = read_all_bytes(reader)?; + // zenith.signal format is "PREV LSN: prev_lsn" + // TODO write serialization and deserialization in the same place. + let zenith_signal = std::str::from_utf8(&bytes)?.trim(); + let prev_lsn = match zenith_signal { + "PREV LSN: none" => Lsn(0), + "PREV LSN: invalid" => Lsn(0), + other => { + let split = other.split(':').collect::>(); + split[1] + .trim() + .parse::() + .context("can't parse zenith.signal")? + } + }; + + // zenith.signal is not necessarily the last file, that we handle + // but it is ok to call `finish_write()`, because final `modification.commit()` + // will update lsn once more to the final one. + let writer = modification.tline.writer(); + writer.finish_write(prev_lsn); + + debug!("imported zenith signal {}", prev_lsn); + } else if file_path.starts_with("pg_tblspc") { + // TODO Backups exported from neon won't have pg_tblspc, but we will need + // this to import arbitrary postgres databases. + bail!("Importing pg_tblspc is not implemented"); + } else { + debug!( + "ignoring unrecognized file \"{}\" in tar archive", + file_path.display() + ); + } + + Ok(None) +} + +fn read_all_bytes(mut reader: Reader) -> Result { + let mut buf: Vec = vec![]; + reader.read_to_end(&mut buf)?; + Ok(Bytes::copy_from_slice(&buf[..])) +} diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs new file mode 100644 index 0000000000..64024a2d8d --- /dev/null +++ b/pageserver/src/keyspace.rs @@ -0,0 +1,131 @@ +use crate::repository::{key_range_size, singleton_range, Key}; +use postgres_ffi::BLCKSZ; +use std::ops::Range; + +/// +/// Represents a set of Keys, in a compact form. +/// +#[derive(Clone, Debug)] +pub struct KeySpace { + /// Contiguous ranges of keys that belong to the key space. In key order, + /// and with no overlap. + pub ranges: Vec>, +} + +impl KeySpace { + /// + /// Partition a key space into roughly chunks of roughly 'target_size' bytes + /// in each partition. + /// + pub fn partition(&self, target_size: u64) -> KeyPartitioning { + // Assume that each value is 8k in size. + let target_nblocks = (target_size / BLCKSZ as u64) as usize; + + let mut parts = Vec::new(); + let mut current_part = Vec::new(); + let mut current_part_size: usize = 0; + for range in &self.ranges { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, start a new partition. + let this_size = key_range_size(range) as usize; + if current_part_size + this_size > target_nblocks && !current_part.is_empty() { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + + // If the next range is larger than 'target_size', split it into + // 'target_size' chunks. 
+ let mut remain_size = this_size; + let mut start = range.start; + while remain_size > target_nblocks { + let next = start.add(target_nblocks as u32); + parts.push(KeySpace { + ranges: vec![start..next], + }); + start = next; + remain_size -= target_nblocks + } + current_part.push(start..range.end); + current_part_size += remain_size; + } + + // add last partition that wasn't full yet. + if !current_part.is_empty() { + parts.push(KeySpace { + ranges: current_part, + }); + } + + KeyPartitioning { parts } + } +} + +/// +/// Represents a partitioning of the key space. +/// +/// The only kind of partitioning we do is to partition the key space into +/// partitions that are roughly equal in physical size (see KeySpace::partition). +/// But this data structure could represent any partitioning. +/// +#[derive(Clone, Debug, Default)] +pub struct KeyPartitioning { + pub parts: Vec, +} + +impl KeyPartitioning { + pub fn new() -> Self { + KeyPartitioning { parts: Vec::new() } + } +} + +/// +/// A helper object, to collect a set of keys and key ranges into a KeySpace +/// object. This takes care of merging adjacent keys and key ranges into +/// contiguous ranges. +/// +#[derive(Clone, Debug, Default)] +pub struct KeySpaceAccum { + accum: Option>, + + ranges: Vec>, +} + +impl KeySpaceAccum { + pub fn new() -> Self { + Self { + accum: None, + ranges: Vec::new(), + } + } + + pub fn add_key(&mut self, key: Key) { + self.add_range(singleton_range(key)) + } + + pub fn add_range(&mut self, range: Range) { + match self.accum.as_mut() { + Some(accum) => { + if range.start == accum.end { + accum.end = range.end; + } else { + assert!(range.start > accum.end); + self.ranges.push(accum.clone()); + *accum = range; + } + } + None => self.accum = Some(range), + } + } + + pub fn to_keyspace(mut self) -> KeySpace { + if let Some(accum) = self.accum.take() { + self.ranges.push(accum); + } + KeySpace { + ranges: self.ranges, + } + } +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs deleted file mode 100644 index 5dae1902c1..0000000000 --- a/pageserver/src/layered_repository.rs +++ /dev/null @@ -1,2503 +0,0 @@ -//! -//! Zenith repository implementation that keeps old data in files on disk, and -//! the recent changes in memory. See layered_repository/*_layer.rs files. -//! The functions here are responsible for locating the correct layer for the -//! get/put call, tracing timeline branching history as needed. -//! -//! The files are stored in the .zenith/tenants//timelines/ -//! directory. See layered_repository/README for how the files are managed. -//! In addition to the layer files, there is a metadata file in the same -//! directory that contains information about the timeline, in particular its -//! parent timeline, and the last LSN that has been written to disk. -//! 
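// A hedged usage sketch for the two keyspace helpers defined above: KeySpaceAccum
// coalesces keys and ranges (added in ascending order) into contiguous ranges, and
// KeySpace::partition then chops the result into chunks of roughly `target_size` bytes,
// counting one BLCKSZ-sized value per key. The function name `collect_and_partition`
// and the `keys` iterator are hypothetical; Key, KeySpace, KeySpaceAccum and
// KeyPartitioning are the items from this module and crate::repository.
fn collect_and_partition(
    keys: impl Iterator<Item = Key>,
    target_size: u64,
) -> KeyPartitioning {
    let mut accum = KeySpaceAccum::new();
    for key in keys {
        // Adjacent keys are merged into a single Range<Key>; out-of-order keys
        // would trip the assert in add_range().
        accum.add_key(key);
    }
    let keyspace: KeySpace = accum.to_keyspace();
    // E.g. with target_size = 128 MiB, each partition covers about 16384 8 KiB values.
    keyspace.partition(target_size)
}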
- -use anyhow::{bail, ensure, Context, Result}; -use bookfile::Book; -use bytes::Bytes; -use lazy_static::lazy_static; -use postgres_ffi::pg_constants::BLCKSZ; -use tracing::*; - -use std::cmp; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; -use std::fs; -use std::fs::{File, OpenOptions}; -use std::io::Write; -use std::ops::{Bound::Included, Deref}; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool, AtomicUsize}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard}; -use std::time::Instant; - -use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; -use crate::config::PageServerConf; -use crate::page_cache; -use crate::relish::*; -use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; -use crate::repository::{ - BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, - TimelineWriter, ZenithWalRecord, -}; -use crate::thread_mgr; -use crate::virtual_file::VirtualFile; -use crate::walreceiver::IS_WAL_RECEIVER; -use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; -use crate::{ZTenantId, ZTimelineId}; - -use zenith_metrics::{ - register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, -}; -use zenith_metrics::{register_histogram_vec, HistogramVec}; -use zenith_utils::crashsafe_dir; -use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; -use zenith_utils::seqwait::SeqWait; - -mod delta_layer; -mod ephemeral_file; -mod filename; -mod global_layer_map; -mod image_layer; -mod inmemory_layer; -mod interval_tree; -mod layer_map; -pub mod metadata; -mod par_fsync; -mod storage_layer; - -use delta_layer::DeltaLayer; -use ephemeral_file::is_ephemeral_file; -use filename::{DeltaFileName, ImageFileName}; -use image_layer::ImageLayer; -use inmemory_layer::InMemoryLayer; -use layer_map::LayerMap; -use storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, RELISH_SEG_SIZE, -}; - -// re-export this function so that page_cache.rs can use it. -pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; - -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_time", - "Time spent on storage operations", - &["operation"] - ) - .expect("failed to define a metric"); -} - -// Metrics collected on operations on the storage repository. -lazy_static! { - static ref RECONSTRUCT_TIME: Histogram = register_histogram!( - "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations" - ) - .expect("failed to define a metric"); -} - -lazy_static! { - // NOTE: can be zero if pageserver was restarted and there hasn't been any - // activity yet. - static ref LOGICAL_TIMELINE_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_logical_timeline_size", - "Logical timeline size (bytes)", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - -/// Parts of the `.zenith/tenants//timelines/` directory prefix. -pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; - -/// -/// Repository consists of multiple timelines. Keep them in a hash table. -/// -pub struct LayeredRepository { - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelines: Mutex>, - // This mutex prevents creation of new timelines during GC. 
- // Adding yet another mutex (in addition to `timelines`) is needed because holding - // `timelines` mutex during all GC iteration (especially with enforced checkpoint) - // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations - // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn - // timeout... - gc_cs: Mutex<()>, - walredo_mgr: Arc, - /// Makes every timeline to backup their files to remote storage. - upload_relishes: bool, -} - -/// Public interface -impl Repository for LayeredRepository { - fn get_timeline(&self, timelineid: ZTimelineId) -> Result { - let mut timelines = self.timelines.lock().unwrap(); - Ok( - match self.get_or_init_timeline(timelineid, &mut timelines)? { - LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local), - LayeredTimelineEntry::Remote { - id, - disk_consistent_lsn, - } => RepositoryTimeline::Remote { - id, - disk_consistent_lsn, - }, - }, - ) - } - - fn create_empty_timeline( - &self, - timelineid: ZTimelineId, - initdb_lsn: Lsn, - ) -> Result> { - let mut timelines = self.timelines.lock().unwrap(); - - // Create the timeline directory, and write initial metadata to file. - crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; - - let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?; - - let timeline = LayeredTimeline::new( - self.conf, - metadata, - None, - timelineid, - self.tenantid, - Arc::clone(&self.walredo_mgr), - 0, - self.upload_relishes, - ); - - let timeline_rc = Arc::new(timeline); - let r = timelines.insert(timelineid, LayeredTimelineEntry::Local(timeline_rc.clone())); - assert!(r.is_none()); - Ok(timeline_rc) - } - - /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); - - let mut timelines = self.timelines.lock().unwrap(); - let src_timeline = match self.get_or_init_timeline(src, &mut timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. } => { - bail!("Cannot branch off the timeline {} that's not local", src) - } - }; - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); - - src_timeline - .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) - .context("invalid branch start lsn")?; - - let RecordLsn { - last: src_last, - prev: src_prev, - } = src_timeline.get_last_record_rlsn(); - - // Use src_prev from the source timeline only if we branched at the last record. - let dst_prev = if src_last == start_lsn { - Some(src_prev) - } else { - None - }; - - // create a new timeline directory - let timelinedir = self.conf.timeline_path(&dst, &self.tenantid); - - crashsafe_dir::create_dir(&timelinedir)?; - - // Create the metadata file, noting the ancestor of the new timeline. - // There is initially no data in it, but all the read-calls know to look - // into the ancestor. 
- let metadata = TimelineMetadata::new( - start_lsn, - dst_prev, - Some(src), - start_lsn, - *src_timeline.latest_gc_cutoff_lsn.read().unwrap(), - src_timeline.initdb_lsn, - ); - crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; - Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?; - - info!("branched timeline {} from {} at {}", dst, src, start_lsn); - - Ok(()) - } - - /// Public entry point to GC. All the logic is in the private - /// gc_iteration_internal function, this public facade just wraps it for - /// metrics collection. - fn gc_iteration( - &self, - target_timelineid: Option, - horizon: u64, - checkpoint_before_gc: bool, - ) -> Result { - STORAGE_TIME - .with_label_values(&["gc"]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) - } - - fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the - // checkpoint runs. - let timelines = self.timelines.lock().unwrap(); - let timelines_to_checkpoint = timelines - .iter() - .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) - .collect::>(); - drop(timelines); - - for (timelineid, timeline) in &timelines_to_checkpoint { - let _entered = - info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered(); - match timeline { - LayeredTimelineEntry::Local(timeline) => timeline.checkpoint(cconf)?, - LayeredTimelineEntry::Remote { .. } => debug!( - "Cannot run the checkpoint for remote timeline {}", - timelineid - ), - } - } - - Ok(()) - } - - // Detaches the timeline from the repository. - fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { - let mut timelines = self.timelines.lock().unwrap(); - match timelines.entry(timeline_id) { - Entry::Vacant(_) => { - bail!("cannot detach non existing timeline"); - } - Entry::Occupied(mut entry) => { - let timeline_entry = entry.get_mut(); - - let timeline = match timeline_entry { - LayeredTimelineEntry::Remote { .. } => { - bail!("cannot detach remote timeline {}", timeline_id); - } - LayeredTimelineEntry::Local(timeline) => timeline, - }; - - // TODO (rodionov) keep local state in timeline itself (refactoring related to https://github.com/zenithdb/zenith/issues/997 and #1104) - - // FIXME this is local disk consistent lsn, need to keep the latest succesfully uploaded checkpoint lsn in timeline (metadata?) - // https://github.com/zenithdb/zenith/issues/1104 - let remote_disk_consistent_lsn = timeline.disk_consistent_lsn.load(); - // reference to timeline is dropped here - entry.insert(LayeredTimelineEntry::Remote { - id: timeline_id, - disk_consistent_lsn: remote_disk_consistent_lsn, - }); - } - }; - // Release the lock to shutdown and remove the files without holding it - drop(timelines); - // shutdown the timeline (this shuts down the walreceiver) - thread_mgr::shutdown_threads(None, Some(self.tenantid), Some(timeline_id)); - - // remove timeline files (maybe avoid this for ease of debugging if something goes wrong) - fs::remove_dir_all(self.conf.timeline_path(&timeline_id, &self.tenantid))?; - Ok(()) - } - - // TODO this method currentlly does not do anything to prevent (or react to) state updates between a sync task schedule and a sync task end (that causes this update). 
- // Sync task is enqueued and can error and be rescheduled, so some significant time may pass between the events. - // - /// Reacts on the timeline sync state change, changing pageserver's memory state for this timeline (unload or load of the timeline files). - fn set_timeline_state( - &self, - timeline_id: ZTimelineId, - new_state: TimelineSyncState, - ) -> Result<()> { - debug!( - "set_timeline_state: timeline_id: {}, new_state: {:?}", - timeline_id, new_state - ); - let mut timelines_accessor = self.timelines.lock().unwrap(); - - match new_state { - TimelineSyncState::Ready(_) => { - let reloaded_timeline = - self.init_local_timeline(timeline_id, &mut timelines_accessor)?; - timelines_accessor - .insert(timeline_id, LayeredTimelineEntry::Local(reloaded_timeline)); - None - } - TimelineSyncState::Evicted(_) => timelines_accessor.remove(&timeline_id), - TimelineSyncState::AwaitsDownload(disk_consistent_lsn) - | TimelineSyncState::CloudOnly(disk_consistent_lsn) => timelines_accessor.insert( - timeline_id, - LayeredTimelineEntry::Remote { - id: timeline_id, - disk_consistent_lsn, - }, - ), - }; - // NOTE we do not delete local data in case timeline became cloud only, this is performed in detach_timeline - drop(timelines_accessor); - - Ok(()) - } - - /// Layered repo does not store anything but - /// * local, fully loaded timelines, ready for usage - /// * remote timelines, that need a download task scheduled first before they can be used - /// - /// [`TimelineSyncState::Evicted`] and other non-local and non-remote states are not stored in the layered repo at all, - /// hence their statuses cannot be returned by the repo. - fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option { - let timelines_accessor = self.timelines.lock().unwrap(); - let timeline_entry = timelines_accessor.get(&timeline_id)?; - Some( - if timeline_entry - .local_or_schedule_download(self.tenantid) - .is_some() - { - TimelineSyncState::Ready(timeline_entry.disk_consistent_lsn()) - } else { - TimelineSyncState::CloudOnly(timeline_entry.disk_consistent_lsn()) - }, - ) - } -} - -#[derive(Clone)] -enum LayeredTimelineEntry { - Local(Arc), - Remote { - id: ZTimelineId, - /// metadata contents of the latest successfully uploaded checkpoint - disk_consistent_lsn: Lsn, - }, -} - -impl LayeredTimelineEntry { - fn timeline_id(&self) -> ZTimelineId { - match self { - LayeredTimelineEntry::Local(timeline) => timeline.timelineid, - LayeredTimelineEntry::Remote { id, .. } => *id, - } - } - - /// Gets local timeline data, if it's present. Otherwise schedules a download fot the remote timeline and returns `None`. - fn local_or_schedule_download(&self, tenant_id: ZTenantId) -> Option<&LayeredTimeline> { - match self { - Self::Local(local) => Some(local.as_ref()), - Self::Remote { - id: timeline_id, .. - } => { - debug!( - "Accessed a remote timeline {} for tenant {}, scheduling a timeline download", - timeline_id, tenant_id - ); - schedule_timeline_download(tenant_id, *timeline_id); - None - } - } - } - - /// Gets a current (latest for the remote case) disk consistent Lsn for the timeline. - fn disk_consistent_lsn(&self) -> Lsn { - match self { - Self::Local(local) => local.disk_consistent_lsn.load(), - Self::Remote { - disk_consistent_lsn, - .. - } => *disk_consistent_lsn, - } - } -} - -/// Private functions -impl LayeredRepository { - // Implementation of the public `get_timeline` function. This differs from the public - // interface in that the caller must already hold the mutex on the 'timelines' hashmap. 
- fn get_or_init_timeline( - &self, - timelineid: ZTimelineId, - timelines: &mut HashMap, - ) -> Result { - match timelines.get(&timelineid) { - Some(timeline_entry) => { - let _ = timeline_entry.local_or_schedule_download(self.tenantid); - Ok(timeline_entry.clone()) - } - None => { - let timeline = self.init_local_timeline(timelineid, timelines)?; - timelines.insert( - timelineid, - LayeredTimelineEntry::Local(Arc::clone(&timeline)), - ); - Ok(LayeredTimelineEntry::Local(timeline)) - } - } - } - - fn init_local_timeline( - &self, - timelineid: ZTimelineId, - timelines: &mut HashMap, - ) -> anyhow::Result> { - let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid) - .context("failed to load metadata")?; - let disk_consistent_lsn = metadata.disk_consistent_lsn(); - - let ancestor = metadata - .ancestor_timeline() - .map(|ancestor_timelineid| self.get_or_init_timeline(ancestor_timelineid, timelines)) - .transpose()?; - let _enter = - info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) - .entered(); - let mut timeline = LayeredTimeline::new( - self.conf, - metadata, - ancestor, - timelineid, - self.tenantid, - Arc::clone(&self.walredo_mgr), - 0, // init with 0 and update after layers are loaded, - self.upload_relishes, - ); - timeline - .load_layer_map(disk_consistent_lsn) - .context("failed to load layermap")?; - timeline.init_current_logical_size()?; - - Ok(Arc::new(timeline)) - } - - pub fn new( - conf: &'static PageServerConf, - walredo_mgr: Arc, - tenantid: ZTenantId, - upload_relishes: bool, - ) -> LayeredRepository { - LayeredRepository { - tenantid, - conf, - timelines: Mutex::new(HashMap::new()), - gc_cs: Mutex::new(()), - walredo_mgr, - upload_relishes, - } - } - - /// Save timeline metadata to file - fn save_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - data: &TimelineMetadata, - first_save: bool, - ) -> Result<()> { - let _enter = info_span!("saving metadata").entered(); - let path = metadata_path(conf, timelineid, tenantid); - // use OpenOptions to ensure file presence is consistent with first_save - let mut file = VirtualFile::open_with_options( - &path, - OpenOptions::new().write(true).create_new(first_save), - )?; - - let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; - - if file.write(&metadata_bytes)? != metadata_bytes.len() { - bail!("Could not write all the metadata bytes in a single call"); - } - file.sync_all()?; - - // fsync the parent directory to ensure the directory entry is durable - if first_save { - let timeline_dir = File::open( - &path - .parent() - .expect("Metadata should always have a parent dir"), - )?; - timeline_dir.sync_all()?; - } - - Ok(()) - } - - fn load_metadata( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - ) -> Result { - let path = metadata_path(conf, timelineid, tenantid); - info!("loading metadata from {}", path.display()); - let metadata_bytes = std::fs::read(&path)?; - TimelineMetadata::from_bytes(&metadata_bytes) - } - - // - // How garbage collection works: - // - // +--bar-------------> - // / - // +----+-----foo----------------> - // / - // ----main--+--------------------------> - // \ - // +-----baz--------> - // - // - // 1. Grab a mutex to prevent new timelines from being created - // 2. Scan all timelines, and on each timeline, make note of the - // all the points where other timelines have been branched off. - // We will refrain from removing page versions at those LSNs. 
- // 3. For each timeline, scan all layer files on the timeline. - // Remove all files for which a newer file exists and which - // don't cover any branch point LSNs. - // - // TODO: - // - if a relation has a non-incremental persistent layer on a child branch, then we - // don't need to keep that in the parent anymore. But currently - // we do. - fn gc_iteration_internal( - &self, - target_timelineid: Option, - horizon: u64, - checkpoint_before_gc: bool, - ) -> Result { - let mut totals: GcResult = Default::default(); - let now = Instant::now(); - - // grab mutex to prevent new timelines from being created here. - let _gc_cs = self.gc_cs.lock().unwrap(); - - let mut timelines = self.timelines.lock().unwrap(); - - // Scan all timelines. For each timeline, remember the timeline ID and - // the branch point where it was created. - // - let mut timelineids: Vec = Vec::new(); - - // We scan the directory, not the in-memory hash table, because the hash - // table only contains entries for timelines that have been accessed. We - // need to take all timelines into account, not only the active ones. - let timelines_path = self.conf.timelines_path(&self.tenantid); - - for direntry in fs::read_dir(timelines_path)? { - let direntry = direntry?; - if let Some(fname) = direntry.file_name().to_str() { - if let Ok(timelineid) = fname.parse::() { - timelineids.push(timelineid); - } - } - } - - // Now collect info about branchpoints - let mut all_branchpoints: BTreeSet<(ZTimelineId, Lsn)> = BTreeSet::new(); - for &timelineid in &timelineids { - let timeline = match self.get_or_init_timeline(timelineid, &mut timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. } => { - warn!( - "Timeline {} is not local, cannot proceed with gc", - timelineid - ); - return Ok(totals); - } - }; - - if let Some(ancestor_timeline) = &timeline.ancestor_timeline { - let ancestor_timeline = - match ancestor_timeline.local_or_schedule_download(self.tenantid) { - Some(timeline) => timeline, - None => { - warn!( - "Timeline {} has ancestor {} is not local, cannot proceed with gc", - timelineid, - ancestor_timeline.timeline_id() - ); - return Ok(totals); - } - }; - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timelineid { - if ancestor_timeline.timelineid == timelineid { - all_branchpoints - .insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); - } - } - // Collect branchpoints for all timelines - else { - all_branchpoints.insert((ancestor_timeline.timelineid, timeline.ancestor_lsn)); - } - } - } - - // Ok, we now know all the branch points. - // Perform GC for each timeline. - for timelineid in timelineids { - if thread_mgr::is_shutdown_requested() { - // We were requested to shut down. Stop and return with the progress we - // made. - break; - } - - // We have already loaded all timelines above - // so this operation is just a quick map lookup. - let timeline = match self.get_or_init_timeline(timelineid, &mut *timelines)? { - LayeredTimelineEntry::Local(timeline) => timeline, - LayeredTimelineEntry::Remote { .. 
} => { - debug!("Skipping GC for non-local timeline {}", timelineid); - continue; - } - }; - - // If target_timeline is specified, only GC it - if let Some(target_timelineid) = target_timelineid { - if timelineid != target_timelineid { - continue; - } - } - - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - drop(timelines); - let branchpoints: Vec = all_branchpoints - .range(( - Included((timelineid, Lsn(0))), - Included((timelineid, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timelineid); - } - let result = timeline.gc_timeline(branchpoints, cutoff)?; - - totals += result; - timelines = self.timelines.lock().unwrap(); - } - } - - totals.elapsed = now.elapsed(); - Ok(totals) - } -} - -pub struct LayeredTimeline { - conf: &'static PageServerConf, - - tenantid: ZTenantId, - timelineid: ZTimelineId, - - layers: Mutex, - - // WAL redo manager - walredo_mgr: Arc, - - // What page versions do we hold in the repository? If we get a - // request > last_record_lsn, we need to wait until we receive all - // the WAL up to the request. The SeqWait provides functions for - // that. TODO: If we get a request for an old LSN, such that the - // versions have already been garbage collected away, we should - // throw an error, but we don't track that currently. - // - // last_record_lsn.load().last points to the end of last processed WAL record. - // - // We also remember the starting point of the previous record in - // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the - // first WAL record when the node is started up. But here, we just - // keep track of it. - last_record_lsn: SeqWait, - - // All WAL records have been processed and stored durably on files on - // local disk, up to this LSN. On crash and restart, we need to re-process - // the WAL starting from this point. - // - // Some later WAL records might have been processed and also flushed to disk - // already, so don't be surprised to see some, but there's no guarantee on - // them yet. - disk_consistent_lsn: AtomicLsn, - - // Parent timeline that this timeline was branched from, and the LSN - // of the branch point. - ancestor_timeline: Option, - ancestor_lsn: Lsn, - - // this variable indicates how much space is used from user's point of view, - // e.g. we do not account here for multiple versions of data and so on. - // this is counted incrementally based on physical relishes (excluding FileNodeMap) - // current_logical_size is not stored no disk and initialized on timeline creation using - // get_current_logical_size_non_incremental in init_current_logical_size - // this is needed because when we save it in metadata it can become out of sync - // because current_logical_size is consistent on last_record_lsn, not ondisk_consistent_lsn - // NOTE: current_logical_size also includes size of the ancestor - current_logical_size: AtomicUsize, // bytes - - // To avoid calling .with_label_values and formatting the tenant and timeline IDs to strings - // every time the logical size is updated, keep a direct reference to the Gauge here. 
- // unfortunately it doesnt forward atomic methods like .fetch_add - // so use two fields: actual size and metric - // see https://github.com/zenithdb/zenith/issues/622 for discussion - // TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst - // ordering for its operations, but involves private modules, and macro trickery - current_logical_size_gauge: IntGauge, - - /// If `true`, will backup its files that appear after each checkpointing to the remote storage. - upload_relishes: AtomicBool, - - /// Ensures layers aren't frozen by checkpointer between - /// [`LayeredTimeline::get_layer_for_write`] and layer reads. - /// Locked automatically by [`LayeredTimelineWriter`] and checkpointer. - /// Must always be acquired before the layer map/individual layer lock - /// to avoid deadlock. - write_lock: Mutex<()>, - - // Prevent concurrent checkpoints. - // Checkpoints are normally performed by one thread. But checkpoint can also be manually requested by admin - // (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread - // and could be triggered at the same time as a normal checkpoint. - checkpoint_cs: Mutex<()>, - - // Needed to ensure that we can't create a branch at a point that was already garbage collected - latest_gc_cutoff_lsn: RwLock, - - // It may change across major versions so for simplicity - // keep it after running initdb for a timeline. - // It is needed in checks when we want to error on some operations - // when they are requested for pre-initdb lsn. - // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", - // though lets keep them both for better error visibility. - initdb_lsn: Lsn, -} - -/// Public interface functions -impl Timeline for LayeredTimeline { - fn get_ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn - } - - fn get_ancestor_timeline_id(&self) -> Option { - self.ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id) - } - - /// Wait until WAL has been received up to the given LSN. - fn wait_lsn(&self, lsn: Lsn) -> Result<()> { - // This should never be called from the WAL receiver thread, because that could lead - // to a deadlock. - assert!( - !IS_WAL_RECEIVER.with(|c| c.get()), - "wait_lsn called by WAL receiver thread" - ); - - self.last_record_lsn - .wait_for_timeout(lsn, self.conf.wait_lsn_timeout) - .with_context(|| { - format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", - lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() - ) - })?; - - Ok(()) - } - - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard { - self.latest_gc_cutoff_lsn.read().unwrap() - } - - /// Look up given page version. - fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - debug_assert!(lsn <= self.get_last_record_lsn()); - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - RECONSTRUCT_TIME - .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) - } else { - // FIXME: This can happen if PostgreSQL extends a relation but never writes - // the page. See https://github.com/zenithdb/zenith/issues/841 - // - // Would be nice to detect that situation better. 
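            // Editor's note (not part of the original code): the fallback below returns
            // an all-zeros page when the relation itself exists but no image for this
            // block was ever stored. That mirrors PostgreSQL, where a block that was
            // allocated but never written reads back as zeros.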
- if seg.segno > 0 && self.get_rel_exists(rel, lsn)? { - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - - bail!("segment {} not found at {}", rel, lsn); - } - } - - fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { - if !rel.is_blocky() { - bail!( - "invalid get_relish_size request for non-blocky relish {}", - rel - ); - } - debug_assert!(lsn <= self.get_last_record_lsn()); - - let mut segno = 0; - loop { - let seg = SegmentTag { rel, segno }; - - let segsize; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - segsize = layer.get_seg_size(lsn)?; - trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize); - } else { - if segno == 0 { - return Ok(None); - } - segsize = 0; - } - - if segsize != RELISH_SEG_SIZE { - let result = segno * RELISH_SEG_SIZE + segsize; - return Ok(Some(result)); - } - segno += 1; - } - } - - fn get_rel_exists(&self, rel: RelishTag, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - let seg = SegmentTag { rel, segno: 0 }; - - let result; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - result = layer.get_seg_exists(lsn)?; - } else { - result = false; - } - - trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result); - Ok(result) - } - - fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result> { - let request_tag = RelTag { - spcnode, - dbnode, - relnode: 0, - forknum: 0, - }; - - self.list_relishes(Some(request_tag), lsn) - } - - fn list_nonrels(&self, lsn: Lsn) -> Result> { - info!("list_nonrels called at {}", lsn); - - self.list_relishes(None, lsn) - } - - fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - trace!("list_relishes called at {}", lsn); - debug_assert!(lsn <= self.get_last_record_lsn()); - - // List of all relishes along with a flag that marks if they exist at the given lsn. - let mut all_relishes_map: HashMap = HashMap::new(); - let mut result = HashSet::new(); - let mut timeline = self; - - // Iterate through layers back in time and find the most - // recent state of the relish. Don't add relish to the list - // if newer version is already there. - // - // This most recent version can represent dropped or existing relish. - // We will filter dropped relishes below. - // - loop { - let rels = timeline.layers.lock().unwrap().list_relishes(tag, lsn)?; - - for (&new_relish, &new_relish_exists) in rels.iter() { - match all_relishes_map.entry(new_relish) { - Entry::Occupied(o) => { - trace!( - "Newer version of the object {} is already found: exists {}", - new_relish, - o.get(), - ); - } - Entry::Vacant(v) => { - v.insert(new_relish_exists); - trace!( - "Newer version of the object {} NOT found. Insert NEW: exists {}", - new_relish, - new_relish_exists - ); - } - } - } - - match &timeline.ancestor_timeline { - None => break, - Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - timeline = ancestor; - continue; - } - None => bail!("Cannot list relishes for timeline {} tenant {} due to its ancestor being remote only", self.timelineid, self.tenantid), - } - } - } - } - - // Filter out dropped relishes - for (&new_relish, &new_relish_exists) in all_relishes_map.iter() { - if new_relish_exists { - result.insert(new_relish); - trace!("List object {}", new_relish); - } else { - trace!("Filtered out dropped object {}", new_relish); - } - } - - Ok(result) - } - - /// Public entry point for checkpoint(). 
All the logic is in the private - /// checkpoint_internal function, this public facade just wraps it for - /// metrics collection. - fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { - match cconf { - CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint"]) - .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint"]) - .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint"]) - .observe_closure_duration(|| self.checkpoint_internal(distance, true)), - } - } - - /// - /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. - /// - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()> { - ensure!( - lsn >= **latest_gc_cutoff_lsn, - "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", - lsn, - **latest_gc_cutoff_lsn, - ); - Ok(()) - } - - fn get_last_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().last - } - - fn get_prev_record_lsn(&self) -> Lsn { - self.last_record_lsn.load().prev - } - - fn get_last_record_rlsn(&self) -> RecordLsn { - self.last_record_lsn.load() - } - - fn get_current_logical_size(&self) -> usize { - self.current_logical_size.load(atomic::Ordering::Acquire) as usize - } - - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { - let mut total_blocks: usize = 0; - - let _enter = info_span!("calc logical size", %lsn).entered(); - - // list of all relations in this timeline, including ancestor timelines - let all_rels = self.list_rels(0, 0, lsn)?; - - for rel in all_rels { - if let Some(size) = self.get_relish_size(rel, lsn)? { - total_blocks += size as usize; - } - } - - let non_rels = self.list_nonrels(lsn)?; - for non_rel in non_rels { - // TODO support TwoPhase - if matches!(non_rel, RelishTag::Slru { slru: _, segno: _ }) { - if let Some(size) = self.get_relish_size(non_rel, lsn)? { - total_blocks += size as usize; - } - } - } - - Ok(total_blocks * BLCKSZ as usize) - } - - fn get_disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn.load() - } - - fn writer<'a>(&'a self) -> Box { - Box::new(LayeredTimelineWriter { - tl: self, - _write_guard: self.write_lock.lock().unwrap(), - }) - } - - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline { - self - } -} - -impl LayeredTimeline { - /// Open a Timeline handle. - /// - /// Loads the metadata for the timeline into memory, but not the layer map. - #[allow(clippy::too_many_arguments)] - fn new( - conf: &'static PageServerConf, - metadata: TimelineMetadata, - ancestor: Option, - timelineid: ZTimelineId, - tenantid: ZTenantId, - walredo_mgr: Arc, - current_logical_size: usize, - upload_relishes: bool, - ) -> LayeredTimeline { - let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) - .unwrap(); - LayeredTimeline { - conf, - timelineid, - tenantid, - layers: Mutex::new(LayerMap::default()), - - walredo_mgr, - - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
- last_record_lsn: SeqWait::new(RecordLsn { - last: metadata.disk_consistent_lsn(), - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), - current_logical_size: AtomicUsize::new(current_logical_size), - current_logical_size_gauge, - upload_relishes: AtomicBool::new(upload_relishes), - - write_lock: Mutex::new(()), - checkpoint_cs: Mutex::new(()), - - latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), - } - } - - /// - /// Scan the timeline directory to populate the layer map. - /// Returns all timeline-related files that were found and loaded. - /// - fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.lock().unwrap(); - let mut num_layers = 0; - - // Scan timeline directory and create ImageFileName and DeltaFilename - // structs representing all files on disk - let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid); - - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_str().unwrap(); - - if let Some(imgfilename) = ImageFileName::parse_str(fname) { - // create an ImageLayer struct for each image file. - if imgfilename.lsn > disk_consistent_lsn { - warn!( - "found future image layer {} on timeline {} disk_consistent_lsn is {}", - imgfilename, self.timelineid, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - // Create a DeltaLayer struct for each delta file. - ensure!(deltafilename.start_lsn < deltafilename.end_lsn); - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. 
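                // Editor's note, concretely: with disk_consistent_lsn = 100 the check
                // below keeps a delta layer whose end LSN is 101 (101 > 100 + 1 is
                // false) but renames one whose end LSN is 102 out of the way.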
- if deltafilename.end_lsn > disk_consistent_lsn + 1 { - warn!( - "found future delta layer {} on timeline {} disk_consistent_lsn is {}", - deltafilename, self.timelineid, disk_consistent_lsn - ); - - rename_to_backup(direntry.path())?; - continue; - } - - let layer = - DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename); - - trace!("found layer {}", layer.filename().display()); - layers.insert_historic(Arc::new(layer)); - num_layers += 1; - } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { - // ignore these - } else if is_ephemeral_file(fname) { - // Delete any old ephemeral files - trace!("deleting old ephemeral file in timeline dir: {}", fname); - fs::remove_file(direntry.path())?; - } else { - warn!("unrecognized filename in timeline dir: {}", fname); - } - } - - info!("loaded layer map with {} layers", num_layers); - - Ok(()) - } - - /// - /// Used to init current logical size on startup - /// - fn init_current_logical_size(&mut self) -> Result<()> { - if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 { - bail!("cannot init already initialized current logical size") - }; - let lsn = self.get_last_record_lsn(); - self.current_logical_size = - AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?); - trace!( - "current_logical_size initialized to {}", - self.current_logical_size.load(atomic::Ordering::Relaxed) - ); - Ok(()) - } - - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - fn get_layer_for_read( - &self, - seg: SegmentTag, - lsn: Lsn, - ) -> Result, Lsn)>> { - let self_layers = self.layers.lock().unwrap(); - self.get_layer_for_read_locked(seg, lsn, &self_layers) - } - - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - /// This function takes the current timeline's locked LayerMap as an argument, - /// so callers can avoid potential race conditions. - fn get_layer_for_read_locked( - &self, - seg: SegmentTag, - lsn: Lsn, - self_layers: &MutexGuard, - ) -> Result, Lsn)>> { - trace!("get_layer_for_read called for {} at {}", seg, lsn); - - // If you requested a page at an older LSN, before the branch point, dig into - // the right ancestor timeline. This can only happen if you launch a read-only - // node with an old LSN, a primary always uses a recent LSN in its requests. - let mut timeline = self; - let mut lsn = lsn; - - while lsn < timeline.ancestor_lsn { - trace!("going into ancestor {} ", timeline.ancestor_lsn); - timeline = match timeline - .ancestor_timeline - .as_ref() - .and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid)) - { - Some(timeline) => timeline, - None => { - bail!( - "Cannot get the whole layer for read locked: timeline {} is not present locally", - self.timelineid - ) - } - }; - } - - // Now we have the right starting timeline for our search. - loop { - let layers_owned: MutexGuard; - let layers = if self as *const LayeredTimeline != timeline as *const LayeredTimeline { - layers_owned = timeline.layers.lock().unwrap(); - &layers_owned - } else { - self_layers - }; - - // - // FIXME: If the relation has been dropped, does this return the right - // thing? 
The compute node should not normally request dropped relations, - // but if OID wraparound happens the same relfilenode might get reused - // for an unrelated relation. - // - - // Do we have a layer on this timeline? - if let Some(layer) = layers.get(&seg, lsn) { - trace!( - "found layer in cache: {} {}-{}", - timeline.timelineid, - layer.get_start_lsn(), - layer.get_end_lsn() - ); - - assert!(layer.get_start_lsn() <= lsn); - - if layer.is_dropped() && layer.get_end_lsn() <= lsn { - return Ok(None); - } - - return Ok(Some((layer.clone(), lsn))); - } - - // If not, check if there's a layer on the ancestor timeline - match &timeline.ancestor_timeline { - Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - lsn = timeline.ancestor_lsn; - timeline = ancestor; - trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); - continue; - } - None => bail!( - "Cannot get a layer for read from remote ancestor timeline {}", - self.timelineid - ), - } - } - None => return Ok(None), - } - } - } - - /// - /// Get a handle to the latest layer for appending. - /// - fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result> { - let mut layers = self.layers.lock().unwrap(); - - assert!(lsn.is_aligned()); - - let last_record_lsn = self.get_last_record_lsn(); - assert!( - lsn > last_record_lsn, - "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", - lsn, - last_record_lsn, - ); - - // Do we have a layer open for writing already? - let layer; - if let Some(open_layer) = layers.get_open(&seg) { - if open_layer.get_start_lsn() > lsn { - bail!("unexpected open layer in the future"); - } - - // Open layer exists, but it is dropped, so create a new one. - if open_layer.is_dropped() { - assert!(!open_layer.is_writeable()); - // Layer that is created after dropped one represents a new relish segment. - trace!( - "creating layer for write for new relish segment after dropped layer {} at {}/{}", - seg, - self.timelineid, - lsn - ); - - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; - } else { - return Ok(open_layer); - } - } - // No writeable layer for this relation. Create one. - // - // Is this a completely new relation? Or the first modification after branching? - // - else if let Some((prev_layer, _prev_lsn)) = - self.get_layer_for_read_locked(seg, lsn, &layers)? - { - // Create new entry after the previous one. - let start_lsn; - if prev_layer.get_timeline_id() != self.timelineid { - // First modification on this timeline - start_lsn = self.ancestor_lsn + 1; - trace!( - "creating layer for write for {} at branch point {}", - seg, - start_lsn - ); - } else { - start_lsn = prev_layer.get_end_lsn(); - trace!( - "creating layer for write for {} after previous layer {}", - seg, - start_lsn - ); - } - trace!( - "prev layer is at {}/{} - {}", - prev_layer.get_timeline_id(), - prev_layer.get_start_lsn(), - prev_layer.get_end_lsn() - ); - layer = InMemoryLayer::create_successor_layer( - self.conf, - prev_layer, - self.timelineid, - self.tenantid, - start_lsn, - last_record_lsn, - )?; - } else { - // New relation. 
- trace!( - "creating layer for write for new rel {} at {}/{}", - seg, - self.timelineid, - lsn - ); - - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; - } - - let layer_rc: Arc = Arc::new(layer); - layers.insert_open(Arc::clone(&layer_rc)); - - Ok(layer_rc) - } - - /// - /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. - fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { - // Prevent concurrent checkpoints - let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - - let write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); - - // Bump the generation number in the layer map, so that we can distinguish - // entries inserted after the checkpoint started - let current_generation = layers.increment_generation(); - - let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - - trace!("checkpoint starting at {}", last_record_lsn); - - // Take the in-memory layer with the oldest WAL record. If it's older - // than the threshold, write it out to disk as a new image and delta file. - // Repeat until all remaining in-memory layers are within the threshold. - // - // That's necessary to limit the amount of WAL that needs to be kept - // in the safekeepers, and that needs to be reprocessed on page server - // crash. TODO: It's not a great policy for keeping memory usage in - // check, though. We should also aim at flushing layers that consume - // a lot of memory and/or aren't receiving much updates anymore. - let mut disk_consistent_lsn = last_record_lsn; - - let mut layer_paths = Vec::new(); - let mut freeze_end_lsn = Lsn(0); - let mut evicted_layers = Vec::new(); - - // - // Determine which layers we need to evict and calculate max(latest_lsn) - // among those layers. - // - while let Some((oldest_layer_id, oldest_layer, oldest_generation)) = - layers.peek_oldest_open() - { - let oldest_lsn = oldest_layer.get_oldest_lsn(); - // Does this layer need freezing? - // - // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE. - // If we reach a layer with the same - // generation number, we know that we have cycled through all layers that were open - // when we started. We don't want to process layers inserted after we started, to - // avoid getting into an infinite loop trying to process again entries that we - // inserted ourselves. - // - // Once we have decided to write out at least one layer, we must also write out - // any other layers that contain WAL older than the end LSN of the layers we have - // already decided to write out. In other words, we must write out all layers - // whose [oldest_lsn, latest_lsn) range overlaps with any of the other layers - // that we are writing out. Otherwise, when we advance 'disk_consistent_lsn', it's - // ambiguous whether those layers are already durable on disk or not. For example, - // imagine that there are two layers in memory that contain page versions in the - // following LSN ranges: - // - // A: 100-150 - // B: 110-200 - // - // If we flush layer A, we must also flush layer B, because they overlap. If we - // flushed only A, and advanced 'disk_consistent_lsn' to 150, we would break the - // rule that all WAL older than 'disk_consistent_lsn' are durable on disk, because - // B contains some WAL older than 150. 
On the other hand, if we flushed out A and - // advanced 'disk_consistent_lsn' only up to 110, after crash and restart we would - // delete the first layer because its end LSN is larger than 110. If we changed - // the deletion logic to not delete it, then we would start streaming at 110, and - // process again the WAL records in the range 110-150 that are already in layer A, - // and the WAL processing code does not cope with that. We solve that dilemma by - // insisting that if we write out the first layer, we also write out the second - // layer, and advance disk_consistent_lsn all the way up to 200. - // - let distance = last_record_lsn.widening_sub(oldest_lsn); - if (distance < 0 - || distance < checkpoint_distance.into() - || oldest_generation == current_generation) - && oldest_lsn >= freeze_end_lsn - // this layer intersects with evicted layer and so also need to be evicted - { - info!( - "the oldest layer is now {} which is {} bytes behind last_record_lsn", - oldest_layer.filename().display(), - distance - ); - disk_consistent_lsn = oldest_lsn; - break; - } - let latest_lsn = oldest_layer.get_latest_lsn(); - if latest_lsn > freeze_end_lsn { - freeze_end_lsn = latest_lsn; // calculate max of latest_lsn of the layers we're about to evict - } - layers.remove_open(oldest_layer_id); - evicted_layers.push((oldest_layer_id, oldest_layer)); - } - - // Freeze evicted layers - for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() { - // Mark the layer as no longer accepting writes and record the end_lsn. - // This happens in-place, no new layers are created now. - evicted_layer.freeze(freeze_end_lsn); - layers.insert_historic(evicted_layer.clone()); - } - - // Call unload() on all frozen layers, to release memory. - // This shouldn't be much memory, as only metadata is slurped - // into memory. - for layer in layers.iter_historic_layers() { - layer.unload()?; - } - - drop(layers); - drop(write_guard); - - // Create delta/image layers for evicted layers - for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() { - let mut this_layer_paths = - self.evict_layer(evicted_layer.clone(), reconstruct_pages)?; - layer_paths.append(&mut this_layer_paths); - } - - // Sync layers - if !layer_paths.is_empty() { - // We must fsync the timeline dir to ensure the directory entries for - // new layer files are durable - layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); - } - - // If we were able to advance 'disk_consistent_lsn', save it the metadata file. - // After crash, we will restart WAL streaming and processing from that point. - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - - // We can only save a valid 'prev_record_lsn' value on disk if we - // flushed *all* in-memory changes to disk. We only track - // 'prev_record_lsn' in memory for the latest processed record, so we - // don't remember what the correct value that corresponds to some old - // LSN is. But if we flush everything, then the value corresponding - // current 'last_record_lsn' is correct and we can store it on disk. 
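            // Editor's note, for example: if only layers up to LSN 150 were flushed
            // while last_record_lsn is 200, we store None here; a later checkpoint
            // that flushes everything up to last_record_lsn stores a real value again.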
- let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { - Some(prev_record_lsn) - } else { - None - }; - - let ancestor_timelineid = self - .ancestor_timeline - .as_ref() - .map(LayeredTimelineEntry::timeline_id); - - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - ondisk_prev_record_lsn, - ancestor_timelineid, - self.ancestor_lsn, - *self.latest_gc_cutoff_lsn.read().unwrap(), - self.initdb_lsn, - ); - - LayeredRepository::save_metadata( - self.conf, - self.timelineid, - self.tenantid, - &metadata, - false, - )?; - if self.upload_relishes.load(atomic::Ordering::Relaxed) { - schedule_timeline_checkpoint_upload( - self.tenantid, - self.timelineid, - layer_paths, - metadata, - ); - } - - // Also update the in-memory copy - self.disk_consistent_lsn.store(disk_consistent_lsn); - } - - Ok(()) - } - - fn evict_layer( - &self, - layer: Arc, - reconstruct_pages: bool, - ) -> Result> { - let new_historics = layer.write_to_disk(self, reconstruct_pages)?; - - let mut layer_paths = Vec::new(); - let _write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); - - // Finally, replace the frozen in-memory layer with the new on-disk layers - layers.remove_historic(layer); - - // Add the historics to the LayerMap - for delta_layer in new_historics.delta_layers { - layer_paths.push(delta_layer.path()); - layers.insert_historic(Arc::new(delta_layer)); - } - for image_layer in new_historics.image_layers { - layer_paths.push(image_layer.path()); - layers.insert_historic(Arc::new(image_layer)); - } - Ok(layer_paths) - } - - /// - /// Garbage collect layer files on a timeline that are no longer needed. - /// - /// The caller specifies how much history is needed with the two arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff: also keep everything newer than this LSN - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// - /// The 'cutoff' point is used to retain recent versions that might still be - /// needed by read-only nodes. (As of this writing, the caller just passes - /// the latest LSN subtracted by a constant, and doesn't do anything smart - /// to figure out what read-only nodes might actually need.) - /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// - pub fn gc_timeline(&self, retain_lsns: Vec, cutoff: Lsn) -> Result { - let now = Instant::now(); - let mut result: GcResult = Default::default(); - let disk_consistent_lsn = self.get_disk_consistent_lsn(); - let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - - let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); - - // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. - // See branch_timeline() for details. - *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff; - - info!("GC starting"); - - debug!("retain_lsns: {:?}", retain_lsns); - - let mut layers_to_remove: Vec> = Vec::new(); - - // Scan all on-disk layers in the timeline. - // - // Garbage collect the layer if all conditions are satisfied: - // 1. it is older than cutoff LSN; - // 2. 
it doesn't need to be retained for 'retain_lsns'; - // 3. newer on-disk layer exists (only for non-dropped segments); - // 4. this layer doesn't serve as a tombstone for some older layer; - // - let mut layers = self.layers.lock().unwrap(); - 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. - if l.is_in_memory() { - continue; - } - - let seg = l.get_seg_tag(); - - if seg.rel.is_relation() { - result.ondisk_relfiles_total += 1; - } else { - result.ondisk_nonrelfiles_total += 1; - } - - // 1. Is it newer than cutoff point? - if l.get_end_lsn() > cutoff { - info!( - "keeping {} {}-{} because it's newer than cutoff {}", - seg, - l.get_start_lsn(), - l.get_end_lsn(), - cutoff - ); - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_by_cutoff += 1; - } else { - result.ondisk_nonrelfiles_needed_by_cutoff += 1; - } - continue 'outer; - } - - // 2. Is it needed by a child branch? - // NOTE With that wee would keep data that - // might be referenced by child branches forever. - // We can track this in child timeline GC and delete parent layers when - // they are no longer needed. This might be complicated with long inheritance chains. - for retain_lsn in &retain_lsns { - // start_lsn is inclusive - if &l.get_start_lsn() <= retain_lsn { - info!( - "keeping {} {}-{} because it's still might be referenced by child branch forked at {} is_dropped: {} is_incremental: {}", - seg, - l.get_start_lsn(), - l.get_end_lsn(), - retain_lsn, - l.is_dropped(), - l.is_incremental(), - ); - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_by_branches += 1; - } else { - result.ondisk_nonrelfiles_needed_by_branches += 1; - } - continue 'outer; - } - } - - // 3. Is there a later on-disk layer for this relation? - if !l.is_dropped() - && !layers.newer_image_layer_exists( - l.get_seg_tag(), - l.get_end_lsn(), - disk_consistent_lsn, - ) - { - info!( - "keeping {} {}-{} because it is the latest layer", - seg, - l.get_start_lsn(), - l.get_end_lsn() - ); - if seg.rel.is_relation() { - result.ondisk_relfiles_not_updated += 1; - } else { - result.ondisk_nonrelfiles_not_updated += 1; - } - continue 'outer; - } - - // 4. Does this layer serve as a tombstone for some older layer? - if l.is_dropped() { - let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap(); - - // Check if this layer serves as a tombstone for this timeline - // We have to do this separately from timeline check below, - // because LayerMap of this timeline is already locked. 
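                // Editor's note: in other words, a dropped layer must be kept as long
                // as some older layer (on this timeline or an ancestor) still covers
                // the same segment, so that the drop itself is not forgotten when that
                // older data is read.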
- let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn)?; - if is_tombstone { - info!( - "earlier layer exists at {} in {}", - prior_lsn, self.timelineid - ); - } - // Now check ancestor timelines, if any are present locally - else if let Some(ancestor) = - self.ancestor_timeline.as_ref().and_then(|timeline_entry| { - timeline_entry.local_or_schedule_download(self.tenantid) - }) - { - let prior_lsn = ancestor.get_last_record_lsn(); - if seg.rel.is_blocky() { - info!( - "check blocky relish size {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { - Some(size) => { - let (last_live_seg, _rel_blknum) = - SegmentTag::from_blknum(seg.rel, size - 1); - info!( - "blocky rel size is {} last_live_seg.segno {} seg.segno {}", - size, last_live_seg.segno, seg.segno - ); - if last_live_seg.segno >= seg.segno { - is_tombstone = true; - } - } - _ => { - info!("blocky rel doesn't exist"); - } - } - } else { - info!( - "check non-blocky relish existence {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false); - } - } - - if is_tombstone { - info!( - "keeping {} {}-{} because this layer serves as a tombstone for older layer", - seg, - l.get_start_lsn(), - l.get_end_lsn() - ); - - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_as_tombstone += 1; - } else { - result.ondisk_nonrelfiles_needed_as_tombstone += 1; - } - continue 'outer; - } - } - - // We didn't find any reason to keep this file, so remove it. - info!( - "garbage collecting {} {}-{} is_dropped: {} is_incremental: {}", - l.get_seg_tag(), - l.get_start_lsn(), - l.get_end_lsn(), - l.is_dropped(), - l.is_incremental(), - ); - layers_to_remove.push(Arc::clone(&l)); - } - - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - for doomed_layer in layers_to_remove { - doomed_layer.delete()?; - layers.remove_historic(doomed_layer.clone()); - - match ( - doomed_layer.is_dropped(), - doomed_layer.get_seg_tag().rel.is_relation(), - ) { - (true, true) => result.ondisk_relfiles_dropped += 1, - (true, false) => result.ondisk_nonrelfiles_dropped += 1, - (false, true) => result.ondisk_relfiles_removed += 1, - (false, false) => result.ondisk_nonrelfiles_removed += 1, - } - } - - result.elapsed = now.elapsed(); - Ok(result) - } - - fn lookup_cached_page( - &self, - rel: &RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - if let RelishTag::Relation(rel_tag) = &rel { - let (lsn, read_guard) = cache.lookup_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - lsn, - )?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } else { - None - } - } - - /// - /// Reconstruct a page version from given Layer - /// - fn materialize_page( - &self, - seg: SegmentTag, - seg_blknum: SegmentBlk, - lsn: Lsn, - layer: &dyn Layer, - ) -> Result { - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. 
The cached image can also be used to reduce the amount of WAL needed - // for redo. - let rel = seg.rel; - let rel_blknum = seg.segno * RELISH_SEG_SIZE + seg_blknum; - let cached_page_img = match self.lookup_cached_page(&rel, rel_blknum, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut data = PageReconstructData { - records: Vec::new(), - page_img: cached_page_img, - }; - - // Holds an Arc reference to 'layer_ref' when iterating in the loop below. - let mut layer_arc: Arc; - - // Call the layer's get_page_reconstruct_data function to get the base image - // and WAL records needed to materialize the page. If it returns 'Continue', - // call it again on the predecessor layer until we have all the required data. - let mut layer_ref = layer; - let mut curr_lsn = lsn; - loop { - let result = layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - })?; - match result { - PageReconstructResult::Complete => break, - PageReconstructResult::Continue(cont_lsn) => { - // Fetch base image / more WAL from the returned predecessor layer - if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? { - if cont_lsn == curr_lsn { - // We landed on the same layer again. Shouldn't happen, but if it does, - // don't get stuck in an infinite loop. - bail!( - "could not find predecessor of layer {} at {}, layer returned its own LSN", - layer_ref.filename().display(), - cont_lsn - ); - } - layer_arc = cont_layer; - layer_ref = &*layer_arc; - curr_lsn = cont_lsn; - continue; - } else { - bail!( - "could not find predecessor of layer {} at {}", - layer_ref.filename().display(), - cont_lsn - ); - } - } - PageReconstructResult::Missing(lsn) => { - // Oops, we could not reconstruct the page. - if data.records.is_empty() { - // no records, and no base image. This can happen if PostgreSQL extends a relation - // but never writes the page. - // - // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - bail!( - "No base image found for page {} blk {} at {}/{}", - rel, - rel_blknum, - self.timelineid, - lsn, - ); - } - } - } - - self.reconstruct_page(rel, rel_blknum, lsn, data) - } - - /// - /// Reconstruct a page version, using the given base image and WAL records in 'data'. - /// - fn reconstruct_page( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, - request_lsn: Lsn, - mut data: PageReconstructData, - ) -> Result { - // Perform WAL redo if needed - data.records.reverse(); - - // If we have a page image, and no WAL, we're all set - if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.page_img { - trace!( - "found page image for blk {} in {} at {}, no WAL redo required", - rel_blknum, - rel, - img_lsn - ); - Ok(img.clone()) - } else { - // FIXME: this ought to be an error? - warn!( - "Page {} blk {} at {} not found", - rel, rel_blknum, request_lsn - ); - Ok(ZERO_PAGE.clone()) - } - } else { - // We need to do WAL redo. 
- // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if data.page_img.is_none() && !data.records.first().unwrap().1.will_init() { - // FIXME: this ought to be an error? - warn!( - "Base image for page {}/{} at {} not found, but got {} WAL records", - rel, - rel_blknum, - request_lsn, - data.records.len() - ); - Ok(ZERO_PAGE.clone()) - } else { - let base_img = if let Some((_lsn, img)) = data.page_img { - trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); - Some(img) - } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); - None - }; - - let last_rec_lsn = data.records.last().unwrap().0; - - let img = self.walredo_mgr.request_redo( - rel, - rel_blknum, - request_lsn, - base_img, - data.records, - )?; - - if let RelishTag::Relation(rel_tag) = &rel { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - last_rec_lsn, - &img, - ); - } - - Ok(img) - } - } - } - - /// - /// This is a helper function to increase current_total_relation_size - /// - fn increase_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_add(diff as usize, atomic::Ordering::SeqCst); - trace!( - "increase_current_logical_size: {} + {} = {}", - val, - diff, - val + diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 + diff as i64); - } - - /// - /// This is a helper function to decrease current_total_relation_size - /// - fn decrease_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_sub(diff as usize, atomic::Ordering::SeqCst); - trace!( - "decrease_current_logical_size: {} - {} = {}", - val, - diff, - val - diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 - diff as i64); - } -} - -struct LayeredTimelineWriter<'a> { - tl: &'a LayeredTimeline, - _write_guard: MutexGuard<'a, ()>, -} - -impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; - - fn deref(&self) -> &Self::Target { - self.tl - } -} - -impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { - fn put_wal_record( - &self, - lsn: Lsn, - rel: RelishTag, - rel_blknum: u32, - rec: ZenithWalRecord, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_wal_record(lsn, seg_blknum, rec)?; - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) - } - - fn put_page_image( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - img: Bytes, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_page_image(seg_blknum, lsn, img)?; - - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) - } - - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, 
relsize: BlockNumber) -> Result<()> { - if !rel.is_blocky() { - bail!("invalid truncation for non-blocky relish {}", rel); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); - - let oldsize = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? - .with_context(|| { - format!( - "attempted to truncate non-existent relish {} at {}", - rel, lsn - ) - })?; - - if oldsize <= relsize { - return Ok(()); - } - let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE; - - let last_remain_seg = if relsize == 0 { - 0 - } else { - (relsize - 1) / RELISH_SEG_SIZE - }; - - // Drop segments beyond the last remaining segment. - for remove_segno in (last_remain_seg + 1)..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - // Truncate the last remaining segment to the specified size - if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 { - let seg = SegmentTag { - rel, - segno: last_remain_seg, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE) - } - self.tl - .decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32); - Ok(()) - } - - fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { - trace!("drop_segment: {} at {}", rel, lsn); - - if rel.is_blocky() { - if let Some(oldsize) = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? - { - let old_last_seg = if oldsize == 0 { - 0 - } else { - (oldsize - 1) / RELISH_SEG_SIZE - }; - - // Drop all segments of the relish - for remove_segno in 0..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - self.tl - .decrease_current_logical_size(oldsize * BLCKSZ as u32); - } else { - warn!( - "drop_segment called on non-existent relish {} at {}", - rel, lsn - ); - } - } else { - // TODO handle TwoPhase relishes - let (seg, _seg_blknum) = SegmentTag::from_blknum(rel, 0); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - Ok(()) - } - - /// - /// Remember the (end of) last valid WAL record remembered in the timeline. - /// - fn advance_last_record_lsn(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.tl.last_record_lsn.advance(new_lsn); - } -} - -/// Dump contents of a layer file to stdout. -pub fn dump_layerfile_from_path(path: &Path) -> Result<()> { - let file = File::open(path)?; - let book = Book::new(file)?; - - match book.magic() { - delta_layer::DELTA_FILE_MAGIC => { - DeltaLayer::new_for_path(path, &book)?.dump()?; - } - image_layer::IMAGE_FILE_MAGIC => { - ImageLayer::new_for_path(path, &book)?.dump()?; - } - magic => bail!("unrecognized magic identifier: {:?}", magic), - } - - Ok(()) -} - -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { - let filename = path.file_name().unwrap().to_str().unwrap(); - let mut new_path = path.clone(); - - for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); - if !new_path.exists() { - std::fs::rename(&path, &new_path)?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - -/// -/// Tests that are specific to the layered storage format. 
-/// -/// There are more unit tests in repository.rs that work through the -/// Repository interface and are expected to work regardless of the -/// file format and directory layout. The test here are more low level. -/// -#[cfg(test)] -mod tests { - use super::*; - use crate::repository::repo_harness::*; - - #[test] - fn corrupt_metadata() -> Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); - - repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - drop(repo); - - let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); - - assert!(metadata_path.is_file()); - - let mut metadata_bytes = std::fs::read(&metadata_path)?; - assert_eq!(metadata_bytes.len(), 512); - metadata_bytes[512 - 4 - 2] ^= 1; - std::fs::write(metadata_path, metadata_bytes)?; - - let new_repo = harness.load(); - let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap(); - assert_eq!(err.to_string(), "failed to load metadata"); - assert_eq!( - err.source().unwrap().to_string(), - "metadata checksum mismatch" - ); - - Ok(()) - } - - /// - /// Test the logic in 'load_layer_map' that removes layer files that are - /// newer than 'disk_consistent_lsn'. - /// - #[test] - fn future_layerfiles() -> Result<()> { - const TEST_NAME: &str = "future_layerfiles"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); - - // Create a timeline with disk_consistent_lsn = 8000 - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; - let writer = tline.writer(); - writer.advance_last_record_lsn(Lsn(0x8000)); - drop(writer); - repo.checkpoint_iteration(CheckpointConfig::Forced)?; - drop(repo); - - let timeline_path = harness.timeline_path(&TIMELINE_ID); - - let make_empty_file = |filename: &str| -> std::io::Result<()> { - let path = timeline_path.join(filename); - - assert!(!path.exists()); - std::fs::write(&path, &[])?; - - Ok(()) - }; - - // Helper function to check that a relation file exists, and a corresponding - // .0.old file does not. - let assert_exists = |filename: &str| { - let path = timeline_path.join(filename); - assert!(path.exists(), "file {} was removed", filename); - - // Check that there is no .old file - let backup_path = timeline_path.join(format!("{}.0.old", filename)); - assert!( - !backup_path.exists(), - "unexpected backup file {}", - backup_path.display() - ); - }; - - // Helper function to check that a relation file does *not* exists, and a corresponding - // ..old file does. - let assert_is_renamed = |filename: &str, num: u32| { - let path = timeline_path.join(filename); - assert!( - !path.exists(), - "file {} was not removed as expected", - filename - ); - - let backup_path = timeline_path.join(format!("{}.{}.old", filename, num)); - assert!( - backup_path.exists(), - "backup file {} was not created", - backup_path.display() - ); - }; - - // These files are considered to be in the future and will be renamed out - // of the way - let future_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8001), - format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008), - ]; - // But these are not: - let past_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8000), - format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001), - ]; - - for filename in future_filenames.iter().chain(past_filenames.iter()) { - make_empty_file(filename)?; - } - - // Load the timeline. This will cause the files in the "future" to be renamed - // away. 
- let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); - drop(new_repo); - - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - } - for filename in past_filenames.iter() { - assert_exists(filename); - } - - // Create the future files again, and load again. They should be renamed to - // *.1.old this time. - for filename in future_filenames.iter() { - make_empty_file(filename)?; - } - - let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); - drop(new_repo); - - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - assert_is_renamed(filename, 1); - } - for filename in past_filenames.iter() { - assert_exists(filename); - } - - Ok(()) - } -} diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs deleted file mode 100644 index 7434b8de11..0000000000 --- a/pageserver/src/layered_repository/delta_layer.rs +++ /dev/null @@ -1,704 +0,0 @@ -//! -//! A DeltaLayer represents a collection of WAL records or page images in a range of -//! LSNs, for one segment. It is stored on a file on disk. -//! -//! Usually a delta layer only contains differences - in the form of WAL records against -//! a base LSN. However, if a segment is newly created, by creating a new relation or -//! extending an old one, there might be no base image. In that case, all the entries in -//! the delta layer must be page images or WAL records with the 'will_init' flag set, so -//! that they can be replayed without referring to an older page version. Also in some -//! circumstances, the predecessor layer might actually be another delta layer. That -//! can happen when you create a new branch in the middle of a delta layer, and the WAL -//! records on the new branch are put in a new delta layer. -//! -//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters -//! into memory, into the DeltaLayerInner struct. See load() and unload() functions. -//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN. -//! The byte ranges in the metadata can be used to find the page/WAL record in -//! PAGE_VERSIONS_CHAPTER. -//! -//! On disk, the delta files are stored in timelines/ directory. -//! Currently, there are no subdirectories, and each delta file is named like this: -//! -//! ______ -//! -//! For example: -//! -//! 1663_13990_2609_0_5_000000000169C348_000000000169C349 -//! -//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that. -//! So the above example would become: -//! -//! 1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED -//! -//! The end LSN indicates when it was dropped in that case, we don't store it in the -//! file contents in any way. -//! -//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two -//! parts: the page versions and the segment sizes. They are stored as separate chapters. -//! 
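// Editor's note: the sketch below is illustrative only and is not part of the
// original file. It shows how a file name in the format documented above could be
// split into its components; the real parsing lives in DeltaFileName::parse_str
// (filename.rs), and the struct and function names here are invented for the
// example. For instance,
// parse_delta_name("1663_13990_2609_0_5_000000000169C348_000000000169C349")
// would yield spcnode 1663, dbnode 13990, relnode 2609, forknum 0, segno 5,
// and the two LSNs decoded from hex.
struct ParsedDeltaName {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
    segno: u32,
    start_lsn: u64,
    end_lsn: u64,
    dropped: bool,
}

fn parse_delta_name(name: &str) -> Option<ParsedDeltaName> {
    // An optional "_DROPPED" suffix marks a dropped relation.
    let (name, dropped) = match name.strip_suffix("_DROPPED") {
        Some(stripped) => (stripped, true),
        None => (name, false),
    };
    let parts: Vec<&str> = name.split('_').collect();
    if parts.len() != 7 {
        return None;
    }
    Some(ParsedDeltaName {
        spcnode: parts[0].parse().ok()?,
        dbnode: parts[1].parse().ok()?,
        relnode: parts[2].parse().ok()?,
        forknum: parts[3].parse().ok()?,
        segno: parts[4].parse().ok()?,
        // The two LSNs are encoded as 16-digit hexadecimal.
        start_lsn: u64::from_str_radix(parts[5], 16).ok()?,
        end_lsn: u64::from_str_radix(parts[6], 16).ok()?,
        dropped,
    })
}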
-use crate::config::PageServerConf; -use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, - RELISH_SEG_SIZE, -}; -use crate::virtual_file::VirtualFile; -use crate::walrecord; -use crate::{ZTenantId, ZTimelineId}; -use anyhow::{bail, ensure, Result}; -use log::*; -use serde::{Deserialize, Serialize}; -use zenith_utils::vec_map::VecMap; -// avoid binding to Write (conflicts with std::io::Write) -// while being able to use std::fmt::Write's methods -use std::fmt::Write as _; -use std::fs; -use std::io::{BufWriter, Write}; -use std::ops::Bound::Included; -use std::os::unix::fs::FileExt; -use std::path::{Path, PathBuf}; -use std::sync::{Mutex, MutexGuard}; - -use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter}; - -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; - -// Magic constant to identify a Zenith delta file -pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01; - -/// Mapping from (block #, lsn) -> page/WAL record -/// byte ranges in PAGE_VERSIONS_CHAPTER -static PAGE_VERSION_METAS_CHAPTER: u64 = 1; -/// Page/WAL bytes - cannot be interpreted -/// without PAGE_VERSION_METAS_CHAPTER -static PAGE_VERSIONS_CHAPTER: u64 = 2; -static SEG_SIZES_CHAPTER: u64 = 3; - -/// Contains the [`Summary`] struct -static SUMMARY_CHAPTER: u64 = 4; - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -struct Summary { - tenantid: ZTenantId, - timelineid: ZTimelineId, - seg: SegmentTag, - - start_lsn: Lsn, - end_lsn: Lsn, - - dropped: bool, -} - -impl From<&DeltaLayer> for Summary { - fn from(layer: &DeltaLayer) -> Self { - Self { - tenantid: layer.tenantid, - timelineid: layer.timelineid, - seg: layer.seg, - - start_lsn: layer.start_lsn, - end_lsn: layer.end_lsn, - - dropped: layer.dropped, - } - } -} - -#[derive(Serialize, Deserialize)] -struct BlobRange { - offset: u64, - size: usize, -} - -fn read_blob(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result> { - let mut buf = vec![0u8; range.size]; - reader.read_exact_at(&mut buf, range.offset)?; - Ok(buf) -} - -/// -/// DeltaLayer is the in-memory data structure associated with an -/// on-disk delta file. We keep a DeltaLayer in memory for each -/// file, in the LayerMap. If a layer is in "loaded" state, we have a -/// copy of the file in memory, in 'inner'. Otherwise the struct is -/// just a placeholder for a file that exists on disk, and it needs to -/// be loaded before using it in queries. -/// -pub struct DeltaLayer { - path_or_conf: PathOrConf, - - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, - pub seg: SegmentTag, - - // - // This entry contains all the changes from 'start_lsn' to 'end_lsn'. The - // start is inclusive, and end is exclusive. - // - pub start_lsn: Lsn, - pub end_lsn: Lsn, - - dropped: bool, - - inner: Mutex, -} - -pub struct DeltaLayerInner { - /// If false, the 'page_version_metas' and 'seg_sizes' have not been - /// loaded into memory yet. - loaded: bool, - - book: Option>, - - /// All versions of all pages in the file are are kept here. - /// Indexed by block number and LSN. - page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, - - /// `seg_sizes` tracks the size of the segment at different points in time. - seg_sizes: VecMap, -} - -impl DeltaLayerInner { - fn get_seg_size(&self, lsn: Lsn) -> Result { - // Scan the VecMap backwards, starting from the given entry. 
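        // Editor's note: that is, take the most recently recorded size at or before
        // `lsn`; `seg_sizes` is keyed by the LSN at which the segment size changed.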
- let slice = self - .seg_sizes - .slice_range((Included(&Lsn(0)), Included(&lsn))); - if let Some((_entry_lsn, entry)) = slice.last() { - Ok(*entry) - } else { - bail!("could not find seg size in delta layer") - } - } -} - -impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid - } - - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid - } - - fn get_seg_tag(&self) -> SegmentTag { - self.seg - } - - fn is_dropped(&self) -> bool { - self.dropped - } - - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { - self.end_lsn - } - - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) - } - - /// Look up given page in the cache. - fn get_page_reconstruct_data( - &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { - let mut need_image = true; - - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } - - { - // Open the file and lock the metadata in memory - let inner = self.load()?; - let page_version_reader = inner - .book - .as_ref() - .expect("should be loaded in load call above") - .chapter_reader(PAGE_VERSIONS_CHAPTER)?; - - // Scan the metadata VecMap backwards, starting from the given entry. - let minkey = (blknum, Lsn(0)); - let maxkey = (blknum, lsn); - let iter = inner - .page_version_metas - .slice_range((Included(&minkey), Included(&maxkey))) - .iter() - .rev(); - for ((_blknum, pv_lsn), blob_range) in iter { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if pv_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } - - let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?; - - match pv { - PageVersion::Page(img) => { - // Found a page image, return it - reconstruct_data.page_img = Some((*pv_lsn, img)); - need_image = false; - break; - } - PageVersion::Wal(rec) => { - let will_init = rec.will_init(); - reconstruct_data.records.push((*pv_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= inner.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - - // release metadata lock and close the file - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) - } else { - Ok(PageReconstructResult::Complete) - } - } - - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); - - let inner = self.load()?; - inner.get_seg_size(lsn) - } - - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { - // Is the requested LSN after the rel was dropped? - if self.dropped && lsn >= self.end_lsn { - return Ok(false); - } - - // Otherwise, it exists. - Ok(true) - } - - /// - /// Release most of the memory used by this layer. If it's accessed again later, - /// it will need to be loaded back. 
- /// - fn unload(&self) -> Result<()> { - let mut inner = self.inner.lock().unwrap(); - inner.page_version_metas = VecMap::default(); - inner.seg_sizes = VecMap::default(); - inner.loaded = false; - - // Note: we keep the Book open. Is that a good idea? The virtual file - // machinery has its own rules for closing the file descriptor if it's not - // needed, but the Book struct uses up some memory, too. - - Ok(()) - } - - fn delete(&self) -> Result<()> { - // delete underlying file - fs::remove_file(self.path())?; - Ok(()) - } - - fn is_incremental(&self) -> bool { - true - } - - fn is_in_memory(&self) -> bool { - false - } - - /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { - println!( - "----- delta layer for ten {} tli {} seg {} {}-{} ----", - self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn - ); - - println!("--- seg sizes ---"); - let inner = self.load()?; - for (k, v) in inner.seg_sizes.as_slice() { - println!(" {}: {}", k, v); - } - println!("--- page versions ---"); - - let path = self.path(); - let file = std::fs::File::open(&path)?; - let book = Book::new(file)?; - - let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?; - for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() { - let mut desc = String::new(); - - let buf = read_blob(&chapter, blob_range)?; - let pv = PageVersion::des(&buf)?; - - match pv { - PageVersion::Page(img) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - PageVersion::Wal(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - blob_range.size, - rec.will_init(), - wal_desc - )?; - } - } - - println!(" blk {} at {}: {}", blk, lsn, desc); - } - - Ok(()) - } -} - -impl DeltaLayer { - fn path_for( - path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - fname: &DeltaFileName, - ) -> PathBuf { - match path_or_conf { - PathOrConf::Path(path) => path.clone(), - PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) - .join(fname.to_string()), - } - } - - /// - /// Load the contents of the file into memory - /// - fn load(&self) -> Result> { - // quick exit if already loaded - let mut inner = self.inner.lock().unwrap(); - - if inner.loaded { - return Ok(inner); - } - - let path = self.path(); - - // Open the file if it's not open already. - if inner.book.is_none() { - let file = VirtualFile::open(&path)?; - inner.book = Some(Book::new(file)?); - } - let book = inner.book.as_ref().unwrap(); - - match &self.path_or_conf { - PathOrConf::Conf(_) => { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let actual_summary = Summary::des(&chapter)?; - - let expected_summary = Summary::from(self); - - if actual_summary != expected_summary { - bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); - } - } - PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); - - if actual_filename != expected_filename { - println!( - "warning: filename does not match what is expected from in-file summary" - ); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); - } - } - } - - let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; - let page_version_metas = VecMap::des(&chapter)?; - - let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?; - let seg_sizes = VecMap::des(&chapter)?; - - debug!("loaded from {}", &path.display()); - - inner.page_version_metas = page_version_metas; - inner.seg_sizes = seg_sizes; - inner.loaded = true; - - Ok(inner) - } - - /// Create a DeltaLayer struct representing an existing file on disk. - pub fn new( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - filename: &DeltaFileName, - ) -> DeltaLayer { - DeltaLayer { - path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, - seg: filename.seg, - start_lsn: filename.start_lsn, - end_lsn: filename.end_lsn, - dropped: filename.dropped, - inner: Mutex::new(DeltaLayerInner { - loaded: false, - book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), - }), - } - } - - /// Create a DeltaLayer struct representing an existing file on disk. - /// - /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. - pub fn new_for_path(path: &Path, book: &Book) -> Result - where - F: std::os::unix::prelude::FileExt, - { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let summary = Summary::des(&chapter)?; - - Ok(DeltaLayer { - path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, - seg: summary.seg, - start_lsn: summary.start_lsn, - end_lsn: summary.end_lsn, - dropped: summary.dropped, - inner: Mutex::new(DeltaLayerInner { - loaded: false, - book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), - }), - }) - } - - fn layer_name(&self) -> DeltaFileName { - DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - } - } - - /// Path to the layer file in pageserver workdir. - pub fn path(&self) -> PathBuf { - Self::path_for( - &self.path_or_conf, - self.timelineid, - self.tenantid, - &self.layer_name(), - ) - } -} - -/// A builder object for constructing a new delta layer. -/// -/// Usage: -/// -/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) -/// -/// 2. Write the contents by calling `put_page_version` for every page -/// version to store in the layer. -/// -/// 3. Call `finish`. -/// -pub struct DeltaLayerWriter { - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, - - page_version_writer: ChapterWriter>, - pv_offset: u64, - - page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, -} - -impl DeltaLayerWriter { - /// - /// Start building a new delta layer. - /// - pub fn new( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, - ) -> Result { - // Create the file - // - // Note: This overwrites any existing file. There shouldn't be any. - // FIXME: throw an error instead? 
- let path = DeltaLayer::path_for( - &PathOrConf::Conf(conf), - timelineid, - tenantid, - &DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, - }, - ); - let file = VirtualFile::create(&path)?; - let buf_writer = BufWriter::new(file); - let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?; - - // Open the page-versions chapter for writing. The calls to - // `put_page_version` will use this to write the contents. - let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER); - - Ok(DeltaLayerWriter { - conf, - timelineid, - tenantid, - seg, - start_lsn, - end_lsn, - dropped, - page_version_writer, - page_version_metas: VecMap::default(), - pv_offset: 0, - }) - } - - /// - /// Append a page version to the file. - /// - /// 'buf' is a serialized PageVersion. - /// The page versions must be appended in blknum, lsn order. - /// - pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> { - // Remember the offset and size metadata. The metadata is written - // to a separate chapter, in `finish`. - let blob_range = BlobRange { - offset: self.pv_offset, - size: buf.len(), - }; - self.page_version_metas - .append((blknum, lsn), blob_range) - .unwrap(); - - // write the page version - self.page_version_writer.write_all(buf)?; - self.pv_offset += buf.len() as u64; - - Ok(()) - } - - /// - /// Finish writing the delta layer. - /// - /// 'seg_sizes' is a list of size changes to store with the actual data. - /// - pub fn finish(self, seg_sizes: VecMap) -> Result { - // Close the page-versions chapter - let book = self.page_version_writer.close()?; - - // Write out page versions metadata - let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER); - let buf = VecMap::ser(&self.page_version_metas)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; - - if self.seg.rel.is_blocky() { - assert!(!seg_sizes.is_empty()); - } - - // and seg_sizes to separate chapter - let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER); - let buf = VecMap::ser(&seg_sizes)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; - - let mut chapter = book.new_chapter(SUMMARY_CHAPTER); - let summary = Summary { - tenantid: self.tenantid, - timelineid: self.timelineid, - seg: self.seg, - - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - - dropped: self.dropped, - }; - Summary::ser_into(&summary, &mut chapter)?; - let book = chapter.close()?; - - // This flushes the underlying 'buf_writer'. - book.close()?; - - // Note: Because we opened the file in write-only mode, we cannot - // reuse the same VirtualFile for reading later. That's why we don't - // set inner.book here. The first read will have to re-open it. - let layer = DeltaLayer { - path_or_conf: PathOrConf::Conf(self.conf), - tenantid: self.tenantid, - timelineid: self.timelineid, - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - inner: Mutex::new(DeltaLayerInner { - loaded: false, - book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), - }), - }; - - trace!("created delta layer {}", &layer.path().display()); - - Ok(layer) - } -} diff --git a/pageserver/src/layered_repository/ephemeral_file.rs b/pageserver/src/layered_repository/ephemeral_file.rs deleted file mode 100644 index 79a72f4563..0000000000 --- a/pageserver/src/layered_repository/ephemeral_file.rs +++ /dev/null @@ -1,310 +0,0 @@ -//! Implementation of append-only file data structure -//! used to keep in-memory layers spilled on disk. 
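The append-only usage pattern this module provides can be sketched briefly. The sketch below mirrors the unit test at the end of this file: create a file, append through the Write impl, and read back at an absolute offset through the FileExt impl. It is illustrative only, not part of the deleted file, and assumes the crate's EphemeralFile, PageServerConf, ZTenantId and ZTimelineId types are in scope.

use std::io::Write;
use std::os::unix::fs::FileExt;

// Illustrative sketch (not part of the original file). Assumes the crate's
// EphemeralFile, PageServerConf, ZTenantId and ZTimelineId are in scope.
fn ephemeral_file_example(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
) -> Result<(), std::io::Error> {
    // Create a new ephemeral file in the timeline directory ("ephemeral-<n>").
    let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;

    // Append bytes through the Write impl; the file only ever grows.
    file.write_all(b"serialized page version")?;

    // Read them back at an absolute offset through the FileExt impl.
    let mut buf = [0u8; 23];
    file.read_exact_at(&mut buf, 0)?;
    assert_eq!(&buf, b"serialized page version");

    // Dropping the EphemeralFile evicts its page-cache buffers and unlinks
    // the file on disk.
    Ok(())
}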
- -use crate::config::PageServerConf; -use crate::page_cache; -use crate::page_cache::PAGE_SZ; -use crate::page_cache::{ReadBufResult, WriteBufResult}; -use crate::virtual_file::VirtualFile; -use lazy_static::lazy_static; -use std::cmp::min; -use std::collections::HashMap; -use std::fs::OpenOptions; -use std::io::{Error, ErrorKind, Seek, SeekFrom, Write}; -use std::ops::DerefMut; -use std::path::PathBuf; -use std::sync::{Arc, RwLock}; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; - -use std::os::unix::fs::FileExt; - -lazy_static! { - /// - /// This is the global cache of file descriptors (File objects). - /// - static ref EPHEMERAL_FILES: RwLock = RwLock::new(EphemeralFiles { - next_file_id: 1, - files: HashMap::new(), - }); -} - -pub struct EphemeralFiles { - next_file_id: u64, - - files: HashMap>, -} - -pub struct EphemeralFile { - file_id: u64, - _tenantid: ZTenantId, - _timelineid: ZTimelineId, - file: Arc, - - pos: u64, -} - -impl EphemeralFile { - pub fn create( - conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - ) -> Result { - let mut l = EPHEMERAL_FILES.write().unwrap(); - let file_id = l.next_file_id; - l.next_file_id += 1; - - let filename = conf - .timeline_path(&timelineid, &tenantid) - .join(PathBuf::from(format!("ephemeral-{}", file_id))); - - let file = VirtualFile::open_with_options( - &filename, - OpenOptions::new().read(true).write(true).create(true), - )?; - let file_rc = Arc::new(file); - l.files.insert(file_id, file_rc.clone()); - - Ok(EphemeralFile { - file_id, - _tenantid: tenantid, - _timelineid: timelineid, - file: file_rc, - pos: 0, - }) - } - - pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> { - let mut off = 0; - while off < PAGE_SZ { - let n = self - .file - .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?; - - if n == 0 { - // Reached EOF. Fill the rest of the buffer with zeros. - const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ]; - - buf[off..].copy_from_slice(&ZERO_BUF[off..]); - break; - } - - off += n as usize; - } - Ok(()) - } -} - -/// Does the given filename look like an ephemeral file? -pub fn is_ephemeral_file(filename: &str) -> bool { - if let Some(rest) = filename.strip_prefix("ephemeral-") { - rest.parse::().is_ok() - } else { - false - } -} - -impl FileExt for EphemeralFile { - fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result { - // Look up the right page - let blkno = (offset / PAGE_SZ as u64) as u32; - let off = offset as usize % PAGE_SZ; - let len = min(PAGE_SZ - off, dstbuf.len()); - - let read_guard; - let mut write_guard; - - let cache = page_cache::get(); - let buf = match cache.read_ephemeral_buf(self.file_id, blkno) { - ReadBufResult::Found(guard) => { - read_guard = guard; - read_guard.as_ref() - } - ReadBufResult::NotFound(guard) => { - // Read the page from disk into the buffer - write_guard = guard; - self.fill_buffer(write_guard.deref_mut(), blkno)?; - write_guard.mark_valid(); - - // And then fall through to read the requested slice from the - // buffer. 
- write_guard.as_ref() - } - }; - - dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]); - Ok(len) - } - - fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result { - // Look up the right page - let blkno = (offset / PAGE_SZ as u64) as u32; - let off = offset as usize % PAGE_SZ; - let len = min(PAGE_SZ - off, srcbuf.len()); - - let mut write_guard; - let cache = page_cache::get(); - let buf = match cache.write_ephemeral_buf(self.file_id, blkno) { - WriteBufResult::Found(guard) => { - write_guard = guard; - write_guard.deref_mut() - } - WriteBufResult::NotFound(guard) => { - // Read the page from disk into the buffer - // TODO: if we're overwriting the whole page, no need to read it in first - write_guard = guard; - self.fill_buffer(write_guard.deref_mut(), blkno)?; - write_guard.mark_valid(); - - // And then fall through to modify it. - write_guard.deref_mut() - } - }; - - buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]); - write_guard.mark_dirty(); - Ok(len) - } -} - -impl Write for EphemeralFile { - fn write(&mut self, buf: &[u8]) -> Result { - let n = self.write_at(buf, self.pos)?; - self.pos += n as u64; - Ok(n) - } - - fn flush(&mut self) -> Result<(), std::io::Error> { - // we don't need to flush data: - // * we either write input bytes or not, not keeping any intermediate data buffered - // * rust unix file `flush` impl does not flush things either, returning `Ok(())` - Ok(()) - } -} - -impl Seek for EphemeralFile { - fn seek(&mut self, pos: SeekFrom) -> Result { - match pos { - SeekFrom::Start(offset) => { - self.pos = offset; - } - SeekFrom::End(_offset) => { - return Err(Error::new( - ErrorKind::Other, - "SeekFrom::End not supported by EphemeralFile", - )); - } - SeekFrom::Current(offset) => { - let pos = self.pos as i128 + offset as i128; - if pos < 0 { - return Err(Error::new( - ErrorKind::InvalidInput, - "offset would be negative", - )); - } - if pos > u64::MAX as i128 { - return Err(Error::new(ErrorKind::InvalidInput, "offset overflow")); - } - self.pos = pos as u64; - } - } - Ok(self.pos) - } -} - -impl Drop for EphemeralFile { - fn drop(&mut self) { - // drop all pages from page cache - let cache = page_cache::get(); - cache.drop_buffers_for_ephemeral(self.file_id); - - // remove entry from the hash map - EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id); - - // unlink file - // FIXME: print error - let _ = std::fs::remove_file(&self.file.path); - } -} - -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> { - if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { - file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?; - Ok(()) - } else { - Err(std::io::Error::new( - ErrorKind::Other, - "could not write back page, not found in ephemeral files hash", - )) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rand::seq::SliceRandom; - use rand::thread_rng; - use std::fs; - use std::str::FromStr; - - fn repo_harness( - test_name: &str, - ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. 
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap(); - let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?; - - Ok((conf, tenantid, timelineid)) - } - - // Helper function to slurp contents of a file, starting at the current position, - // into a string - fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { - let mut buf = Vec::new(); - buf.resize(len, 0u8); - - efile.read_exact_at(&mut buf, offset)?; - - Ok(String::from_utf8_lossy(&buf) - .trim_end_matches('\0') - .to_string()) - } - - #[test] - fn test_ephemeral_files() -> Result<(), Error> { - let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?; - - let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?; - - file_a.write_all(b"foo")?; - assert_eq!("foo", read_string(&file_a, 0, 20)?); - - file_a.write_all(b"bar")?; - assert_eq!("foobar", read_string(&file_a, 0, 20)?); - - // Open a lot of files, enough to cause some page evictions. - let mut efiles = Vec::new(); - for fileno in 0..100 { - let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?; - efile.write_all(format!("file {}", fileno).as_bytes())?; - assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); - efiles.push((fileno, efile)); - } - - // Check that all the files can still be read from. Use them in random order for - // good measure. - efiles.as_mut_slice().shuffle(&mut thread_rng()); - for (fileno, efile) in efiles.iter_mut() { - assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?); - } - - Ok(()) - } -} diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs deleted file mode 100644 index df23700dfd..0000000000 --- a/pageserver/src/layered_repository/filename.rs +++ /dev/null @@ -1,279 +0,0 @@ -//! -//! Helper functions for dealing with filenames of the image and delta layer files. -//! -use crate::config::PageServerConf; -use crate::layered_repository::storage_layer::SegmentTag; -use crate::relish::*; -use std::fmt; -use std::path::PathBuf; - -use zenith_utils::lsn::Lsn; - -// Note: LayeredTimeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -pub struct DeltaFileName { - pub seg: SegmentTag, - pub start_lsn: Lsn, - pub end_lsn: Lsn, - pub dropped: bool, -} - -/// Represents the filename of a DeltaLayer -/// -/// ______ -/// -/// or if it was dropped: -/// -/// _______DROPPED -/// -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. 
- /// - pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { - return None; - } - - let segno = parts.next()?.parse::().ok()?; - - let seg = SegmentTag { rel, segno }; - - let start_lsn = Lsn::from_hex(parts.next()?).ok()?; - let end_lsn = Lsn::from_hex(parts.next()?).ok()?; - - let mut dropped = false; - if let Some(suffix) = parts.next() { - if suffix == "DROPPED" { - dropped = true; - } else { - return None; - } - } - if parts.next().is_some() { - return None; - } - - Some(DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, - }) - } -} - -impl fmt::Display for DeltaFileName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - - write!( - f, - "{}_{}_{:016X}_{:016X}{}", - basename, - self.seg.segno, - u64::from(self.start_lsn), - u64::from(self.end_lsn), - if self.dropped { "_DROPPED" } else { "" } - ) - } -} - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -pub struct ImageFileName { - pub seg: SegmentTag, - pub lsn: Lsn, -} - -/// -/// Represents the filename of an ImageLayer -/// -/// _____ -/// -impl ImageFileName { - /// - /// Parse a string as an image file name. 
Returns None if the filename does not - /// match the expected pattern. - /// - pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { - return None; - } - - let segno = parts.next()?.parse::().ok()?; - - let seg = SegmentTag { rel, segno }; - - let lsn = Lsn::from_hex(parts.next()?).ok()?; - - if parts.next().is_some() { - return None; - } - - Some(ImageFileName { seg, lsn }) - } -} - -impl fmt::Display for ImageFileName { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - - write!( - f, - "{}_{}_{:016X}", - basename, - self.seg.segno, - u64::from(self.lsn), - ) - } -} - -/// Helper enum to hold a PageServerConf, or a path -/// -/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the -/// global config, and paths to layer files are constructed using the tenant/timeline -/// path from the config. But in the 'dump_layerfile' binary, we need to construct a Layer -/// struct for a file on disk, without having a page server running, so that we have no -/// config. In that case, we use the Path variant to hold the full path to the file on -/// disk. 
-pub enum PathOrConf { - Path(PathBuf), - Conf(&'static PageServerConf), -} diff --git a/pageserver/src/layered_repository/global_layer_map.rs b/pageserver/src/layered_repository/global_layer_map.rs deleted file mode 100644 index 169a89650a..0000000000 --- a/pageserver/src/layered_repository/global_layer_map.rs +++ /dev/null @@ -1,142 +0,0 @@ -//! -//! Global registry of open layers. -//! -//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered -//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of -//! in-memory layers in the system, and know when we need to evict some to release -//! memory. -//! -//! Each layer is assigned a unique ID when it's registered in the global registry. -//! The ID can be used to relocate the layer later, without having to hold locks. -//! - -use std::sync::atomic::{AtomicU8, Ordering}; -use std::sync::{Arc, RwLock}; - -use super::inmemory_layer::InMemoryLayer; - -use lazy_static::lazy_static; - -const MAX_USAGE_COUNT: u8 = 5; - -lazy_static! { - pub static ref GLOBAL_LAYER_MAP: RwLock = - RwLock::new(InMemoryLayers::default()); -} - -// TODO these types can probably be smaller -#[derive(PartialEq, Eq, Clone, Copy)] -pub struct LayerId { - index: usize, - tag: u64, // to avoid ABA problem -} - -enum SlotData { - Occupied(Arc), - /// Vacant slots form a linked list, the value is the index - /// of the next vacant slot in the list. - Vacant(Option), -} - -struct Slot { - tag: u64, - data: SlotData, - usage_count: AtomicU8, // for clock algorithm -} - -#[derive(Default)] -pub struct InMemoryLayers { - slots: Vec, - num_occupied: usize, - - // Head of free-slot list. - next_empty_slot_idx: Option, -} - -impl InMemoryLayers { - pub fn insert(&mut self, layer: Arc) -> LayerId { - let slot_idx = match self.next_empty_slot_idx { - Some(slot_idx) => slot_idx, - None => { - let idx = self.slots.len(); - self.slots.push(Slot { - tag: 0, - data: SlotData::Vacant(None), - usage_count: AtomicU8::new(0), - }); - idx - } - }; - let slots_len = self.slots.len(); - - let slot = &mut self.slots[slot_idx]; - - match slot.data { - SlotData::Occupied(_) => { - panic!("an occupied slot was in the free list"); - } - SlotData::Vacant(next_empty_slot_idx) => { - self.next_empty_slot_idx = next_empty_slot_idx; - } - } - - slot.data = SlotData::Occupied(layer); - slot.usage_count.store(1, Ordering::Relaxed); - - self.num_occupied += 1; - assert!(self.num_occupied <= slots_len); - - LayerId { - index: slot_idx, - tag: slot.tag, - } - } - - pub fn get(&self, layer_id: &LayerId) -> Option> { - let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic? 
- if slot.tag != layer_id.tag { - return None; - } - - if let SlotData::Occupied(layer) = &slot.data { - let _ = slot.usage_count.fetch_update( - Ordering::Relaxed, - Ordering::Relaxed, - |old_usage_count| { - if old_usage_count < MAX_USAGE_COUNT { - Some(old_usage_count + 1) - } else { - None - } - }, - ); - Some(Arc::clone(layer)) - } else { - None - } - } - - // TODO this won't be a public API in the future - pub fn remove(&mut self, layer_id: &LayerId) { - let slot = &mut self.slots[layer_id.index]; - - if slot.tag != layer_id.tag { - return; - } - - match &slot.data { - SlotData::Occupied(_layer) => { - // TODO evict the layer - } - SlotData::Vacant(_) => unimplemented!(), - } - - slot.data = SlotData::Vacant(self.next_empty_slot_idx); - self.next_empty_slot_idx = Some(layer_id.index); - - assert!(self.num_occupied > 0); - self.num_occupied -= 1; - - slot.tag = slot.tag.wrapping_add(1); - } -} diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs deleted file mode 100644 index 24445ff7e9..0000000000 --- a/pageserver/src/layered_repository/image_layer.rs +++ /dev/null @@ -1,534 +0,0 @@ -//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN. -//! It is stored in a file on disk. -//! -//! On disk, the image files are stored in timelines/ directory. -//! Currently, there are no subdirectories, and each image layer file is named like this: -//! -//! Note that segno is -//! _____ -//! -//! For example: -//! -//! 1663_13990_2609_0_5_000000000169C348 -//! -//! An image file is constructed using the 'bookfile' crate. -//! -//! Only metadata is loaded into memory by the load function. -//! When images are needed, they are read directly from disk. -//! -//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER. -//! All the images are required to be BLOCK_SIZE, which allows for random access. -//! -//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER. -//! 
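Since every block image stored in BLOCKY_IMAGES_CHAPTER is exactly BLOCK_SIZE bytes, looking up a block reduces to simple offset arithmetic, which is what makes the random access mentioned above possible. A minimal sketch of that lookup, assuming only a reader that supports read_exact_at (as the bookfile chapter readers used in this file do); it is illustrative and not part of the deleted file:

use std::os::unix::fs::FileExt;

const BLOCK_SIZE: usize = 8192;

// Read the image of block `blknum` from a chapter in which all images are
// exactly BLOCK_SIZE bytes, stored back to back in block-number order.
fn read_block_image<R: FileExt>(chapter: &R, blknum: u32) -> std::io::Result<Vec<u8>> {
    // Block N starts at byte N * BLOCK_SIZE.
    let offset = BLOCK_SIZE as u64 * blknum as u64;
    let mut buf = vec![0u8; BLOCK_SIZE];
    chapter.read_exact_at(&mut buf, offset)?;
    Ok(buf)
}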
-use crate::config::PageServerConf; -use crate::layered_repository::filename::{ImageFileName, PathOrConf}; -use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, -}; -use crate::layered_repository::RELISH_SEG_SIZE; -use crate::virtual_file::VirtualFile; -use crate::{ZTenantId, ZTimelineId}; -use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; -use log::*; -use serde::{Deserialize, Serialize}; -use std::convert::TryInto; -use std::fs; -use std::io::{BufWriter, Write}; -use std::path::{Path, PathBuf}; -use std::sync::{Mutex, MutexGuard}; - -use bookfile::{Book, BookWriter, ChapterWriter}; - -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; - -// Magic constant to identify a Zenith segment image file -pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1; - -/// Contains each block in block # order -const BLOCKY_IMAGES_CHAPTER: u64 = 1; -const NONBLOCKY_IMAGE_CHAPTER: u64 = 2; - -/// Contains the [`Summary`] struct -const SUMMARY_CHAPTER: u64 = 3; - -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -struct Summary { - tenantid: ZTenantId, - timelineid: ZTimelineId, - seg: SegmentTag, - - lsn: Lsn, -} - -impl From<&ImageLayer> for Summary { - fn from(layer: &ImageLayer) -> Self { - Self { - tenantid: layer.tenantid, - timelineid: layer.timelineid, - seg: layer.seg, - - lsn: layer.lsn, - } - } -} - -const BLOCK_SIZE: usize = 8192; - -/// -/// ImageLayer is the in-memory data structure associated with an on-disk image -/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a -/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'. -/// Otherwise the struct is just a placeholder for a file that exists on disk, -/// and it needs to be loaded before using it in queries. -/// -pub struct ImageLayer { - path_or_conf: PathOrConf, - pub tenantid: ZTenantId, - pub timelineid: ZTimelineId, - pub seg: SegmentTag, - - // This entry contains an image of all pages as of this LSN - pub lsn: Lsn, - - inner: Mutex, -} - -#[derive(Clone)] -enum ImageType { - Blocky { num_blocks: SegmentBlk }, - NonBlocky, -} - -pub struct ImageLayerInner { - /// If None, the 'image_type' has not been loaded into memory yet. 
- book: Option>, - - /// Derived from filename and bookfile chapter metadata - image_type: ImageType, -} - -impl Layer for ImageLayer { - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) - } - - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid - } - - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid - } - - fn get_seg_tag(&self) -> SegmentTag { - self.seg - } - - fn is_dropped(&self) -> bool { - false - } - - fn get_start_lsn(&self) -> Lsn { - self.lsn - } - - fn get_end_lsn(&self) -> Lsn { - // End-bound is exclusive - self.lsn + 1 - } - - /// Look up given page in the file - fn get_page_reconstruct_data( - &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - assert!(lsn >= self.lsn); - - match reconstruct_data.page_img { - Some((cached_lsn, _)) if self.lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } - - let inner = self.load()?; - - let buf = match &inner.image_type { - ImageType::Blocky { num_blocks } => { - // Check if the request is beyond EOF - if blknum >= *num_blocks { - return Ok(PageReconstructResult::Missing(lsn)); - } - - let mut buf = vec![0u8; BLOCK_SIZE]; - let offset = BLOCK_SIZE as u64 * blknum as u64; - - let chapter = inner - .book - .as_ref() - .unwrap() - .chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - - chapter.read_exact_at(&mut buf, offset).with_context(|| { - format!( - "failed to read page from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; - - buf - } - ImageType::NonBlocky => { - ensure!(blknum == 0); - inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)? - .into_vec() - } - }; - - reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf))); - Ok(PageReconstructResult::Complete) - } - - /// Get size of the segment - fn get_seg_size(&self, _lsn: Lsn) -> Result { - let inner = self.load()?; - match inner.image_type { - ImageType::Blocky { num_blocks } => Ok(num_blocks), - ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")), - } - } - - /// Does this segment exist at given LSN? 
- fn get_seg_exists(&self, _lsn: Lsn) -> Result { - Ok(true) - } - - fn unload(&self) -> Result<()> { - Ok(()) - } - - fn delete(&self) -> Result<()> { - // delete underlying file - fs::remove_file(self.path())?; - Ok(()) - } - - fn is_incremental(&self) -> bool { - false - } - - fn is_in_memory(&self) -> bool { - false - } - - /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { - println!( - "----- image layer for ten {} tli {} seg {} at {} ----", - self.tenantid, self.timelineid, self.seg, self.lsn - ); - - let inner = self.load()?; - - match inner.image_type { - ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks), - ImageType::NonBlocky => { - let chapter = inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?; - println!("non-blocky ({} bytes)", chapter.len()); - } - } - - Ok(()) - } -} - -impl ImageLayer { - fn path_for( - path_or_conf: &PathOrConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - fname: &ImageFileName, - ) -> PathBuf { - match path_or_conf { - PathOrConf::Path(path) => path.to_path_buf(), - PathOrConf::Conf(conf) => conf - .timeline_path(&timelineid, &tenantid) - .join(fname.to_string()), - } - } - - /// - /// Load the contents of the file into memory - /// - fn load(&self) -> Result> { - // quick exit if already loaded - let mut inner = self.inner.lock().unwrap(); - - if inner.book.is_some() { - return Ok(inner); - } - - let path = self.path(); - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?; - let book = Book::new(file).with_context(|| { - format!( - "Failed to open virtual file '{}' as a bookfile", - path.display() - ) - })?; - - match &self.path_or_conf { - PathOrConf::Conf(_) => { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let actual_summary = Summary::des(&chapter)?; - - let expected_summary = Summary::from(self); - - if actual_summary != expected_summary { - bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); - } - } - PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); - - if actual_filename != expected_filename { - println!( - "warning: filename does not match what is expected from in-file summary" - ); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); - } - } - } - - let image_type = if self.seg.rel.is_blocky() { - let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - let images_len = chapter.len(); - ensure!(images_len % BLOCK_SIZE as u64 == 0); - let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?; - ImageType::Blocky { num_blocks } - } else { - let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?; - ImageType::NonBlocky - }; - - debug!("loaded from {}", &path.display()); - - *inner = ImageLayerInner { - book: Some(book), - image_type, - }; - - Ok(inner) - } - - /// Create an ImageLayer struct representing an existing file on disk - pub fn new( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - filename: &ImageFileName, - ) -> ImageLayer { - ImageLayer { - path_or_conf: PathOrConf::Conf(conf), - timelineid, - tenantid, - seg: filename.seg, - lsn: filename.lsn, - inner: Mutex::new(ImageLayerInner { - book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, - }), - } - } - - /// Create an ImageLayer struct representing an existing file on disk. - /// - /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. - pub fn new_for_path(path: &Path, book: &Book) -> Result - where - F: std::os::unix::prelude::FileExt, - { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let summary = Summary::des(&chapter)?; - - Ok(ImageLayer { - path_or_conf: PathOrConf::Path(path.to_path_buf()), - timelineid: summary.timelineid, - tenantid: summary.tenantid, - seg: summary.seg, - lsn: summary.lsn, - inner: Mutex::new(ImageLayerInner { - book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, - }), - }) - } - - fn layer_name(&self) -> ImageFileName { - ImageFileName { - seg: self.seg, - lsn: self.lsn, - } - } - - /// Path to the layer file in pageserver workdir. - pub fn path(&self) -> PathBuf { - Self::path_for( - &self.path_or_conf, - self.timelineid, - self.tenantid, - &self.layer_name(), - ) - } -} - -/// A builder object for constructing a new image layer. -/// -/// Usage: -/// -/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) -/// -/// 2. Write the contents by calling `put_page_image` for every page -/// in the segment. -/// -/// 3. Call `finish`. -/// -pub struct ImageLayerWriter { - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - seg: SegmentTag, - lsn: Lsn, - - num_blocks: SegmentBlk, - - page_image_writer: ChapterWriter>, - num_blocks_written: SegmentBlk, -} - -impl ImageLayerWriter { - pub fn new( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - seg: SegmentTag, - lsn: Lsn, - num_blocks: SegmentBlk, - ) -> Result { - // Create the file - // - // Note: This overwrites any existing file. There shouldn't be any. - // FIXME: throw an error instead? 
- let path = ImageLayer::path_for( - &PathOrConf::Conf(conf), - timelineid, - tenantid, - &ImageFileName { seg, lsn }, - ); - let file = VirtualFile::create(&path)?; - let buf_writer = BufWriter::new(file); - let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?; - - // Open the page-images chapter for writing. The calls to - // `put_page_image` will use this to write the contents. - let chapter = if seg.rel.is_blocky() { - book.new_chapter(BLOCKY_IMAGES_CHAPTER) - } else { - assert_eq!(num_blocks, 1); - book.new_chapter(NONBLOCKY_IMAGE_CHAPTER) - }; - - let writer = ImageLayerWriter { - conf, - timelineid, - tenantid, - seg, - lsn, - num_blocks, - page_image_writer: chapter, - num_blocks_written: 0, - }; - - Ok(writer) - } - - /// - /// Write next page image to the file. - /// - /// The page versions must be appended in blknum order. - /// - pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> { - assert!(self.num_blocks_written < self.num_blocks); - if self.seg.rel.is_blocky() { - assert_eq!(block_bytes.len(), BLOCK_SIZE); - } - self.page_image_writer.write_all(block_bytes)?; - self.num_blocks_written += 1; - Ok(()) - } - - pub fn finish(self) -> Result { - // Check that the `put_page_image' was called for every block. - assert!(self.num_blocks_written == self.num_blocks); - - // Close the page-images chapter - let book = self.page_image_writer.close()?; - - // Write out the summary chapter - let image_type = if self.seg.rel.is_blocky() { - ImageType::Blocky { - num_blocks: self.num_blocks, - } - } else { - ImageType::NonBlocky - }; - let mut chapter = book.new_chapter(SUMMARY_CHAPTER); - let summary = Summary { - tenantid: self.tenantid, - timelineid: self.timelineid, - seg: self.seg, - lsn: self.lsn, - }; - Summary::ser_into(&summary, &mut chapter)?; - let book = chapter.close()?; - - // This flushes the underlying 'buf_writer'. - book.close()?; - - // Note: Because we open the file in write-only mode, we cannot - // reuse the same VirtualFile for reading later. That's why we don't - // set inner.book here. The first read will have to re-open it. - let layer = ImageLayer { - path_or_conf: PathOrConf::Conf(self.conf), - timelineid: self.timelineid, - tenantid: self.tenantid, - seg: self.seg, - lsn: self.lsn, - inner: Mutex::new(ImageLayerInner { - book: None, - image_type, - }), - }; - trace!("created image layer {}", layer.path().display()); - - Ok(layer) - } -} diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs deleted file mode 100644 index 17b061b20e..0000000000 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ /dev/null @@ -1,805 +0,0 @@ -//! An in-memory layer stores recently received PageVersions. -//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited -//! and layers can be spilled to disk into ephemeral files. -//! -//! And there's another BTreeMap to track the size of the relation. -//! 
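When page versions are spilled to the ephemeral file, they are stored as length-prefixed records: a 4-byte native-endian length followed by the serialized PageVersion, with the in-memory maps keeping only the starting offset of each record (see read_pv_bytes and write_pv further down). The sketch below shows that framing over any file-like object with positional reads and writes; it is illustrative only, and unlike write_pv it writes the length up front rather than back-patching it after serialization.

use std::os::unix::fs::FileExt;

// Append one serialized record at `pos`; the caller remembers the returned
// offset (e.g. in a VecMap keyed by LSN) to find the record again later.
fn write_record<F: FileExt>(file: &F, pos: u64, payload: &[u8]) -> std::io::Result<u64> {
    // 4-byte native-endian length header, then the payload itself.
    file.write_all_at(&(payload.len() as u32).to_ne_bytes(), pos)?;
    file.write_all_at(payload, pos + 4)?;
    Ok(pos)
}

// Read back the record that starts at offset `off`.
fn read_record<F: FileExt>(file: &F, off: u64) -> std::io::Result<Vec<u8>> {
    let mut lenbuf = [0u8; 4];
    file.read_exact_at(&mut lenbuf, off)?;
    let len = u32::from_ne_bytes(lenbuf) as usize;
    let mut buf = vec![0u8; len];
    file.read_exact_at(&mut buf, off + 4)?;
    Ok(buf)
}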
-use crate::config::PageServerConf; -use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; -use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::filename::DeltaFileName; -use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter}; -use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, - RELISH_SEG_SIZE, -}; -use crate::layered_repository::LayeredTimeline; -use crate::layered_repository::ZERO_PAGE; -use crate::repository::ZenithWalRecord; -use crate::{ZTenantId, ZTimelineId}; -use anyhow::{ensure, Result}; -use bytes::Bytes; -use log::*; -use std::collections::HashMap; -use std::io::Seek; -use std::os::unix::fs::FileExt; -use std::path::PathBuf; -use std::sync::{Arc, RwLock}; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::vec_map::VecMap; - -pub struct InMemoryLayer { - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - seg: SegmentTag, - - /// - /// This layer contains all the changes from 'start_lsn'. The - /// start is inclusive. - /// - start_lsn: Lsn, - - /// - /// LSN of the oldest page version stored in this layer. - /// - /// This is different from 'start_lsn' in that we enforce that the 'start_lsn' - /// of a layer always matches the 'end_lsn' of its predecessor, even if there - /// are no page versions until at a later LSN. That way you can detect any - /// missing layer files more easily. 'oldest_lsn' is the first page version - /// actually stored in this layer. In the range between 'start_lsn' and - /// 'oldest_lsn', there are no changes to the segment. - /// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should - /// point to the beginning of WAL record. This is the other difference with 'start_lsn' - /// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'. - /// - oldest_lsn: Lsn, - - /// The above fields never change. The parts that do change are in 'inner', - /// and protected by mutex. - inner: RwLock, - - /// Predecessor layer might be needed? - incremental: bool, -} - -pub struct InMemoryLayerInner { - /// Frozen layers have an exclusive end LSN. - /// Writes are only allowed when this is None - end_lsn: Option, - - /// If this relation was dropped, remember when that happened. - /// The drop LSN is recorded in [`end_lsn`]. - dropped: bool, - - /// The PageVersion structs are stored in a serialized format in this file. - /// Each serialized PageVersion is preceded by a 'u32' length field. - /// 'page_versions' map stores offsets into this file. - file: EphemeralFile, - - /// Metadata about all versions of all pages in the layer is kept - /// here. Indexed by block number and LSN. The value is an offset - /// into the ephemeral file where the page version is stored. - page_versions: HashMap>, - - /// - /// `seg_sizes` tracks the size of the segment at different points in time. - /// - /// For a blocky rel, there is always one entry, at the layer's start_lsn, - /// so that determining the size never depends on the predecessor layer. For - /// a non-blocky rel, 'seg_sizes' is not used and is always empty. - /// - seg_sizes: VecMap, - - /// - /// LSN of the newest page version stored in this layer. - /// - /// The difference between 'end_lsn' and 'latest_lsn' is the same as between - /// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'. 
- /// - latest_lsn: Lsn, -} - -impl InMemoryLayerInner { - fn assert_writeable(&self) { - assert!(self.end_lsn.is_none()); - } - - fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { - // Scan the BTreeMap backwards, starting from the given entry. - let slice = self.seg_sizes.slice_range(..=lsn); - - // We make sure there is always at least one entry - if let Some((_entry_lsn, entry)) = slice.last() { - *entry - } else { - panic!("could not find seg size in in-memory layer"); - } - } - - /// - /// Read a page version from the ephemeral file. - /// - fn read_pv(&self, off: u64) -> Result { - let mut buf = Vec::new(); - self.read_pv_bytes(off, &mut buf)?; - Ok(PageVersion::des(&buf)?) - } - - /// - /// Read a page version from the ephemeral file, as raw bytes, at - /// the given offset. The bytes are read into 'buf', which is - /// expanded if necessary. Returns the size of the page version. - /// - fn read_pv_bytes(&self, off: u64, buf: &mut Vec) -> Result { - // read length - let mut lenbuf = [0u8; 4]; - self.file.read_exact_at(&mut lenbuf, off)?; - let len = u32::from_ne_bytes(lenbuf) as usize; - - if buf.len() < len { - buf.resize(len, 0); - } - self.file.read_exact_at(&mut buf[0..len], off + 4)?; - Ok(len) - } - - fn write_pv(&mut self, pv: &PageVersion) -> Result { - // remember starting position - let pos = self.file.stream_position()?; - - // make room for the 'length' field by writing zeros as a placeholder. - self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap(); - - pv.ser_into(&mut self.file).unwrap(); - - // write the 'length' field. - let len = self.file.stream_position()? - pos - 4; - let lenbuf = u32::to_ne_bytes(len as u32); - self.file.write_all_at(&lenbuf, pos)?; - - Ok(pos) - } -} - -impl Layer for InMemoryLayer { - // An in-memory layer can be spilled to disk into ephemeral file, - // This function is used only for debugging, so we don't need to be very precise. - // Construct a filename as if it was a delta layer. - fn filename(&self) -> PathBuf { - let inner = self.inner.read().unwrap(); - - let end_lsn; - if let Some(drop_lsn) = inner.end_lsn { - end_lsn = drop_lsn; - } else { - end_lsn = Lsn(u64::MAX); - } - - let delta_filename = DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn, - dropped: inner.dropped, - } - .to_string(); - - PathBuf::from(format!("inmem-{}", delta_filename)) - } - - fn get_tenant_id(&self) -> ZTenantId { - self.tenantid - } - - fn get_timeline_id(&self) -> ZTimelineId { - self.timelineid - } - - fn get_seg_tag(&self) -> SegmentTag { - self.seg - } - - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { - let inner = self.inner.read().unwrap(); - - if let Some(end_lsn) = inner.end_lsn { - end_lsn - } else { - Lsn(u64::MAX) - } - } - - fn is_dropped(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.dropped - } - - /// Look up given page in the cache. - fn get_page_reconstruct_data( - &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { - let mut need_image = true; - - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - - { - let inner = self.inner.read().unwrap(); - - // Scan the page versions backwards, starting from `lsn`. 
- if let Some(vec_map) = inner.page_versions.get(&blknum) { - let slice = vec_map.slice_range(..=lsn); - for (entry_lsn, pos) in slice.iter().rev() { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} - } - - let pv = inner.read_pv(*pos)?; - match pv { - PageVersion::Page(img) => { - reconstruct_data.page_img = Some((*entry_lsn, img)); - need_image = false; - break; - } - PageVersion::Wal(rec) => { - reconstruct_data.records.push((*entry_lsn, rec.clone())); - if rec.will_init() { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= self.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - - // release lock on 'inner' - } - - // If an older page image is needed to reconstruct the page, let the - // caller know - if need_image { - if self.incremental { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) - } else { - Ok(PageReconstructResult::Missing(self.start_lsn)) - } - } else { - Ok(PageReconstructResult::Complete) - } - } - - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); - - let inner = self.inner.read().unwrap(); - Ok(inner.get_seg_size(lsn)) - } - - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { - let inner = self.inner.read().unwrap(); - - // If the segment created after requested LSN, - // it doesn't exist in the layer. But we shouldn't - // have requested it in the first place. - assert!(lsn >= self.start_lsn); - - // Is the requested LSN after the segment was dropped? - if inner.dropped { - if let Some(end_lsn) = inner.end_lsn { - if lsn >= end_lsn { - return Ok(false); - } - } else { - panic!("dropped in-memory layer with no end LSN"); - } - } - - // Otherwise, it exists - Ok(true) - } - - /// Cannot unload anything in an in-memory layer, since there's no backing - /// store. To release memory used by an in-memory layer, use 'freeze' to turn - /// it into an on-disk layer. - fn unload(&self) -> Result<()> { - Ok(()) - } - - /// Nothing to do here. When you drop the last reference to the layer, it will - /// be deallocated. 
- fn delete(&self) -> Result<()> { - panic!("can't delete an InMemoryLayer") - } - - fn is_incremental(&self) -> bool { - self.incremental - } - - fn is_in_memory(&self) -> bool { - true - } - - /// debugging function to print out the contents of the layer - fn dump(&self) -> Result<()> { - let inner = self.inner.read().unwrap(); - - let end_str = inner - .end_lsn - .as_ref() - .map(Lsn::to_string) - .unwrap_or_default(); - - println!( - "----- in-memory layer for tli {} seg {} {}-{} {} ----", - self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, - ); - - for (k, v) in inner.seg_sizes.as_slice() { - println!("seg_sizes {}: {}", k, v); - } - - // List the blocks in order - let mut page_versions: Vec<(&SegmentBlk, &VecMap)> = - inner.page_versions.iter().collect(); - page_versions.sort_by_key(|k| k.0); - - for (blknum, versions) in page_versions { - for (lsn, off) in versions.as_slice() { - let pv = inner.read_pv(*off); - let pv_description = match pv { - Ok(PageVersion::Page(_img)) => "page", - Ok(PageVersion::Wal(_rec)) => "wal", - Err(_err) => "INVALID", - }; - - println!("blk {} at {}: {}\n", blknum, lsn, pv_description); - } - } - - Ok(()) - } -} - -/// A result of an inmemory layer data being written to disk. -pub struct LayersOnDisk { - pub delta_layers: Vec, - pub image_layers: Vec, -} - -impl InMemoryLayer { - /// Return the oldest page version that's stored in this layer - pub fn get_oldest_lsn(&self) -> Lsn { - self.oldest_lsn - } - - pub fn get_latest_lsn(&self) -> Lsn { - let inner = self.inner.read().unwrap(); - inner.latest_lsn - } - - /// - /// Create a new, empty, in-memory layer - /// - pub fn create( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Result { - trace!( - "initializing new empty InMemoryLayer for writing {} on timeline {} at {}", - seg, - timelineid, - start_lsn - ); - - // The segment is initially empty, so initialize 'seg_sizes' with 0. - let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - seg_sizes.append(start_lsn, 0).unwrap(); - } - - let file = EphemeralFile::create(conf, tenantid, timelineid)?; - - Ok(InMemoryLayer { - conf, - timelineid, - tenantid, - seg, - start_lsn, - oldest_lsn, - incremental: false, - inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, - dropped: false, - file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, - }), - }) - } - - // Write operations - - /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record( - &self, - lsn: Lsn, - blknum: SegmentBlk, - rec: ZenithWalRecord, - ) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Wal(rec)) - } - - /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Page(img)) - } - - /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
- /// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - - trace!( - "put_page_version blk {} of {} at {}/{}", - blknum, - self.seg.rel, - self.timelineid, - lsn - ); - let mut inner = self.inner.write().unwrap(); - - inner.assert_writeable(); - assert!(lsn >= inner.latest_lsn); - inner.latest_lsn = lsn; - - // Write the page version to the file, and remember its offset in 'page_versions' - { - let off = inner.write_pv(&pv)?; - let vec_map = inner.page_versions.entry(blknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!( - "Page version of rel {} blk {} at {} already exists", - self.seg.rel, blknum, lsn - ); - } - } - - // Also update the relation size, if this extended the relation. - if self.seg.rel.is_blocky() { - let newsize = blknum + 1; - - // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, - // which we've just acquired above - let oldsize = inner.get_seg_size(lsn); - if newsize > oldsize { - trace!( - "enlarging segment {} from {} to {} blocks at {}", - self.seg, - oldsize, - newsize, - lsn - ); - - // If we are extending the relation by more than one page, initialize the "gap" - // with zeros - // - // XXX: What if the caller initializes the gap with subsequent call with same LSN? - // I don't think that can happen currently, but that is highly dependent on how - // PostgreSQL writes its WAL records and there's no guarantee of it. If it does - // happen, we would hit the "page version already exists" warning above on the - // subsequent call to initialize the gap page. - for gapblknum in oldsize..blknum { - let zeropv = PageVersion::Page(ZERO_PAGE.clone()); - trace!( - "filling gap blk {} with zeros for write of {}", - gapblknum, - blknum - ); - - // Write the page version to the file, and remember its offset in - // 'page_versions' - { - let off = inner.write_pv(&zeropv)?; - let vec_map = inner.page_versions.entry(gapblknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - warn!( - "Page version of seg {} blk {} at {} already exists", - self.seg, gapblknum, lsn - ); - } - } - } - - inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap(); - return Ok(newsize - oldsize); - } - } - - Ok(0) - } - - /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) { - assert!( - self.seg.rel.is_blocky(), - "put_truncation() called on a non-blocky rel" - ); - - let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - - // check that this we truncate to a smaller size than segment was before the truncation - let old_size = inner.get_seg_size(lsn); - assert!(new_size < old_size); - - let (old, _delta_size) = inner - .seg_sizes - .append_or_update_last(lsn, new_size) - .unwrap(); - - if old.is_some() { - // We already had an entry for this LSN. That's odd.. 
- warn!("Inserting truncation, but had an entry for the LSN already"); - } - } - - /// Remember that the segment was dropped at given LSN - pub fn drop_segment(&self, lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); - - assert!(inner.end_lsn.is_none()); - assert!(!inner.dropped); - inner.dropped = true; - assert!(self.start_lsn < lsn); - inner.end_lsn = Some(lsn); - - trace!("dropped segment {} at {}", self.seg, lsn); - } - - /// - /// Initialize a new InMemoryLayer for, by copying the state at the given - /// point in time from given existing layer. - /// - pub fn create_successor_layer( - conf: &'static PageServerConf, - src: Arc, - timelineid: ZTimelineId, - tenantid: ZTenantId, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Result { - let seg = src.get_seg_tag(); - - assert!(oldest_lsn.is_aligned()); - - trace!( - "initializing new InMemoryLayer for writing {} on timeline {} at {}", - seg, - timelineid, - start_lsn, - ); - - // Copy the segment size at the start LSN from the predecessor layer. - let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - let size = src.get_seg_size(start_lsn)?; - seg_sizes.append(start_lsn, size).unwrap(); - } - - let file = EphemeralFile::create(conf, tenantid, timelineid)?; - - Ok(InMemoryLayer { - conf, - timelineid, - tenantid, - seg, - start_lsn, - oldest_lsn, - incremental: true, - inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, - dropped: false, - file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, - }), - }) - } - - pub fn is_writeable(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.end_lsn.is_none() - } - - /// Make the layer non-writeable. Only call once. - /// Records the end_lsn for non-dropped layers. - /// `end_lsn` is inclusive - pub fn freeze(&self, end_lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); - - if inner.end_lsn.is_some() { - assert!(inner.dropped); - } else { - assert!(!inner.dropped); - assert!(self.start_lsn < end_lsn + 1); - inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); - - if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() { - assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); - } - - for (_blk, vec_map) in inner.page_versions.iter() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn <= end_lsn); - } - } - } - } - - /// Write the this frozen in-memory layer to disk. - /// - /// Returns new layers that replace this one. - /// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions - /// at the `end_lsn`. Can also return a DeltaLayer that includes all the - /// WAL records between start and end LSN. (The delta layer is not needed - /// when a new relish is created with a single LSN, so that the start and - /// end LSN are the same.) - pub fn write_to_disk( - &self, - timeline: &LayeredTimeline, - reconstruct_pages: bool, - ) -> Result { - trace!( - "write_to_disk {} get_end_lsn is {}", - self.filename().display(), - self.get_end_lsn() - ); - - // Grab the lock in read-mode. We hold it over the I/O, but because this - // layer is not writeable anymore, no one should be trying to acquire the - // write lock on it, so we shouldn't block anyone. There's one exception - // though: another thread might have grabbed a reference to this layer - // in `get_layer_for_write' just before the checkpointer called - // `freeze`, and then `write_to_disk` on it. When the thread gets the - // lock, it will see that it's not writeable anymore and retry, but it - // would have to wait until we release it. 
That race condition is very - // rare though, so we just accept the potential latency hit for now. - let inner = self.inner.read().unwrap(); - - // Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN - // that is included. - let end_lsn_exclusive = inner.end_lsn.unwrap(); - let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1); - - // Figure out if we should create a delta layer, image layer, or both. - let image_lsn: Option; - let delta_end_lsn: Option; - if self.is_dropped() || !reconstruct_pages { - // The segment was dropped. Create just a delta layer containing all the - // changes up to and including the drop. - delta_end_lsn = Some(end_lsn_exclusive); - image_lsn = None; - } else if self.start_lsn == end_lsn_inclusive { - // The layer contains exactly one LSN. It's enough to write an image - // layer at that LSN. - delta_end_lsn = None; - image_lsn = Some(end_lsn_inclusive); - } else { - // Create a delta layer with all the changes up to the end LSN, - // and an image layer at the end LSN. - // - // Note that we the delta layer does *not* include the page versions - // at the end LSN. They are included in the image layer, and there's - // no need to store them twice. - delta_end_lsn = Some(end_lsn_inclusive); - image_lsn = Some(end_lsn_inclusive); - } - - let mut delta_layers = Vec::new(); - let mut image_layers = Vec::new(); - - if let Some(delta_end_lsn) = delta_end_lsn { - let mut delta_layer_writer = DeltaLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - self.start_lsn, - delta_end_lsn, - self.is_dropped(), - )?; - - // Write all page versions, in block + LSN order - let mut buf: Vec = Vec::new(); - - let pv_iter = inner.page_versions.iter(); - let mut pages: Vec<(&SegmentBlk, &VecMap)> = pv_iter.collect(); - pages.sort_by_key(|(blknum, _vec_map)| *blknum); - for (blknum, vec_map) in pages { - for (lsn, pos) in vec_map.as_slice() { - if *lsn < delta_end_lsn { - let len = inner.read_pv_bytes(*pos, &mut buf)?; - delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?; - } - } - } - - // Create seg_sizes - let seg_sizes = if delta_end_lsn == end_lsn_exclusive { - inner.seg_sizes.clone() - } else { - inner.seg_sizes.split_at(&end_lsn_exclusive).0 - }; - - let delta_layer = delta_layer_writer.finish(seg_sizes)?; - delta_layers.push(delta_layer); - } - - drop(inner); - - // Write a new base image layer at the cutoff point - if let Some(image_lsn) = image_lsn { - let size = if self.seg.rel.is_blocky() { - self.get_seg_size(image_lsn)? - } else { - 1 - }; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - image_lsn, - size, - )?; - - for blknum in 0..size { - let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?; - - image_layer_writer.put_page_image(&img)?; - } - let image_layer = image_layer_writer.finish()?; - image_layers.push(image_layer); - } - - Ok(LayersOnDisk { - delta_layers, - image_layers, - }) - } -} diff --git a/pageserver/src/layered_repository/interval_tree.rs b/pageserver/src/layered_repository/interval_tree.rs deleted file mode 100644 index 978ecd837e..0000000000 --- a/pageserver/src/layered_repository/interval_tree.rs +++ /dev/null @@ -1,468 +0,0 @@ -/// -/// IntervalTree is data structure for holding intervals. 
It is generic -/// to make unit testing possible, but the only real user of it is the layer map, -/// -/// It's inspired by the "segment tree" or a "statistic tree" as described in -/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold -/// the points instead of a binary tree. This is called an "interval tree" instead -/// of "segment tree" because the term "segment" is already using Zenith to mean -/// something else. To add to the confusion, there is another data structure known -/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree), -/// for storing intervals, but this isn't that. -/// -/// The basic idea is to have a B-tree of "interesting Points". At each Point, -/// there is a list of intervals that contain the point. The Points are formed -/// from the start bounds of each interval; there is a Point for each distinct -/// start bound. -/// -/// Operations: -/// -/// To find intervals that contain a given point, you search the b-tree to find -/// the nearest Point <= search key. Then you just return the list of intervals. -/// -/// To insert an interval, find the Point with start key equal to the inserted item. -/// If the Point doesn't exist yet, create it, by copying all the items from the -/// previous Point that cover the new Point. Then walk right, inserting the new -/// interval to all the Points that are contained by the new interval (including the -/// newly created Point). -/// -/// To remove an interval, you scan the tree for all the Points that are contained by -/// the removed interval, and remove it from the list in each Point. -/// -/// Requirements and assumptions: -/// -/// - Can store overlapping items -/// - But there are not many overlapping items -/// - The interval bounds don't change after it is added to the tree -/// - Intervals are uniquely identified by pointer equality. You must not be insert the -/// same interval object twice, and `remove` uses pointer equality to remove the right -/// interval. It is OK to have two intervals with the same bounds, however. -/// -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::ops::Range; -use std::sync::Arc; - -pub struct IntervalTree -where - I: IntervalItem, -{ - points: BTreeMap>, -} - -struct Point { - /// All intervals that contain this point, in no particular order. - /// - /// We assume that there aren't a lot of overlappingg intervals, so that this vector - /// never grows very large. If that assumption doesn't hold, we could keep this ordered - /// by the end bound, to speed up `search`. But as long as there are only a few elements, - /// a linear search is OK. - elements: Vec>, -} - -/// Abstraction for an interval that can be stored in the tree -/// -/// The start bound is inclusive and the end bound is exclusive. End must be greater -/// than start. -pub trait IntervalItem { - type Key: Ord + Copy + Debug + Sized; - - fn start_key(&self) -> Self::Key; - fn end_key(&self) -> Self::Key; - - fn bounds(&self) -> Range { - self.start_key()..self.end_key() - } -} - -impl IntervalTree -where - I: IntervalItem, -{ - /// Return an element that contains 'key', or precedes it. - /// - /// If there are multiple candidates, returns the one with the highest 'end' key. - pub fn search(&self, key: I::Key) -> Option> { - // Find the greatest point that precedes or is equal to the search key. If there is - // none, returns None. 
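        //
        // BTreeMap::range(..=key) yields the points with key <= `key` in ascending
        // order, so next_back() is the greatest such point. For example, if the
        // tree holds items [10, 20) and [15, 30), then search(17) lands on the
        // Point at 15, whose element list contains both items, and returns
        // [15, 30) because it has the higher end key.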
- let (_, p) = self.points.range(..=key).next_back()?; - - // Find the element with the highest end key at this point - let highest_item = p - .elements - .iter() - .reduce(|a, b| { - // starting with Rust 1.53, could use `std::cmp::min_by_key` here - if a.end_key() > b.end_key() { - a - } else { - b - } - }) - .unwrap(); - Some(Arc::clone(highest_item)) - } - - /// Iterate over all items with start bound >= 'key' - pub fn iter_newer(&self, key: I::Key) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(key..), - elem_iter: None, - } - } - - /// Iterate over all items - pub fn iter(&self) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(..), - elem_iter: None, - } - } - - pub fn insert(&mut self, item: Arc) { - let start_key = item.start_key(); - let end_key = item.end_key(); - assert!(start_key < end_key); - let bounds = start_key..end_key; - - // Find the starting point and walk forward from there - let mut found_start_point = false; - let iter = self.points.range_mut(bounds); - for (point_key, point) in iter { - if *point_key == start_key { - found_start_point = true; - // It is an error to insert the same item to the tree twice. - assert!( - !point.elements.iter().any(|x| Arc::ptr_eq(x, &item)), - "interval is already in the tree" - ); - } - point.elements.push(Arc::clone(&item)); - } - if !found_start_point { - // Create a new Point for the starting point - - // Look at the previous point, and copy over elements that overlap with this - // new point - let mut new_elements: Vec> = Vec::new(); - if let Some((_, prev_point)) = self.points.range(..start_key).next_back() { - let overlapping_prev_elements = prev_point - .elements - .iter() - .filter(|x| x.bounds().contains(&start_key)) - .cloned(); - - new_elements.extend(overlapping_prev_elements); - } - new_elements.push(item); - - let new_point = Point { - elements: new_elements, - }; - self.points.insert(start_key, new_point); - } - } - - pub fn remove(&mut self, item: &Arc) { - // range search points - let start_key = item.start_key(); - let end_key = item.end_key(); - let bounds = start_key..end_key; - - let mut points_to_remove: Vec = Vec::new(); - let mut found_start_point = false; - for (point_key, point) in self.points.range_mut(bounds) { - if *point_key == start_key { - found_start_point = true; - } - let len_before = point.elements.len(); - point.elements.retain(|other| !Arc::ptr_eq(other, item)); - let len_after = point.elements.len(); - assert_eq!(len_after + 1, len_before); - if len_after == 0 { - points_to_remove.push(*point_key); - } - } - assert!(found_start_point); - - for k in points_to_remove { - self.points.remove(&k).unwrap(); - } - } -} - -pub struct IntervalIter<'a, I: ?Sized> -where - I: IntervalItem, -{ - point_iter: std::collections::btree_map::Range<'a, I::Key, Point>, - elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc>)>, -} - -impl<'a, I> Iterator for IntervalIter<'a, I> -where - I: IntervalItem + ?Sized, -{ - type Item = Arc; - - fn next(&mut self) -> Option { - // Iterate over all elements in all the points in 'point_iter'. To avoid - // returning the same element twice, we only return each element at its - // starting point. - loop { - // Return next remaining element from the current point - if let Some((point_key, elem_iter)) = &mut self.elem_iter { - for elem in elem_iter { - if elem.start_key() == *point_key { - return Some(Arc::clone(elem)); - } - } - } - // No more elements at this point. Move to next point. 
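            // (An element that spans several points, say [10, 30) with points at
            // 10, 15 and 20, is present in the element list of all three points,
            // but is only yielded once, at its starting point 10.)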
- if let Some((point_key, point)) = self.point_iter.next() { - self.elem_iter = Some((*point_key, point.elements.iter())); - continue; - } else { - // No more points, all done - return None; - } - } - } -} - -impl Default for IntervalTree -where - I: IntervalItem, -{ - fn default() -> Self { - IntervalTree { - points: BTreeMap::new(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fmt; - - #[derive(Debug)] - struct MockItem { - start_key: u32, - end_key: u32, - val: String, - } - impl IntervalItem for MockItem { - type Key = u32; - - fn start_key(&self) -> u32 { - self.start_key - } - fn end_key(&self) -> u32 { - self.end_key - } - } - impl MockItem { - fn new(start_key: u32, end_key: u32) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}", start_key, end_key), - } - } - fn new_str(start_key: u32, end_key: u32, val: &str) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}: {}", start_key, end_key, val), - } - } - } - impl fmt::Display for MockItem { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.val) - } - } - #[rustfmt::skip] - fn assert_search( - tree: &IntervalTree, - key: u32, - expected: &[&str], - ) -> Option> { - if let Some(v) = tree.search(key) { - let vstr = v.to_string(); - - assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v); - assert!( - expected.contains(&vstr.as_str()), - "search with {} returned {}, expected one of: {:?}", - key, v, expected, - ); - - Some(v) - } else { - assert!( - expected.is_empty(), - "search with {} returned None, expected one of {:?}", - key, expected - ); - None - } - } - - fn assert_contents(tree: &IntervalTree, expected: &[&str]) { - let mut contents: Vec = tree.iter().map(|e| e.to_string()).collect(); - contents.sort(); - assert_eq!(contents, expected); - } - - fn dump_tree(tree: &IntervalTree) { - for (point_key, point) in tree.points.iter() { - print!("{}:", point_key); - for e in point.elements.iter() { - print!(" {}", e); - } - println!(); - } - } - - #[test] - fn test_interval_tree_simple() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Simple, non-overlapping ranges. 
- tree.insert(Arc::new(MockItem::new(10, 11))); - tree.insert(Arc::new(MockItem::new(11, 12))); - tree.insert(Arc::new(MockItem::new(12, 13))); - tree.insert(Arc::new(MockItem::new(18, 19))); - tree.insert(Arc::new(MockItem::new(17, 18))); - tree.insert(Arc::new(MockItem::new(15, 16))); - - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &["10-11"]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["12-13"]); - assert_search(&tree, 13, &["12-13"]); - assert_search(&tree, 14, &["12-13"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 16, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["18-19"]); - assert_search(&tree, 19, &["18-19"]); - assert_search(&tree, 20, &["18-19"]); - - // remove a few entries and search around them again - tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry - tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle - tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &[]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["11-12"]); - assert_search(&tree, 14, &["11-12"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["17-18"]); - } - - #[test] - fn test_interval_tree_overlap() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Overlapping items - tree.insert(Arc::new(MockItem::new(22, 24))); - tree.insert(Arc::new(MockItem::new(23, 25))); - let x24_26 = Arc::new(MockItem::new(24, 26)); - tree.insert(Arc::clone(&x24_26)); - let x26_28 = Arc::new(MockItem::new(26, 28)); - tree.insert(Arc::clone(&x26_28)); - tree.insert(Arc::new(MockItem::new(25, 27))); - - assert_search(&tree, 22, &["22-24"]); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25", "24-26"]); - assert_search(&tree, 25, &["24-26", "25-27"]); - assert_search(&tree, 26, &["25-27", "26-28"]); - assert_search(&tree, 27, &["26-28"]); - assert_search(&tree, 28, &["26-28"]); - assert_search(&tree, 29, &["26-28"]); - - tree.remove(&x24_26); - tree.remove(&x26_28); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25"]); - assert_search(&tree, 25, &["25-27"]); - assert_search(&tree, 26, &["25-27"]); - assert_search(&tree, 27, &["25-27"]); - assert_search(&tree, 28, &["25-27"]); - assert_search(&tree, 29, &["25-27"]); - } - - #[test] - fn test_interval_tree_nested() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Items containing other items - tree.insert(Arc::new(MockItem::new(31, 39))); - tree.insert(Arc::new(MockItem::new(32, 34))); - tree.insert(Arc::new(MockItem::new(33, 35))); - tree.insert(Arc::new(MockItem::new(30, 40))); - - assert_search(&tree, 30, &["30-40"]); - assert_search(&tree, 31, &["30-40", "31-39"]); - assert_search(&tree, 32, &["30-40", "32-34", "31-39"]); - assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]); - assert_search(&tree, 34, &["30-40", "33-35", "31-39"]); - assert_search(&tree, 35, &["30-40", "31-39"]); - assert_search(&tree, 36, &["30-40", "31-39"]); - assert_search(&tree, 37, &["30-40", "31-39"]); - assert_search(&tree, 38, &["30-40", "31-39"]); - assert_search(&tree, 39, &["30-40"]); - assert_search(&tree, 40, &["30-40"]); - assert_search(&tree, 41, &["30-40"]); - } - - #[test] - fn test_interval_tree_duplicates() { - let mut tree: IntervalTree = IntervalTree::default(); - - // 
Duplicate keys - let item_a = Arc::new(MockItem::new_str(55, 56, "a")); - tree.insert(Arc::clone(&item_a)); - let item_b = Arc::new(MockItem::new_str(55, 56, "b")); - tree.insert(Arc::clone(&item_b)); - let item_c = Arc::new(MockItem::new_str(55, 56, "c")); - tree.insert(Arc::clone(&item_c)); - let item_d = Arc::new(MockItem::new_str(54, 56, "d")); - tree.insert(Arc::clone(&item_d)); - let item_e = Arc::new(MockItem::new_str(55, 57, "e")); - tree.insert(Arc::clone(&item_e)); - - dump_tree(&tree); - - assert_search( - &tree, - 55, - &["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"], - ); - tree.remove(&item_b); - dump_tree(&tree); - - assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]); - - tree.remove(&item_d); - dump_tree(&tree); - assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]); - } - - #[test] - #[should_panic] - fn test_interval_tree_insert_twice() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Inserting the same item twice is not cool - let item = Arc::new(MockItem::new(1, 2)); - tree.insert(Arc::clone(&item)); - tree.insert(Arc::clone(&item)); // fails assertion - } -} diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs deleted file mode 100644 index fe82fd491c..0000000000 --- a/pageserver/src/layered_repository/layer_map.rs +++ /dev/null @@ -1,494 +0,0 @@ -//! -//! The layer map tracks what layers exist for all the relishes in a timeline. -//! -//! When the timeline is first accessed, the server lists of all layer files -//! in the timelines/ directory, and populates this map with -//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL -//! is received, we create InMemoryLayers to hold the incoming records. Now and -//! then, in the checkpoint() function, the in-memory layers are frozen, forming -//! new image and delta layers and corresponding files are written to disk. -//! - -use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree}; -use crate::layered_repository::storage_layer::{Layer, SegmentTag}; -use crate::layered_repository::InMemoryLayer; -use crate::relish::*; -use anyhow::Result; -use lazy_static::lazy_static; -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; -use std::sync::Arc; -use zenith_metrics::{register_int_gauge, IntGauge}; -use zenith_utils::lsn::Lsn; - -use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP}; - -lazy_static! { - static ref NUM_INMEMORY_LAYERS: IntGauge = - register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory") - .expect("failed to define a metric"); - static ref NUM_ONDISK_LAYERS: IntGauge = - register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") - .expect("failed to define a metric"); -} - -/// -/// LayerMap tracks what layers exist on a timeline. -/// -#[derive(Default)] -pub struct LayerMap { - /// All the layers keyed by segment tag - segs: HashMap, - - /// All in-memory layers, ordered by 'oldest_lsn' and generation - /// of each layer. This allows easy access to the in-memory layer that - /// contains the oldest WAL record. - open_layers: BinaryHeap, - - /// Generation number, used to distinguish newly inserted entries in the - /// binary heap from older entries during checkpoint. - current_generation: u64, -} - -impl LayerMap { - /// - /// Look up a layer using the given segment tag and LSN. 
This differs from a - /// plain key-value lookup in that if there is any layer that covers the - /// given LSN, or precedes the given LSN, it is returned. In other words, - /// you don't need to know the exact start LSN of the layer. - /// - pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option> { - let segentry = self.segs.get(tag)?; - - segentry.get(lsn) - } - - /// - /// Get the open layer for given segment for writing. Or None if no open - /// layer exists. - /// - pub fn get_open(&self, tag: &SegmentTag) -> Option> { - let segentry = self.segs.get(tag)?; - - segentry - .open_layer_id - .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id)) - } - - /// - /// Insert an open in-memory layer - /// - pub fn insert_open(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - - let layer_id = segentry.update_open(Arc::clone(&layer)); - - let oldest_lsn = layer.get_oldest_lsn(); - - // After a crash and restart, 'oldest_lsn' of the oldest in-memory - // layer becomes the WAL streaming starting point, so it better not point - // in the middle of a WAL record. - assert!(oldest_lsn.is_aligned()); - - // Also add it to the binary heap - let open_layer_entry = OpenLayerEntry { - oldest_lsn: layer.get_oldest_lsn(), - layer_id, - generation: self.current_generation, - }; - self.open_layers.push(open_layer_entry); - - NUM_INMEMORY_LAYERS.inc(); - } - - /// Remove an open in-memory layer - pub fn remove_open(&mut self, layer_id: LayerId) { - // Note: we don't try to remove the entry from the binary heap. - // It will be removed lazily by peek_oldest_open() when it's made it to - // the top of the heap. - - let layer_opt = { - let mut global_map = GLOBAL_LAYER_MAP.write().unwrap(); - let layer_opt = global_map.get(&layer_id); - global_map.remove(&layer_id); - // TODO it's bad that a ref can still exist after being evicted from cache - layer_opt - }; - - if let Some(layer) = layer_opt { - let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap(); - - if segentry.open_layer_id == Some(layer_id) { - // Also remove it from the SegEntry of this segment - segentry.open_layer_id = None; - } else { - // We could have already updated segentry.open for - // dropped (non-writeable) layer. This is fine. - assert!(!layer.is_writeable()); - assert!(layer.is_dropped()); - } - - NUM_INMEMORY_LAYERS.dec(); - } - } - - /// - /// Insert an on-disk layer - /// - pub fn insert_historic(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - segentry.insert_historic(layer); - - NUM_ONDISK_LAYERS.inc(); - } - - /// - /// Remove an on-disk layer from the map. - /// - /// This should be called when the corresponding file on disk has been deleted. - /// - pub fn remove_historic(&mut self, layer: Arc) { - let tag = layer.get_seg_tag(); - - if let Some(segentry) = self.segs.get_mut(&tag) { - segentry.historic.remove(&layer); - } - NUM_ONDISK_LAYERS.dec(); - } - - // List relations along with a flag that marks if they exist at the given lsn. - // spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases. - // Pass Tag if we're only interested in some relations. 
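    // For example, a filter with spcnode == 0 and dbnode == 0 matches every
    // relation, while spcnode == 1663 (pg_default) and dbnode == 0 matches all
    // relations in the default tablespace, regardless of database.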
- pub fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - let mut rels: HashMap = HashMap::new(); - - for (seg, segentry) in self.segs.iter() { - match seg.rel { - RelishTag::Relation(reltag) => { - if let Some(request_rel) = tag { - if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode) - && (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode) - { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - _ => { - if tag == None { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - } - Ok(rels) - } - - /// Is there a newer image layer for given segment? - /// - /// This is used for garbage collection, to determine if an old layer can - /// be deleted. - /// We ignore segments newer than disk_consistent_lsn because they will be removed at restart - pub fn newer_image_layer_exists( - &self, - seg: SegmentTag, - lsn: Lsn, - disk_consistent_lsn: Lsn, - ) -> bool { - if let Some(segentry) = self.segs.get(&seg) { - segentry.newer_image_layer_exists(lsn, disk_consistent_lsn) - } else { - false - } - } - - /// Is there any layer for given segment that is alive at the lsn? - /// - /// This is a public wrapper for SegEntry fucntion, - /// used for garbage collection, to determine if some alive layer - /// exists at the lsn. If so, we shouldn't delete a newer dropped layer - /// to avoid incorrectly making it visible. - pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { - Ok(if let Some(segentry) = self.segs.get(&seg) { - segentry.exists_at_lsn(lsn)?.unwrap_or(false) - } else { - false - }) - } - - /// Return the oldest in-memory layer, along with its generation number. - pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc, u64)> { - let global_map = GLOBAL_LAYER_MAP.read().unwrap(); - - while let Some(oldest_entry) = self.open_layers.peek() { - if let Some(layer) = global_map.get(&oldest_entry.layer_id) { - return Some((oldest_entry.layer_id, layer, oldest_entry.generation)); - } else { - self.open_layers.pop(); - } - } - None - } - - /// Increment the generation number used to stamp open in-memory layers. Layers - /// added with `insert_open` after this call will be associated with the new - /// generation. Returns the new generation number. - pub fn increment_generation(&mut self) -> u64 { - self.current_generation += 1; - self.current_generation - } - - pub fn iter_historic_layers(&self) -> HistoricLayerIter { - HistoricLayerIter { - seg_iter: self.segs.iter(), - iter: None, - } - } - - /// debugging function to print out the contents of the layer map - #[allow(unused)] - pub fn dump(&self) -> Result<()> { - println!("Begin dump LayerMap"); - for (seg, segentry) in self.segs.iter() { - if let Some(open) = &segentry.open_layer_id { - if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) { - layer.dump()?; - } else { - println!("layer not found in global map"); - } - } - - for layer in segentry.historic.iter() { - layer.dump()?; - } - } - println!("End dump LayerMap"); - Ok(()) - } -} - -impl IntervalItem for dyn Layer { - type Key = Lsn; - - fn start_key(&self) -> Lsn { - self.get_start_lsn() - } - fn end_key(&self) -> Lsn { - self.get_end_lsn() - } -} - -/// -/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers -/// associated with the segment. 
-/// -/// The last layer that is open for writes is always an InMemoryLayer, -/// and is kept in a separate field, because there can be only one for -/// each segment. The older layers, stored on disk, are kept in an -/// IntervalTree. -#[derive(Default)] -struct SegEntry { - open_layer_id: Option, - historic: IntervalTree, -} - -impl SegEntry { - /// Does the segment exist at given LSN? - /// Return None if object is not found in this SegEntry. - fn exists_at_lsn(&self, lsn: Lsn) -> Result> { - if let Some(layer) = self.get(lsn) { - Ok(Some(layer.get_seg_exists(lsn)?)) - } else { - Ok(None) - } - } - - pub fn get(&self, lsn: Lsn) -> Option> { - if let Some(open_layer_id) = &self.open_layer_id { - let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?; - if open_layer.get_start_lsn() <= lsn { - return Some(open_layer); - } - } - - self.historic.search(lsn) - } - - pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool { - // We only check on-disk layers, because - // in-memory layers are not durable - - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. - self.historic - .iter_newer(lsn) - .any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1) - } - - // Set new open layer for a SegEntry. - // It's ok to rewrite previous open layer, - // but only if it is not writeable anymore. - pub fn update_open(&mut self, layer: Arc) -> LayerId { - if let Some(prev_open_layer_id) = &self.open_layer_id { - if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id) - { - assert!(!prev_open_layer.is_writeable()); - } - } - let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer); - self.open_layer_id = Some(open_layer_id); - open_layer_id - } - - pub fn insert_historic(&mut self, layer: Arc) { - self.historic.insert(layer); - } -} - -/// Entry held in LayerMap::open_layers, with boilerplate comparison routines -/// to implement a min-heap ordered by 'oldest_lsn' and 'generation' -/// -/// The generation number associated with each entry can be used to distinguish -/// recently-added entries (i.e after last call to increment_generation()) from older -/// entries with the same 'oldest_lsn'. -struct OpenLayerEntry { - oldest_lsn: Lsn, // copy of layer.get_oldest_lsn() - generation: u64, - layer_id: LayerId, -} -impl Ord for OpenLayerEntry { - fn cmp(&self, other: &Self) -> Ordering { - // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here - // to get that. 
Entries with identical oldest_lsn are ordered by generation - other - .oldest_lsn - .cmp(&self.oldest_lsn) - .then_with(|| other.generation.cmp(&self.generation)) - } -} -impl PartialOrd for OpenLayerEntry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} -impl PartialEq for OpenLayerEntry { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} -impl Eq for OpenLayerEntry {} - -/// Iterator returned by LayerMap::iter_historic_layers() -pub struct HistoricLayerIter<'a> { - seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>, - iter: Option>, -} - -impl<'a> Iterator for HistoricLayerIter<'a> { - type Item = Arc; - - fn next(&mut self) -> std::option::Option<::Item> { - loop { - if let Some(x) = &mut self.iter { - if let Some(x) = x.next() { - return Some(Arc::clone(&x)); - } - } - if let Some((_tag, segentry)) = self.seg_iter.next() { - self.iter = Some(segentry.historic.iter()); - continue; - } else { - return None; - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::config::PageServerConf; - use std::str::FromStr; - use zenith_utils::zid::{ZTenantId, ZTimelineId}; - - /// Arbitrary relation tag, for testing. - const TESTREL_A: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }); - - lazy_static! { - static ref DUMMY_TIMELINEID: ZTimelineId = - ZTimelineId::from_str("00000000000000000000000000000000").unwrap(); - static ref DUMMY_TENANTID: ZTenantId = - ZTenantId::from_str("00000000000000000000000000000000").unwrap(); - } - - /// Construct a dummy InMemoryLayer for testing - fn dummy_inmem_layer( - conf: &'static PageServerConf, - segno: u32, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Arc { - Arc::new( - InMemoryLayer::create( - conf, - *DUMMY_TIMELINEID, - *DUMMY_TENANTID, - SegmentTag { - rel: TESTREL_A, - segno, - }, - start_lsn, - oldest_lsn, - ) - .unwrap(), - ) - } - - #[test] - fn test_open_layers() -> Result<()> { - let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer")); - let conf = Box::leak(Box::new(conf)); - std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?; - - let mut layers = LayerMap::default(); - - let gen1 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100))); - layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200))); - layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120))); - layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110))); - - let gen2 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110))); - layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100))); - - // A helper function (closure) to pop the next oldest open entry from the layer map, - // and assert that it is what we'd expect - let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| { - let (layer_id, l, generation) = layers.peek_oldest_open().unwrap(); - assert!(l.get_seg_tag().segno == expected_segno); - assert!(generation == expected_generation); - layers.remove_open(layer_id); - }; - - assert_pop_layer(0, gen1); // 0x100 - assert_pop_layer(5, gen2); // 0x100 - assert_pop_layer(3, gen1); // 0x110 - assert_pop_layer(4, gen2); // 0x110 - assert_pop_layer(2, gen1); // 0x120 - assert_pop_layer(1, gen1); // 0x200 - - Ok(()) - } -} diff --git a/pageserver/src/layered_repository/metadata.rs 
b/pageserver/src/layered_repository/metadata.rs deleted file mode 100644 index 960a1b7fe3..0000000000 --- a/pageserver/src/layered_repository/metadata.rs +++ /dev/null @@ -1,228 +0,0 @@ -//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`] -//! has a metadata that needs to be stored persistently. -//! -//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of -//! external storage import and export operations. -//! -//! The module contains all structs and related helper methods related to timeline metadata. - -use std::{convert::TryInto, path::PathBuf}; - -use anyhow::ensure; -use zenith_utils::{ - bin_ser::BeSer, - lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, -}; - -use crate::config::PageServerConf; - -// Taken from PG_CONTROL_MAX_SAFE_SIZE -const METADATA_MAX_SAFE_SIZE: usize = 512; -const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::(); -const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE; - -/// The name of the metadata file pageserver creates per timeline. -pub const METADATA_FILE_NAME: &str = "metadata"; - -/// Metadata stored on disk for each timeline -/// -/// The fields correspond to the values we hold in memory, in LayeredTimeline. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct TimelineMetadata { - disk_consistent_lsn: Lsn, - // This is only set if we know it. We track it in memory when the page - // server is running, but we only track the value corresponding to - // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a - // lot. We only store it in the metadata file when we flush *all* the - // in-memory data so that 'last_record_lsn' is the same as - // 'disk_consistent_lsn'. That's OK, because after page server restart, as - // soon as we reprocess at least one record, we will have a valid - // 'prev_record_lsn' value in memory again. This is only really needed when - // doing a clean shutdown, so that there is no more WAL beyond - // 'disk_consistent_lsn' - prev_record_lsn: Option, - ancestor_timeline: Option, - ancestor_lsn: Lsn, - latest_gc_cutoff_lsn: Lsn, - initdb_lsn: Lsn, -} - -/// Points to a place in pageserver's local directory, -/// where certain timeline's metadata file should be located. 
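/// With the usual workdir layout this is expected to resolve to something like
/// `<workdir>/tenants/<tenant_id>/timelines/<timeline_id>/metadata`, though the
/// exact location is whatever `PageServerConf::timeline_path` returns.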
-pub fn metadata_path( - conf: &'static PageServerConf, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> PathBuf { - conf.timeline_path(&timelineid, &tenantid) - .join(METADATA_FILE_NAME) -} - -impl TimelineMetadata { - pub fn new( - disk_consistent_lsn: Lsn, - prev_record_lsn: Option, - ancestor_timeline: Option, - ancestor_lsn: Lsn, - latest_gc_cutoff_lsn: Lsn, - initdb_lsn: Lsn, - ) -> Self { - Self { - disk_consistent_lsn, - prev_record_lsn, - ancestor_timeline, - ancestor_lsn, - latest_gc_cutoff_lsn, - initdb_lsn, - } - } - - pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { - ensure!( - metadata_bytes.len() == METADATA_MAX_SAFE_SIZE, - "metadata bytes size is wrong" - ); - - let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE]; - let calculated_checksum = crc32c::crc32c(data); - - let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] = - metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*checksum_bytes); - ensure!( - calculated_checksum == expected_checksum, - "metadata checksum mismatch" - ); - - let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?); - assert!(data.disk_consistent_lsn.is_aligned()); - - Ok(data) - } - - pub fn to_bytes(&self) -> anyhow::Result> { - let serializeable_metadata = serialize::SeTimelineMetadata::from(self); - let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?; - assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE); - metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8); - - let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]); - metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum)); - Ok(metadata_bytes) - } - - /// [`Lsn`] that corresponds to the corresponding timeline directory - /// contents, stored locally in the pageserver workdir. - pub fn disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn - } - - pub fn prev_record_lsn(&self) -> Option { - self.prev_record_lsn - } - - pub fn ancestor_timeline(&self) -> Option { - self.ancestor_timeline - } - - pub fn ancestor_lsn(&self) -> Lsn { - self.ancestor_lsn - } - - pub fn latest_gc_cutoff_lsn(&self) -> Lsn { - self.latest_gc_cutoff_lsn - } - - pub fn initdb_lsn(&self) -> Lsn { - self.initdb_lsn - } -} - -/// This module is for direct conversion of metadata to bytes and back. -/// For a certain metadata, besides the conversion a few verification steps has to -/// be done, so all serde derives are hidden from the user, to avoid accidental -/// verification-less metadata creation. 
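///
/// The serialized form is always exactly METADATA_MAX_SAFE_SIZE (512) bytes:
///
///   +----------------------------------------+----------------------------+
///   | fields serialized via `BeSer`,         | crc32c of bytes 0..508,    |
///   | zero-padded to 508 bytes               | stored as little-endian u32|
///   +----------------------------------------+----------------------------+
///   0                                      508                          512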
-mod serialize { - use serde::{Deserialize, Serialize}; - use zenith_utils::{lsn::Lsn, zid::ZTimelineId}; - - use super::TimelineMetadata; - - #[derive(Serialize)] - pub(super) struct SeTimelineMetadata<'a> { - disk_consistent_lsn: &'a Lsn, - prev_record_lsn: &'a Option, - ancestor_timeline: &'a Option, - ancestor_lsn: &'a Lsn, - latest_gc_cutoff_lsn: &'a Lsn, - initdb_lsn: &'a Lsn, - } - - impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> { - fn from(other: &'a TimelineMetadata) -> Self { - Self { - disk_consistent_lsn: &other.disk_consistent_lsn, - prev_record_lsn: &other.prev_record_lsn, - ancestor_timeline: &other.ancestor_timeline, - ancestor_lsn: &other.ancestor_lsn, - latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn, - initdb_lsn: &other.initdb_lsn, - } - } - } - - #[derive(Deserialize)] - pub(super) struct DeTimelineMetadata { - disk_consistent_lsn: Lsn, - prev_record_lsn: Option, - ancestor_timeline: Option, - ancestor_lsn: Lsn, - latest_gc_cutoff_lsn: Lsn, - initdb_lsn: Lsn, - } - - impl From for TimelineMetadata { - fn from(other: DeTimelineMetadata) -> Self { - Self { - disk_consistent_lsn: other.disk_consistent_lsn, - prev_record_lsn: other.prev_record_lsn, - ancestor_timeline: other.ancestor_timeline, - ancestor_lsn: other.ancestor_lsn, - latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn, - initdb_lsn: other.initdb_lsn, - } - } - } -} - -#[cfg(test)] -mod tests { - use crate::repository::repo_harness::TIMELINE_ID; - - use super::*; - - #[test] - fn metadata_serializes_correctly() { - let original_metadata = TimelineMetadata { - disk_consistent_lsn: Lsn(0x200), - prev_record_lsn: Some(Lsn(0x100)), - ancestor_timeline: Some(TIMELINE_ID), - ancestor_lsn: Lsn(0), - latest_gc_cutoff_lsn: Lsn(0), - initdb_lsn: Lsn(0), - }; - - let metadata_bytes = original_metadata - .to_bytes() - .expect("Should serialize correct metadata to bytes"); - - let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) - .expect("Should deserialize its own bytes"); - - assert_eq!( - deserialized_metadata, original_metadata, - "Metadata that was serialized to bytes and deserialized back should not change" - ); - } -} diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs deleted file mode 100644 index 8976491fc0..0000000000 --- a/pageserver/src/layered_repository/storage_layer.rs +++ /dev/null @@ -1,187 +0,0 @@ -//! -//! Common traits and structs for layers -//! - -use crate::relish::RelishTag; -use crate::repository::{BlockNumber, ZenithWalRecord}; -use crate::{ZTenantId, ZTimelineId}; -use anyhow::Result; -use bytes::Bytes; -use serde::{Deserialize, Serialize}; -use std::fmt; -use std::path::PathBuf; - -use zenith_utils::lsn::Lsn; - -// Size of one segment in pages (10 MB) -pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; - -/// -/// Each relish stored in the repository is divided into fixed-sized "segments", -/// with 10 MB of key-space, or 1280 8k pages each. -/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] -pub struct SegmentTag { - pub rel: RelishTag, - pub segno: u32, -} - -/// SegmentBlk represents a block number within a segment, or the size of segment. -/// -/// This is separate from BlockNumber, which is used for block number within the -/// whole relish. Since this is just a type alias, the compiler will let you mix -/// them freely, but we use the type alias as documentation to make it clear -/// which one we're dealing with. 
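/// For example, with RELISH_SEG_SIZE = 1280 (10 MB of 8 KB pages), relish block
/// 3000 lives in segment 2, at SegmentBlk 440 within that segment.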
-/// -/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally -/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes -/// operations more verbose). -pub type SegmentBlk = u32; - -impl fmt::Display for SegmentTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}.{}", self.rel, self.segno) - } -} - -impl SegmentTag { - /// Given a relish and block number, calculate the corresponding segment and - /// block number within the segment. - pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) { - ( - SegmentTag { - rel, - segno: blknum / RELISH_SEG_SIZE, - }, - blknum % RELISH_SEG_SIZE, - ) - } -} - -/// -/// Represents a version of a page at a specific LSN. The LSN is the key of the -/// entry in the 'page_versions' hash, it is not duplicated here. -/// -/// A page version can be stored as a full page image, or as WAL record that needs -/// to be applied over the previous page version to reconstruct this version. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum PageVersion { - Page(Bytes), - Wal(ZenithWalRecord), -} - -/// -/// Struct used to communicate across calls to 'get_page_reconstruct_data'. -/// -/// Before first call to get_page_reconstruct_data, you can fill in 'page_img' -/// if you have an older cached version of the page available. That can save -/// work in 'get_page_reconstruct_data', as it can stop searching for page -/// versions when all the WAL records going back to the cached image have been -/// collected. -/// -/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an -/// image of the page, or the oldest WAL record in 'records' is a will_init-type -/// record that initializes the page without requiring a previous image. -/// -/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have -/// been collected, but there are more records outside the current layer. Pass -/// the same PageReconstructData struct in the next 'get_page_reconstruct_data' -/// call, to collect more records. -/// -pub struct PageReconstructData { - pub records: Vec<(Lsn, ZenithWalRecord)>, - pub page_img: Option<(Lsn, Bytes)>, -} - -/// Return value from Layer::get_page_reconstruct_data -pub enum PageReconstructResult { - /// Got all the data needed to reconstruct the requested page - Complete, - /// This layer didn't contain all the required data, the caller should look up - /// the predecessor layer at the returned LSN and collect more data from there. - Continue(Lsn), - /// This layer didn't contain data needed to reconstruct the page version at - /// the returned LSN. This is usually considered an error, but might be OK - /// in some circumstances. - Missing(Lsn), -} - -/// -/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs. -/// There are two kinds of layers, in-memory and on-disk layers. In-memory -/// layers are used to ingest incoming WAL, and provide fast access -/// to the recent page versions. On-disk layers are stored as files on disk, and -/// are immutable. This trait presents the common functionality of -/// in-memory and on-disk layers. 
-/// -pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> ZTenantId; - - /// Identify the timeline this relish belongs to - fn get_timeline_id(&self) -> ZTimelineId; - - /// Identify the relish segment - fn get_seg_tag(&self) -> SegmentTag; - - /// Inclusive start bound of the LSN range that this layer holds - fn get_start_lsn(&self) -> Lsn; - - /// Exclusive end bound of the LSN range that this layer holds. - /// - /// - For an open in-memory layer, this is MAX_LSN. - /// - For a frozen in-memory layer or a delta layer, this is a valid end bound. - /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 - fn get_end_lsn(&self) -> Lsn; - - /// Is the segment represented by this layer dropped by PostgreSQL? - fn is_dropped(&self) -> bool; - - /// Filename used to store this layer on disk. (Even in-memory layers - /// implement this, to print a handy unique identifier for the layer for - /// log messages, even though they're never not on disk.) - fn filename(&self) -> PathBuf; - - /// - /// Return data needed to reconstruct given page at LSN. - /// - /// It is up to the caller to collect more data from previous layer and - /// perform WAL redo, if necessary. - /// - /// See PageReconstructResult for possible return values. The collected data - /// is appended to reconstruct_data; the caller should pass an empty struct - /// on first call, or a struct with a cached older image of the page if one - /// is available. If this returns PageReconstructResult::Continue, look up - /// the predecessor layer and call again with the same 'reconstruct_data' to - /// collect more data. - fn get_page_reconstruct_data( - &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result; - - /// Return size of the segment at given LSN. (Only for blocky relations.) - fn get_seg_size(&self, lsn: Lsn) -> Result; - - /// Does the segment exist at given LSN? Or was it dropped before it. - fn get_seg_exists(&self, lsn: Lsn) -> Result; - - /// Does this layer only contain some data for the segment (incremental), - /// or does it contain a version of every page? This is important to know - /// for garbage collecting old layers: an incremental layer depends on - /// the previous non-incremental layer. - fn is_incremental(&self) -> bool; - - /// Returns true for layers that are represented in memory. - fn is_in_memory(&self) -> bool; - - /// Release memory used by this layer. There is no corresponding 'load' - /// function, that's done implicitly when you call one of the get-functions. - fn unload(&self) -> Result<()>; - - /// Permanently remove this layer from disk. 
- fn delete(&self) -> Result<()>; - - /// Dump summary of the contents of the layer to stdout - fn dump(&self) -> Result<()>; -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3a68f56187..b62143f650 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,45 +1,156 @@ pub mod basebackup; -pub mod branches; pub mod config; pub mod http; pub mod import_datadir; -pub mod layered_repository; +pub mod keyspace; +pub mod metrics; pub mod page_cache; pub mod page_service; -pub mod relish; -pub mod remote_storage; +pub mod pgdatadir_mapping; +pub mod profiling; pub mod repository; +pub mod storage_sync; +pub mod task_mgr; +pub mod tenant; +pub mod tenant_config; pub mod tenant_mgr; -pub mod tenant_threads; -pub mod thread_mgr; +pub mod tenant_tasks; +pub mod trace; pub mod virtual_file; pub mod walingest; pub mod walreceiver; pub mod walrecord; pub mod walredo; -use lazy_static::lazy_static; -use zenith_metrics::{register_int_gauge_vec, IntGaugeVec}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use std::collections::HashMap; -lazy_static! { - static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( - "pageserver_live_connections_count", - "Number of live network connections", - &["pageserver_connection_kind"] - ) - .expect("failed to define a metric"); -} +use tracing::info; +use utils::id::{TenantId, TimelineId}; -pub const LOG_FILE_NAME: &str = "pageserver.log"; +use crate::task_mgr::TaskKind; + +/// Current storage format version +/// +/// This is embedded in the header of all the layer files. +/// If you make any backwards-incompatible changes to the storage +/// format, bump this! +/// Note that TimelineMetadata uses its own version number to track +/// backwards-compatible changes to the metadata format. +pub const STORAGE_FORMAT_VERSION: u16 = 3; + +pub const DEFAULT_PG_VERSION: u32 = 14; + +// Magic constants used to identify different kinds of files +pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; +pub const DELTA_FILE_MAGIC: u16 = 0x5A61; + +static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { - // Flush in-memory data that is older than this - Distance(u64), // Flush all in-memory data Flush, // Flush all in-memory data and reconstruct all page images Forced, } + +pub async fn shutdown_pageserver(exit_code: i32) { + // Shut down the libpq endpoint task. This prevents new connections from + // being accepted. + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; + + // Shut down any page service tasks. + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; + + // Shut down all the tenants. This flushes everything to disk and kills + // the checkpoint and GC tasks. + tenant_mgr::shutdown_all_tenants().await; + + // Stop syncing with remote storage. + // + // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? + // Should it? + task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await; + + // Shut down the HTTP endpoint last, so that you can still check the server's + // status while it's shutting down. + // FIXME: We should probably stop accepting commands like attach/detach earlier. 
+ task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; + + // There should be nothing left, but let's be sure + task_mgr::shutdown_tasks(None, None, None).await; + info!("Shut down successfully completed"); + std::process::exit(exit_code); +} + +const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + info!( + "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +/// A newtype to store arbitrary data grouped by tenant and timeline ids. +/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would +/// not include the cases where a certain tenant has zero timelines. +/// This is sometimes important: a tenant could be registered during initial load from FS, +/// even if he has no timelines on disk. +#[derive(Debug)] +pub struct TenantTimelineValues(HashMap>); + +impl TenantTimelineValues { + fn new() -> Self { + Self(HashMap::new()) + } +} + +/// A suffix to be used during file sync from the remote storage, +/// to ensure that we do not leave corrupted files that pretend to be layers. +const TEMP_FILE_SUFFIX: &str = "___temp"; + +#[cfg(test)] +mod backoff_defaults_tests { + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs new file mode 100644 index 0000000000..43586b926d --- /dev/null +++ b/pageserver/src/metrics.rs @@ -0,0 +1,488 @@ +use metrics::core::{AtomicU64, GenericCounter}; +use metrics::{ + register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, + UIntGaugeVec, +}; +use once_cell::sync::Lazy; +use utils::id::{TenantId, TimelineId}; + +/// Prometheus histogram buckets (in seconds) that capture the majority of +/// latencies in the microsecond range but also extend far enough up to distinguish +/// "bad" from "really bad". 
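///
/// With 5 buckets per decade between 10^-6 s and 10^2 s this produces 41 buckets:
/// 1.0 us, ~1.6 us, ~2.5 us, ~4.0 us, ~6.3 us, 10 us, and so on up to 100 s.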
+fn get_buckets_for_critical_operations() -> Vec { + let buckets_per_digit = 5; + let min_exponent = -6; + let max_exponent = 2; + + let mut buckets = vec![]; + // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp + // because it's more numerically stable and doesn't result in numbers like 9.999999 + for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) { + buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64)) + } + buckets +} + +// Metrics collected on operations on the storage repository. +const STORAGE_TIME_OPERATIONS: &[&str] = &[ + "layer flush", + "compact", + "create images", + "init logical size", + "logical size", + "load layer map", + "gc", +]; + +pub static STORAGE_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds", + "Time spent on storage operations", + &["operation", "tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +// Metrics collected on operations on the storage repository. +static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_total", + "Number of cache hits from materialized page cache", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_wait_lsn_seconds", + "Time spent waiting for WAL to arrive", + &["tenant_id", "timeline_id"], + get_buckets_for_critical_operations(), + ) + .expect("failed to define a metric") +}); + +static LAST_RECORD_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_last_record_lsn", + "Last record LSN grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +// Metrics for determining timeline's physical size. +// A layered timeline's physical is defined as the total size of +// (delta/image) layer files on disk. +static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_physical_size", + "Current physical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_current_logical_size", + "Current logical size grouped by timeline", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define current logical size metric") +}); + +// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, +// or in testing they estimate how much we would upload if we did. 
+static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_created_persistent_files_total", + "Number of files created that are meant to be uploaded to cloud storage", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_written_persistent_bytes_total", + "Total bytes written that are meant to be uploaded to cloud storage", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +// Metrics collected on disk IO operations +const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ + 0.000001, // 1 usec + 0.00001, // 10 usec + 0.0001, // 100 usec + 0.001, // 1 msec + 0.01, // 10 msec + 0.1, // 100 msec + 1.0, // 1 sec +]; + +const STORAGE_IO_TIME_OPERATIONS: &[&str] = + &["open", "close", "read", "write", "seek", "fsync", "gc"]; + +const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; + +pub static STORAGE_IO_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_io_operations_seconds", + "Time spent in IO operations", + &["operation", "tenant_id", "timeline_id"], + STORAGE_IO_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", + &["operation", "tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[ + "get_rel_exists", + "get_rel_size", + "get_page_at_lsn", + "get_db_size", +]; + +const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[ + 0.00001, // 1/100000 s + 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s + 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s + 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s + 0.1, // 1/10 s +]; + +pub static SMGR_QUERY_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_smgr_query_seconds", + "Time spent on smgr query handling", + &["smgr_query_type", "tenant_id", "timeline_id"], + SMGR_QUERY_TIME_BUCKETS.into() + ) + .expect("failed to define a metric") +}); + +pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_live_connections", + "Number of live network connections", + &["pageserver_connection_kind"] + ) + .expect("failed to define a metric") +}); + +pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| { + register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") + .expect("failed to define a metric") +}); + +pub static REMAINING_SYNC_ITEMS: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_remote_storage_remaining_sync_items", + "Number of storage sync items left in the queue" + ) + .expect("failed to register pageserver remote storage remaining sync items int gauge") +}); + +pub static IMAGE_SYNC_TIME: Lazy = Lazy::new(|| { + register_gauge_vec!( + "pageserver_remote_storage_image_sync_duration", + "Time spent to synchronize (up/download) a whole pageserver image", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register per-timeline pageserver image sync time vec") +}); + +pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"]; +pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"]; + +pub static IMAGE_SYNC_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_image_sync_count", + "Number of 
synchronization operations executed for pageserver images. \ + Grouped by tenant, timeline, operation_kind and status", + &["tenant_id", "timeline_id", "operation_kind", "status"] + ) + .expect("failed to register pageserver image sync count vec") +}); + +pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_remote_storage_image_sync_seconds", + "Time took to synchronize (download or upload) a whole pageserver image. \ + Grouped by operation_kind and status", + &["operation_kind", "status"], + vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] + ) + .expect("failed to register pageserver image sync time histogram vec") +}); + +pub static REMOTE_INDEX_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_remote_index_uploads_total", + "Number of remote index uploads", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver remote index upload vec") +}); + +pub static NO_LAYERS_UPLOAD: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_remote_storage_no_layers_uploads_total", + "Number of skipped uploads due to no layers", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver no layers upload vec") +}); + +pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_task_events", + "Number of task start/stop/fail events.", + &["event"], + ) + .expect("Failed to register tenant_task_events metric") +}); + +// Metrics collected on WAL redo operations +// +// We collect the time spent in actual WAL redo ('redo'), and time waiting +// for access to the postgres process ('wait') since there is only one for +// each tenant. + +/// Time buckets are small because we want to be able to measure the +/// smallest redo processing times. These buckets allow us to measure down +/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec. +/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec. +/// +/// Values up to 1s are recorded because metrics show that we have redo +/// durations and lock times larger than 0.250s. +macro_rules! redo_histogram_time_buckets { + () => { + vec![ + 0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000, + 0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000, + 1.000_000, + ] + }; +} + +/// While we're at it, also measure the amount of records replayed in each +/// operation. We have a global 'total replayed' counter, but that's not +/// as useful as 'what is the skew for how many records we replay in one +/// operation'. +macro_rules! redo_histogram_count_buckets { + () => { + vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0] + }; +} + +macro_rules! redo_bytes_histogram_count_buckets { + () => { + // powers of (2^.5), from 2^4.5 to 2^15 (22 buckets) + // rounded up to the next multiple of 8 to capture any MAXALIGNed record of that size, too. 
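+        // For example: 2^4.5 ≈ 22.6 rounds up to 24, 2^7.5 ≈ 181.0 to 184, and
+        // 2^14.5 ≈ 23170.5 to 23176, each rounded up to the next multiple of 8,
+        // matching the values below.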
+ vec![ + 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 184.0, 256.0, 368.0, 512.0, 728.0, 1024.0, 1456.0, + 2048.0, 2904.0, 4096.0, 5800.0, 8192.0, 11592.0, 16384.0, 23176.0, 32768.0, + ] + }; +} + +pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_seconds", + "Time spent on WAL redo", + redo_histogram_time_buckets!() + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_wait_seconds", + "Time spent waiting for access to the WAL redo process", + redo_histogram_time_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_records_histogram", + "Histogram of number of records replayed per redo", + redo_histogram_count_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_wal_redo_bytes_histogram", + "Histogram of number of records replayed per redo", + redo_bytes_histogram_count_buckets!(), + ) + .expect("failed to define a metric") +}); + +pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" + ) + .unwrap() +}); + +#[derive(Debug)] +pub struct TimelineMetrics { + tenant_id: String, + timeline_id: String, + pub reconstruct_time_histo: Histogram, + pub materialized_page_cache_hit_counter: GenericCounter, + pub flush_time_histo: Histogram, + pub compact_time_histo: Histogram, + pub create_images_time_histo: Histogram, + pub init_logical_size_histo: Histogram, + pub logical_size_histo: Histogram, + pub load_layer_map_histo: Histogram, + pub last_record_gauge: IntGauge, + pub wait_lsn_time_histo: Histogram, + pub current_physical_size_gauge: UIntGauge, + /// copy of LayeredTimeline.current_logical_size + pub current_logical_size_gauge: UIntGauge, + pub num_persistent_files_created: IntCounter, + pub persistent_bytes_written: IntCounter, +} + +impl TimelineMetrics { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let flush_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) + .unwrap(); + let compact_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) + .unwrap(); + let create_images_time_histo = STORAGE_TIME + .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) + .unwrap(); + let init_logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) + .unwrap(); + let logical_size_histo = STORAGE_TIME + .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) + .unwrap(); + let load_layer_map_histo = STORAGE_TIME + .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) + .unwrap(); + let last_record_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let wait_lsn_time_histo = WAIT_LSN_TIME + 
.get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let current_logical_size_gauge = CURRENT_LOGICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + + TimelineMetrics { + tenant_id, + timeline_id, + reconstruct_time_histo, + materialized_page_cache_hit_counter, + flush_time_histo, + compact_time_histo, + create_images_time_histo, + init_logical_size_histo, + logical_size_histo, + load_layer_map_histo, + last_record_gauge, + wait_lsn_time_histo, + current_physical_size_gauge, + current_logical_size_gauge, + num_persistent_files_created, + persistent_bytes_written, + } + } +} + +impl Drop for TimelineMetrics { + fn drop(&mut self) { + let tenant_id = &self.tenant_id; + let timeline_id = &self.timeline_id; + let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); + let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); + let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); + + for op in STORAGE_TIME_OPERATIONS { + let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + for op in STORAGE_IO_TIME_OPERATIONS { + let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in STORAGE_IO_SIZE_OPERATIONS { + let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in SMGR_QUERY_TIME_OPERATIONS { + let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + } + + for op in IMAGE_SYNC_OPERATION_KINDS { + for status in IMAGE_SYNC_STATUS { + let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]); + } + } + + let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]); + } +} + +pub fn remove_tenant_metrics(tenant_id: &TenantId) { + let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); +} diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index b0c8d3a5d7..d2fe06697e 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -20,7 +20,7 @@ //! assign a buffer for a page, you must hold the mapping lock and the lock on //! the slot at the same time. //! -//! Whenever you need to hold both locks simultenously, the slot lock must be +//! Whenever you need to hold both locks simultaneously, the slot lock must be //! acquired first. This consistent ordering avoids deadlocks. To look up a page //! in the cache, you would first look up the mapping, while holding the mapping //! lock, and then lock the slot. 
You must release the mapping lock in between, @@ -41,31 +41,29 @@ use std::{ convert::TryInto, sync::{ atomic::{AtomicU8, AtomicUsize, Ordering}, - RwLock, RwLockReadGuard, RwLockWriteGuard, + RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError, }, }; +use anyhow::Context; use once_cell::sync::OnceCell; use tracing::error; -use zenith_utils::{ +use utils::{ + id::{TenantId, TimelineId}, lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, }; -use crate::layered_repository::writeback_ephemeral_file; -use crate::{config::PageServerConf, relish::RelTag}; +use crate::repository::Key; +use crate::tenant::writeback_ephemeral_file; static PAGE_CACHE: OnceCell = OnceCell::new(); -const TEST_PAGE_CACHE_SIZE: usize = 10; +const TEST_PAGE_CACHE_SIZE: usize = 50; /// /// Initialize the page cache. This must be called once at page server startup. /// -pub fn init(conf: &'static PageServerConf) { - if PAGE_CACHE - .set(PageCache::new(conf.page_cache_size)) - .is_err() - { +pub fn init(size: usize) { + if PAGE_CACHE.set(PageCache::new(size)).is_err() { panic!("page cache already initialized"); } } @@ -86,13 +84,14 @@ pub fn get() -> &'static PageCache { } } -pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize; +pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize; const MAX_USAGE_COUNT: u8 = 5; /// /// CacheKey uniquely identifies a "thing" to cache in the page cache. /// #[derive(Debug, PartialEq, Eq, Clone)] +#[allow(clippy::enum_variant_names)] enum CacheKey { MaterializedPage { hash_key: MaterializedPageHashKey, @@ -102,14 +101,17 @@ enum CacheKey { file_id: u64, blkno: u32, }, + ImmutableFilePage { + file_id: u64, + blkno: u32, + }, } #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + tenant_id: TenantId, + timeline_id: TimelineId, + key: Key, } #[derive(Clone)] @@ -177,6 +179,8 @@ pub struct PageCache { ephemeral_page_map: RwLock>, + immutable_page_map: RwLock>, + /// The actual buffers with their metadata. slots: Box<[Slot]>, @@ -199,6 +203,12 @@ impl std::ops::Deref for PageReadGuard<'_> { } } +impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { + fn as_ref(&self) -> &[u8; PAGE_SZ] { + self.0.buf + } +} + /// /// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked /// until the guard is dropped. @@ -230,6 +240,12 @@ impl std::ops::Deref for PageWriteGuard<'_> { } } +impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> { + fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] { + self.inner.buf + } +} + impl PageWriteGuard<'_> { /// Mark that the buffer contents are now valid. pub fn mark_valid(&mut self) { @@ -292,18 +308,16 @@ impl PageCache { /// returned page. 
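+    /// Note: the `Lsn` in the returned tuple is the LSN of the cached page
+    /// image that was found, which may be older than the `lsn` requested here.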
pub fn lookup_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + tenant_id: TenantId, + timeline_id: TimelineId, + key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key: *key, }, lsn, }; @@ -324,24 +338,22 @@ impl PageCache { /// pub fn memorize_materialized_page( &self, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + tenant_id: TenantId, + timeline_id: TimelineId, + key: Key, lsn: Lsn, img: &[u8], - ) { + ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key, }, lsn, }; - match self.lock_for_write(&cache_key) { + match self.lock_for_write(&cache_key)? { WriteBufResult::Found(write_guard) => { // We already had it in cache. Another thread must've put it there // concurrently. Check that it had the same contents that we @@ -353,17 +365,19 @@ impl PageCache { write_guard.mark_valid(); } } + + Ok(()) } // Section 1.2: Public interface functions for working with Ephemeral pages. - pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult { + pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let mut cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_read(&mut cache_key) } - pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult { + pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { let cache_key = CacheKey::EphemeralPage { file_id, blkno }; self.lock_for_write(&cache_key) @@ -389,6 +403,36 @@ impl PageCache { } } + // Section 1.3: Public interface functions for working with immutable file pages. + + pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result { + let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno }; + + self.lock_for_read(&mut cache_key) + } + + /// Immediately drop all buffers belonging to given file, without writeback + pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) { + for slot_idx in 0..self.slots.len() { + let slot = &self.slots[slot_idx]; + + let mut inner = slot.inner.write().unwrap(); + if let Some(key) = &inner.key { + match key { + CacheKey::ImmutableFilePage { file_id, blkno: _ } + if *file_id == drop_file_id => + { + // remove mapping for old buffer + self.remove_mapping(key); + inner.key = None; + inner.dirty = false; + } + _ => {} + } + } + } + } + // // Section 2: Internal interface functions for lookup/update. // @@ -454,15 +498,16 @@ impl PageCache { /// } /// ``` /// - fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult { + fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(read_guard) = self.try_lock_for_read(cache_key) { - return ReadBufResult::Found(read_guard); + return Ok(ReadBufResult::Found(read_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. 
In that case, we evicted @@ -485,10 +530,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return ReadBufResult::NotFound(PageWriteGuard { + return Ok(ReadBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -515,15 +560,16 @@ impl PageCache { /// /// Similar to lock_for_read(), but the returned buffer is write-locked and /// may be modified by the caller even if it's already found in the cache. - fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult { + fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result { loop { // First check if the key already exists in the cache. if let Some(write_guard) = self.try_lock_for_write(cache_key) { - return WriteBufResult::Found(write_guard); + return Ok(WriteBufResult::Found(write_guard)); } // Not found. Find a victim buffer - let (slot_idx, mut inner) = self.find_victim(); + let (slot_idx, mut inner) = + self.find_victim().context("Failed to find evict victim")?; // Insert mapping for this. At this point, we may find that another // thread did the same thing concurrently. In that case, we evicted @@ -546,10 +592,10 @@ impl PageCache { inner.dirty = false; slot.usage_count.store(1, Ordering::Relaxed); - return WriteBufResult::NotFound(PageWriteGuard { + return Ok(WriteBufResult::NotFound(PageWriteGuard { inner, valid: false, - }); + })); } } @@ -586,6 +632,10 @@ impl PageCache { let map = self.ephemeral_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let map = self.immutable_page_map.read().unwrap(); + Some(*map.get(&(*file_id, *blkno))?) + } } } @@ -609,6 +659,10 @@ impl PageCache { let map = self.ephemeral_page_map.read().unwrap(); Some(*map.get(&(*file_id, *blkno))?) } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let map = self.immutable_page_map.read().unwrap(); + Some(*map.get(&(*file_id, *blkno))?) + } } } @@ -640,6 +694,11 @@ impl PageCache { map.remove(&(*file_id, *blkno)) .expect("could not find old key in mapping"); } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let mut map = self.immutable_page_map.write().unwrap(); + map.remove(&(*file_id, *blkno)) + .expect("could not find old key in mapping"); + } } } @@ -680,6 +739,16 @@ impl PageCache { } } } + CacheKey::ImmutableFilePage { file_id, blkno } => { + let mut map = self.immutable_page_map.write().unwrap(); + match map.entry((*file_id, *blkno)) { + Entry::Occupied(entry) => Some(*entry.get()), + Entry::Vacant(entry) => { + entry.insert(slot_idx); + None + } + } + } } } @@ -690,17 +759,34 @@ impl PageCache { /// Find a slot to evict. /// /// On return, the slot is empty and write-locked. 
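+    ///
+    /// The search is a simple clock sweep: slots are visited in order, each
+    /// visit decrements the slot's usage count, and the first slot whose count
+    /// reaches zero and whose lock can be taken without blocking is evicted.
+    /// If candidates keep being skipped for roughly ten passes over the pool,
+    /// the search bails out with an error instead of spinning forever.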
- fn find_victim(&self) -> (usize, RwLockWriteGuard) { - let iter_limit = self.slots.len() * 2; + fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard)> { + let iter_limit = self.slots.len() * 10; let mut iters = 0; loop { + iters += 1; let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len(); let slot = &self.slots[slot_idx]; - if slot.dec_usage_count() == 0 || iters >= iter_limit { - let mut inner = slot.inner.write().unwrap(); - + if slot.dec_usage_count() == 0 { + let mut inner = match slot.inner.try_write() { + Ok(inner) => inner, + Err(TryLockError::Poisoned(err)) => { + anyhow::bail!("buffer lock was poisoned: {err:?}") + } + Err(TryLockError::WouldBlock) => { + // If we have looped through the whole buffer pool 10 times + // and still haven't found a victim buffer, something's wrong. + // Maybe all the buffers were in locked. That could happen in + // theory, if you have more threads holding buffers locked than + // there are buffers in the pool. In practice, with a reasonably + // large buffer pool it really shouldn't happen. + if iters > iter_limit { + anyhow::bail!("exceeded evict iter limit"); + } + continue; + } + }; if let Some(old_key) = &inner.key { if inner.dirty { if let Err(err) = Self::writeback(old_key, inner.buf) { @@ -723,10 +809,8 @@ impl PageCache { inner.dirty = false; inner.key = None; } - return (slot_idx, inner); + return Ok((slot_idx, inner)); } - - iters += 1; } } @@ -735,12 +819,20 @@ impl PageCache { CacheKey::MaterializedPage { hash_key: _, lsn: _, - } => { - panic!("unexpected dirty materialized page"); - } + } => Err(std::io::Error::new( + std::io::ErrorKind::Other, + "unexpected dirty materialized page", + )), CacheKey::EphemeralPage { file_id, blkno } => { writeback_ephemeral_file(*file_id, *blkno, buf) } + CacheKey::ImmutableFilePage { + file_id: _, + blkno: _, + } => Err(std::io::Error::new( + std::io::ErrorKind::Other, + "unexpected dirty immutable page", + )), } } @@ -771,6 +863,7 @@ impl PageCache { Self { materialized_page_map: Default::default(), ephemeral_page_map: Default::default(), + immutable_page_map: Default::default(), slots, next_evict_slot: AtomicUsize::new(0), } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7dc3c8c752..0919c5191a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,232 +7,147 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. 
-// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use anyhow::{bail, ensure, Context, Result}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; -use regex::Regex; +use bytes::Buf; +use bytes::Bytes; +use futures::{Stream, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, + PagestreamNblocksRequest, PagestreamNblocksResponse, +}; +use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; use std::str; use std::str::FromStr; -use std::sync::{Arc, RwLockReadGuard}; +use std::sync::Arc; +use tokio::pin; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; use tracing::*; -use zenith_metrics::{register_histogram_vec, HistogramVec}; -use zenith_utils::auth::{self, JwtAuth}; -use zenith_utils::auth::{Claims, Scope}; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::is_socket_read_timed_out; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::postgres_backend::{self, AuthType}; -use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use utils::id::ConnectionId; +use utils::{ + auth::{self, Claims, JwtAuth, Scope}, + id::{TenantId, TimelineId}, + lsn::Lsn, + postgres_backend::AuthType, + postgres_backend_async::{self, PostgresBackend}, + simple_rcu::RcuReadGuard, +}; use crate::basebackup; -use crate::config::PageServerConf; -use crate::relish::*; -use crate::repository::Timeline; +use crate::config::{PageServerConf, ProfilingConfig}; +use crate::import_datadir::import_wal_from_tar; +use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; +use crate::profiling::profpoint_start; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::tenant::Timeline; use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; -use crate::walreceiver; +use crate::trace::Tracer; use crate::CheckpointConfig; -// Wrapped in libpq CopyData -enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), -} +use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; +use postgres_ffi::BLCKSZ; -// Wrapped in libpq CopyData -enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), -} +fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { + async_stream::try_stream! { + loop { + let msg = tokio::select! { + biased; -#[derive(Debug)] -struct PagestreamExistsRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. 
+ let msg = format!("pageserver is shutting down"); + let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg)); + Err(anyhow::anyhow!(msg)) + } -#[derive(Debug)] -struct PagestreamNblocksRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} + msg = pgb.read_message() => { msg } + }; -#[derive(Debug)] -struct PagestreamGetPageRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, - blkno: u32, -} + match msg { + Ok(Some(message)) => { + let copy_data_bytes = match message { + FeMessage::CopyData(bytes) => bytes, + FeMessage::CopyDone => { break }, + FeMessage::Sync => continue, + m => { + let msg = format!("unexpected message {:?}", m); + pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + break; + } + }; -#[derive(Debug)] -struct PagestreamExistsResponse { - exists: bool, -} - -#[derive(Debug)] -struct PagestreamNblocksResponse { - n_blocks: u32, -} - -#[derive(Debug)] -struct PagestreamGetPageResponse { - page: Bytes, -} - -#[derive(Debug)] -struct PagestreamErrorResponse { - message: String, -} - -impl PagestreamFeMessage { - fn parse(mut body: Bytes) -> anyhow::Result { - // TODO these gets can fail - - // these correspond to the ZenithMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. - let msg_tag = body.get_u8(); - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - blkno: body.get_u32(), - })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + yield copy_data_bytes; + } + Ok(None) => { + let msg = "client closed connection"; + pgb.write_message(&BeMessage::ErrorResponse(msg))?; + pgb.flush().await?; + Err(io::Error::new(io::ErrorKind::Other, msg))?; + } + Err(e) => { + Err(io::Error::new(io::ErrorKind::Other, e))?; + } + }; } } } -impl PagestreamBeMessage { - fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ - bytes.put(&resp.page[..]); - } - - Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - } - - bytes.into() - } -} - /////////////////////////////////////////////////////////////////////////////// /// /// Main loop of the page service. /// -/// Listens for connections, and launches a new handler thread for each. +/// Listens for connections, and launches a new handler task for each. 
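+/// Each accepted connection is handed off to `page_service_conn_main`, spawned
+/// as a `PageRequestHandler` task through `task_mgr::spawn`.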
/// -pub fn thread_main( +pub async fn libpq_listener_main( conf: &'static PageServerConf, auth: Option>, listener: TcpListener, auth_type: AuthType, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; - let basic_rt = tokio::runtime::Builder::new_current_thread() - .enable_io() - .build()?; - - let tokio_listener = { - let _guard = basic_rt.enter(); - tokio::net::TcpListener::from_std(listener) - }?; + let tokio_listener = tokio::net::TcpListener::from_std(listener)?; // Wait for a new connection to arrive, or for server shutdown. - while let Some(res) = basic_rt.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - biased; + while let Some(res) = tokio::select! { + biased; - _ = shutdown_watcher => { - // We were requested to shut down. - None - } - - res = tokio_listener.accept() => { - Some(res) - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + None } - }) { + + res = tokio_listener.accept() => { + Some(res) + } + } { match res { Ok((socket, peer_addr)) => { - // Connection established. Spawn a new thread to handle it. + // Connection established. Spawn a new task to handle it. debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); - // PageRequestHandler threads are not associated with any particular - // timeline in the thread manager. In practice most connections will + // PageRequestHandler tasks are not associated with any particular + // timeline in the task manager. In practice most connections will // only deal with a particular timeline, but we don't know which one // yet. - if let Err(err) = thread_mgr::spawn( - ThreadKind::PageRequestHandler, + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::PageRequestHandler, None, None, - "serving Page Service thread", - move || page_service_conn_main(conf, local_auth, socket, auth_type), - ) { - // Thread creation failed. Log the error and continue. - error!("could not spawn page service thread: {:?}", err); - } + "serving compute connection task", + false, + page_service_conn_main(conf, local_auth, socket, auth_type), + ); } Err(err) => { // accept() failed. Log the error, and loop back to retry on next connection. @@ -246,37 +161,87 @@ pub fn thread_main( Ok(()) } -fn page_service_conn_main( +async fn page_service_conn_main( conf: &'static PageServerConf, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on thread exit. + // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); gauge.inc(); scopeguard::defer! { gauge.dec(); } - // We use Tokio to accept the connection, but the rest of the code works with a - // regular socket. Convert. 
- let socket = socket - .into_std() - .context("could not convert tokio::net:TcpStream to std::net::TcpStream")?; - socket - .set_nonblocking(false) - .context("could not put socket to blocking mode")?; - socket .set_nodelay(true) .context("could not set TCP_NODELAY")?; let mut conn_handler = PageServerHandler::new(conf, auth); - let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - pgbackend.run(&mut conn_handler) + let pgbackend = PostgresBackend::new(socket, auth_type, None)?; + + let result = pgbackend + .run(&mut conn_handler, task_mgr::shutdown_watcher) + .await; + match result { + Ok(()) => { + // we've been requested to shut down + Ok(()) + } + Err(err) => { + let root_cause_io_err_kind = err + .root_cause() + .downcast_ref::() + .map(|e| e.kind()); + + // `ConnectionReset` error happens when the Postgres client closes the connection. + // As this disconnection happens quite often and is expected, + // we decided to downgrade the logging level to `INFO`. + // See: https://github.com/neondatabase/neon/issues/1683. + if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + info!("Postgres client disconnected"); + Ok(()) + } else { + Err(err) + } + } + } +} + +struct PageRequestMetrics { + get_rel_exists: metrics::Histogram, + get_rel_size: metrics::Histogram, + get_page_at_lsn: metrics::Histogram, + get_db_size: metrics::Histogram, +} + +impl PageRequestMetrics { + fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + let tenant_id = tenant_id.to_string(); + let timeline_id = timeline_id.to_string(); + + let get_rel_exists = + SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]); + + let get_rel_size = + SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]); + + let get_page_at_lsn = + SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]); + + let get_db_size = + SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]); + + Self { + get_rel_exists, + get_rel_size, + get_page_at_lsn, + get_db_size, + } + } } #[derive(Debug)] @@ -286,24 +251,6 @@ struct PageServerHandler { claims: Option, } -const TIME_BUCKETS: &[f64] = &[ - 0.00001, // 1/100000 s - 0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s - 0.001, 0.0025, 0.005, 0.0075, // 1/1000 s - 0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s - 0.1, // 1/10 s -]; - -lazy_static! 
{ - static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( - "pageserver_smgr_query_time", - "Time spent on smgr query handling", - &["smgr_query_type"], - TIME_BUCKETS.into() - ) - .expect("failed to define a metric"); -} - impl PageServerHandler { pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { PageServerHandler { @@ -313,76 +260,209 @@ impl PageServerHandler { } } - fn handle_pagerequests( + #[instrument(skip(self, pgb))] + async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, - tenantid: ZTenantId, + tenant_id: TenantId, + timeline_id: TimelineId, ) -> anyhow::Result<()> { - let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + // NOTE: pagerequests handler exits when connection is closed, + // so there is no need to reset the association + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + + // Make request tracer if needed + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let mut tracer = if tenant.get_trace_read_requests() { + let connection_id = ConnectionId::generate(); + let path = tenant + .conf + .trace_path(&tenant_id, &timeline_id, &connection_id); + Some(Tracer::new(path)) + } else { + None + }; // Check that the timeline exists - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Cannot handle pagerequests for a remote timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; - /* switch client to COPYBOTH */ + // switch client to COPYBOTH pgb.write_message(&BeMessage::CopyBothResponse)?; + pgb.flush().await?; - while !thread_mgr::is_shutdown_requested() { - match pgb.read_message() { - Ok(message) => { - if let Some(message) = message { - trace!("query: {:?}", message); + let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id); - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - _ => continue, - }; + loop { + let msg = tokio::select! { + biased; - let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; - - let response = match zenith_fe_msg { - PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists"]) - .observe_closure_duration(|| { - self.handle_get_rel_exists_request(timeline.as_ref(), &req) - }), - PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size"]) - .observe_closure_duration(|| { - self.handle_get_nblocks_request(timeline.as_ref(), &req) - }), - PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn"]) - .observe_closure_duration(|| { - self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) - }), - }; - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough - error!("error reading relation or page version: {:?}", e); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; - } else { - break; - } + _ = task_mgr::shutdown_watcher() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + break; } - Err(e) => { - if !is_socket_read_timed_out(&e) { - return Err(e); - } + + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? 
{ + Some(FeMessage::CopyData(bytes)) => bytes, + Some(m) => { + bail!("unexpected message: {m:?} during COPY"); } + None => break, // client disconnected + }; + + trace!("query: {copy_data_bytes:?}"); + + // Trace request if needed + if let Some(t) = tracer.as_mut() { + t.trace(©_data_bytes) } + + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + + let response = match neon_fe_msg { + PagestreamFeMessage::Exists(req) => { + let _timer = metrics.get_rel_exists.start_timer(); + self.handle_get_rel_exists_request(&timeline, &req).await + } + PagestreamFeMessage::Nblocks(req) => { + let _timer = metrics.get_rel_size.start_timer(); + self.handle_get_nblocks_request(&timeline, &req).await + } + PagestreamFeMessage::GetPage(req) => { + let _timer = metrics.get_page_at_lsn.start_timer(); + self.handle_get_page_at_lsn_request(&timeline, &req).await + } + PagestreamFeMessage::DbSize(req) => { + let _timer = metrics.get_db_size.start_timer(); + self.handle_db_size_request(&timeline, &req).await + } + }; + + let response = response.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough + error!("error reading relation or page version: {:?}", e); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message(&BeMessage::CopyData(&response.serialize()))?; + pgb.flush().await?; } Ok(()) } + #[instrument(skip(self, pgb))] + async fn handle_import_basebackup( + &self, + pgb: &mut PostgresBackend, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + _end_lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result<()> { + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + // Create empty timeline + info!("creating new timeline"); + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.flush().await?; + + let copyin_stream = copyin_stream(pgb); + pin!(copyin_stream); + + timeline + .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .await?; + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
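+        // A rough sketch of that client-side check (names are illustrative):
+        //   psql -h $PAGESERVER -c "fullbackup $TENANT $TIMELINE $END_LSN" > roundtrip.tar
+        //   then compare roundtrip.tar against the base.tar that was imported.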
+ + info!("done"); + Ok(()) + } + + #[instrument(skip(self, pgb))] + async fn handle_import_wal( + &self, + pgb: &mut PostgresBackend, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + ) -> anyhow::Result<()> { + task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + + let timeline = get_local_timeline(tenant_id, timeline_id)?; + ensure!(timeline.get_last_record_lsn() == start_lsn); + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + pgb.write_message(&BeMessage::CopyInResponse)?; + pgb.flush().await?; + let mut copyin_stream = Box::pin(copyin_stream(pgb)); + let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); + tokio::task::block_in_place(|| { + import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) + })?; + info!("wal import complete"); + + // Drain the rest of the Copy data + let mut bytes_after_tar = 0; + while let Some(bytes) = copyin_stream.next().await { + bytes_after_tar += bytes?.len(); + } + if bytes_after_tar > 0 { + warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive"); + } + + // TODO Does it make sense to overshoot? + ensure!(timeline.get_last_record_lsn() >= end_lsn); + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + timeline.checkpoint(CheckpointConfig::Flush).await?; + + info!("done"); + Ok(()) + } + /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -395,11 +475,11 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &dyn Timeline, + async fn wait_or_get_last_lsn( + timeline: &Timeline, mut lsn: Lsn, latest: bool, - latest_gc_cutoff_lsn: &RwLockReadGuard, + latest_gc_cutoff_lsn: &RcuReadGuard, ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -423,7 +503,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -433,7 +513,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.wait_lsn(lsn).await?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -443,85 +523,106 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_rel_exists_request( &self, - timeline: &dyn Timeline, + timeline: &Timeline, req: &PagestreamExistsRequest, ) -> Result { - let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - - let tag = RelishTag::Relation(req.rel); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; - let exists = timeline.get_rel_exists(tag, lsn)?; + let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - fn handle_get_nblocks_request( + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + async fn handle_get_nblocks_request( &self, - timeline: &dyn Timeline, + timeline: &Timeline, req: &PagestreamNblocksRequest, ) -> Result { - let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); - let tag = RelishTag::Relation(req.rel); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; - let n_blocks = timeline.get_relish_size(tag, lsn)?; - - // Return 0 if relation is not found. - // This is what postgres smgr expects. - let n_blocks = n_blocks.unwrap_or(0); + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - fn handle_get_page_at_lsn_request( + #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + async fn handle_db_size_request( &self, - timeline: &dyn Timeline, + timeline: &Timeline, + req: &PagestreamDbSizeRequest, + ) -> Result { + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; + + let total_blocks = + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; + + let db_size = total_blocks as i64 * BLCKSZ as i64; + + Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { + db_size, + })) + } + + #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + async fn handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, req: &PagestreamGetPageRequest, ) -> Result { - let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) - .entered(); - let tag = RelishTag::Relation(req.rel); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; + let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) + .await?; /* - // Add a 1s delay to some requests. 
The delayed causes the requests to + // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. use rand::Rng; if rand::thread_rng().gen::() < 5 { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ - let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; + + // FIXME: this profiling now happens at different place than it used to. The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, })) } - fn handle_basebackup_request( + #[instrument(skip(self, pgb))] + async fn handle_basebackup_request( &self, pgb: &mut PostgresBackend, - timelineid: ZTimelineId, + tenant_id: TenantId, + timeline_id: TimelineId, lsn: Option, - tenantid: ZTenantId, + prev_lsn: Option, + full_backup: bool, ) -> anyhow::Result<()> { - let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); - let _enter = span.enter(); - // check that the timeline exists - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Cannot handle basebackup request for a remote timeline")?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { + // Backup was requested at a particular LSN. Wait for it to arrive. + info!("waiting for {}", lsn); + timeline.wait_lsn(lsn).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -529,50 +630,56 @@ impl PageServerHandler { // switch client to COPYOUT pgb.write_message(&BeMessage::CopyOutResponse)?; + pgb.flush().await?; /* Send a tarball of the latest layer on the timeline */ - { - let mut writer = CopyDataSink { pgb }; - let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; - span.record("lsn", &basebackup.lsn.to_string().as_str()); - basebackup.send_tarball()?; - } + let mut writer = CopyDataSink { + pgb, + rt: tokio::runtime::Handle::current(), + }; + tokio::task::block_in_place(|| { + let basebackup = + basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; + tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + basebackup.send_tarball() + })?; pgb.write_message(&BeMessage::CopyDone)?; - debug!("CopyDone sent!"); + pgb.flush().await?; + info!("basebackup complete"); Ok(()) } // when accessing management api supply None as an argument // when using to authorize tenant pass corresponding tenant id - fn check_permission(&self, tenantid: Option) -> Result<()> { + fn check_permission(&self, tenant_id: Option) -> Result<()> { if self.auth.is_none() { // auth is set to Trust, nothing to check so just return ok return Ok(()); } // auth is some, just checked above, when auth is some - // then claims are always present because of checks during connetion init + // then claims are always present because of checks during connection init // so this expect won't trigger let claims = self .claims .as_ref() .expect("claims presence already checked"); - auth::check_permission(claims, tenantid) + auth::check_permission(claims, tenant_id) } } -impl postgres_backend::Handler for PageServerHandler { +#[async_trait::async_trait] +impl 
postgres_backend_async::Handler for PageServerHandler { fn check_auth_jwt( &mut self, _pgb: &mut PostgresBackend, jwt_response: &[u8], ) -> anyhow::Result<()> { - // this unwrap is never triggered, because check_auth_jwt only called when auth_type is ZenithJWT + // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT // which requires auth to be present let data = self .auth .as_ref() - .as_ref() .unwrap() .decode(str::from_utf8(jwt_response)?)?; @@ -584,7 +691,7 @@ impl postgres_backend::Handler for PageServerHandler { } info!( - "jwt auth succeeded for scope: {:#?} by tenantid: {:?}", + "jwt auth succeeded for scope: {:#?} by tenant id: {:?}", data.claims.scope, data.claims.tenant_id, ); @@ -592,7 +699,7 @@ impl postgres_backend::Handler for PageServerHandler { Ok(()) } - fn process_query( + async fn process_query( &mut self, pgb: &mut PostgresBackend, query_string: &str, @@ -606,12 +713,13 @@ impl postgres_backend::Handler for PageServerHandler { params.len() == 2, "invalid param number for pagestream command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, timelineid, tenantid)?; + self.handle_pagerequests(pgb, tenant_id, timeline_id) + .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); let params = params_raw.split_whitespace().collect::>(); @@ -621,10 +729,10 @@ impl postgres_backend::Handler for PageServerHandler { "invalid param number for basebackup command" ); - let tenantid = ZTenantId::from_str(params[0])?; - let timelineid = ZTimelineId::from_str(params[1])?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; - self.check_permission(Some(tenantid))?; + self.check_permission(Some(tenant_id))?; let lsn = if params.len() == 3 { Some(Lsn::from_str(params[2])?) 
@@ -633,171 +741,203 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("callmemaybe ") { - // callmemaybe - // TODO lazy static - let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } + // return pair of prev_lsn and last_lsn + else if query_string.starts_with("get_last_record_rlsn ") { + let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); + let params = params_raw.split_whitespace().collect::>(); - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let connstr = caps.get(3).unwrap().as_str().to_owned(); + ensure!( + params.len() == 2, + "invalid param number for get_last_record_rlsn command" + ); - self.check_permission(Some(tenantid))?; + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; - let _enter = - info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); + self.check_permission(Some(tenant_id))?; + let timeline = get_local_timeline(tenant_id, timeline_id)?; + + let end_of_timeline = timeline.get_last_record_rlsn(); + + pgb.write_message(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? + .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } + // same as basebackup, but result includes relational data as well + else if query_string.starts_with("fullbackup ") { + let (_, params_raw) = query_string.split_at("fullbackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + + ensure!( + params.len() >= 2, + "invalid param number for fullbackup command" + ); + + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; + + // The caller is responsible for providing correct lsn and prev_lsn. + let lsn = if params.len() > 2 { + Some(Lsn::from_str(params[2])?) + } else { + None + }; + let prev_lsn = if params.len() > 3 { + Some(Lsn::from_str(params[3])?) + } else { + None + }; + + self.check_permission(Some(tenant_id))?; // Check that the timeline exists - tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Failed to fetch local timeline for callmemaybe requests")?; + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + .await?; + pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("import basebackup ") { + // Import the `base` section (everything but the wal) of a basebackup. + // Assumes the tenant already exists on this pageserver. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + // + // Example import command: + // 1. Get start/end LSN from backup_manifest file + // 2. 
Run: + // cat my_backup/base.tar | psql -h $PAGESERVER \ + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" + let (_, params_raw) = query_string.split_at("import basebackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 5); + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; + let base_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + let pg_version = u32::from_str(params[4])?; - walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?; + self.check_permission(Some(tenant_id))?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + match self + .handle_import_basebackup( + pgb, + tenant_id, + timeline_id, + base_lsn, + end_lsn, + pg_version, + ) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + } + }; + } else if query_string.starts_with("import wal ") { + // Import the `pg_wal` section of a basebackup. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + let (_, params_raw) = query_string.split_at("import wal ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 4); + let tenant_id = TenantId::from_str(params[0])?; + let timeline_id = TimelineId::from_str(params[1])?; + let start_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + + self.check_permission(Some(tenant_id))?; + + match self + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .await + { + Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); + pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))? + } + }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("failpoints ") { - let (_, failpoints) = query_string.split_at("failpoints ".len()); - for failpoint in failpoints.split(';') { - if let Some((name, actions)) = failpoint.split_once('=') { - info!("cfg failpoint: {} {}", name, actions); - fail::cfg(name, actions).unwrap(); - } else { - bail!("Invalid failpoints format"); - } - } - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("do_gc ") { - // Run GC immediately on given timeline. - // FIXME: This is just for tests. See test_runner/batch_others/test_gc.py. - // This probably should require special authentication or a global flag to - // enable, I don't think we want to or need to allow regular clients to invoke - // GC. 
+ pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("show ") { + // show + let (_, params_raw) = query_string.split_at("show ".len()); + let params = params_raw.split(' ').collect::>(); + ensure!(params.len() == 1, "invalid param number for config command"); + let tenant_id = TenantId::from_str(params[0])?; - // do_gc - let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?") - .unwrap(); + self.check_permission(Some(tenant_id))?; - let caps = re - .captures(query_string) - .with_context(|| format!("invalid do_gc: '{}'", query_string))?; - - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let gc_horizon: u64 = caps - .get(4) - .map(|h| h.as_str().parse()) - .unwrap_or(Ok(self.conf.gc_horizon))?; - - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?; - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layer_relfiles_total"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_relfiles_not_updated"), - RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_relfiles_removed"), - RowDescriptor::int8_col(b"layer_relfiles_dropped"), - RowDescriptor::int8_col(b"layer_nonrelfiles_total"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_nonrelfiles_removed"), - RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"), - RowDescriptor::int8_col(b"elapsed"), + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + pgb.write_message(&BeMessage::RowDescription(&[ + RowDescriptor::int8_col(b"checkpoint_distance"), + RowDescriptor::int8_col(b"checkpoint_timeout"), + RowDescriptor::int8_col(b"compaction_target_size"), + RowDescriptor::int8_col(b"compaction_period"), + RowDescriptor::int8_col(b"compaction_threshold"), + RowDescriptor::int8_col(b"gc_horizon"), + RowDescriptor::int8_col(b"gc_period"), + RowDescriptor::int8_col(b"image_creation_threshold"), + RowDescriptor::int8_col(b"pitr_interval"), ]))? 
- .write_message_noflush(&BeMessage::DataRow(&[ - Some(result.ondisk_relfiles_total.to_string().as_bytes()), + .write_message(&BeMessage::DataRow(&[ + Some(tenant.get_checkpoint_distance().to_string().as_bytes()), Some( - result - .ondisk_relfiles_needed_by_cutoff + tenant + .get_checkpoint_timeout() + .as_secs() .to_string() .as_bytes(), ), + Some(tenant.get_compaction_target_size().to_string().as_bytes()), Some( - result - .ondisk_relfiles_needed_by_branches + tenant + .get_compaction_period() + .as_secs() .to_string() .as_bytes(), ), - Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_relfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_relfiles_removed.to_string().as_bytes()), - Some(result.ondisk_relfiles_dropped.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_by_cutoff - .to_string() - .as_bytes(), - ), - Some( - result - .ondisk_nonrelfiles_needed_by_branches - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()), - Some(result.elapsed.as_millis().to_string().as_bytes()), + Some(tenant.get_compaction_threshold().to_string().as_bytes()), + Some(tenant.get_gc_horizon().to_string().as_bytes()), + Some(tenant.get_gc_period().as_secs().to_string().as_bytes()), + Some(tenant.get_image_creation_threshold().to_string().as_bytes()), + Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("checkpoint ") { - // Run checkpoint immediately on given timeline. - - // checkpoint - let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap(); - - let caps = re - .captures(query_string) - .with_context(|| format!("invalid checkpoint command: '{}'", query_string))?; - - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - - let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) - .context("Failed to fetch local timeline for checkpoint request")?; - - timeline.checkpoint(CheckpointConfig::Forced)?; - pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { bail!("unknown command"); } - pgb.flush()?; - Ok(()) } } +fn get_local_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> Result> { + tenant_mgr::get_tenant(tenant_id, true) + .and_then(|tenant| tenant.get_timeline(timeline_id, true)) +} + /// /// A std::io::Write implementation that wraps all data written to it in CopyData /// messages. 
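[Editor's note] Stepping back to the commands handled in process_query above: they arrive as plain libpq query strings, so they can be exercised with any Postgres client. For illustration only (host, port, tenant and timeline IDs are placeholders, following the same pattern as the import basebackup example in the comments above):

psql -h $PAGESERVER -p $PORT -c "get_last_record_rlsn $TENANT $TIMELINE"
psql -h $PAGESERVER -p $PORT -c "show $TENANT"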
/// struct CopyDataSink<'a> { pgb: &'a mut PostgresBackend, + rt: tokio::runtime::Handle, } impl<'a> io::Write for CopyDataSink<'a> { @@ -809,6 +949,7 @@ impl<'a> io::Write for CopyDataSink<'a> { // FIXME: flush isn't really required, but makes it easier // to view in wireshark self.pgb.write_message(&BeMessage::CopyData(data))?; + self.rt.block_on(self.pgb.flush())?; trace!("CopyData sent for {} bytes!", data.len()); Ok(data.len()) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs new file mode 100644 index 0000000000..0e334a63df --- /dev/null +++ b/pageserver/src/pgdatadir_mapping.rs @@ -0,0 +1,1553 @@ +//! +//! This provides an abstraction to store PostgreSQL relations and other files +//! in the key-value store that implements the Repository interface. +//! +//! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as +//! walingest.rs handles a few things like implicit relation creation and extension. +//! Clarify that) +//! +use crate::keyspace::{KeySpace, KeySpaceAccum}; +use crate::repository::*; +use crate::tenant::Timeline; +use crate::walrecord::NeonWalRecord; +use anyhow::{bail, ensure, Result}; +use bytes::{Buf, Bytes}; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::BLCKSZ; +use postgres_ffi::{Oid, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use std::collections::{hash_map, HashMap, HashSet}; +use std::ops::Range; +use tracing::{debug, trace, warn}; +use utils::{bin_ser::BeSer, lsn::Lsn}; + +/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. +pub type BlockNumber = u32; + +#[derive(Debug)] +pub enum LsnForTimestamp { + Present(Lsn), + Future(Lsn), + Past(Lsn), + NoData(Lsn), +} + +/// +/// This impl provides all the functionality to store PostgreSQL relations, SLRUs, +/// and other special kinds of files, in a versioned key-value store. The +/// Timeline struct provides the key-value store. +/// +/// This is a separate impl, so that we can easily include all these functions in a Timeline +/// implementation, and might be moved into a separate struct later. +impl Timeline { + /// Start ingesting a WAL record, or other atomic modification of + /// the timeline. + /// + /// This provides a transaction-like interface to perform a bunch + /// of modifications atomically. + /// + /// To ingest a WAL record, call begin_modification(lsn) to get a + /// DatadirModification object. Use the functions in the object to + /// modify the repository state, updating all the pages and metadata + /// that the WAL record affects. When you're done, call commit() to + /// commit the changes. + /// + /// Lsn stored in modification is advanced by `ingest_record` and + /// is used by `commit()` to update `last_record_lsn`. + /// + /// Calling commit() will flush all the changes and reset the state, + /// so the `DatadirModification` struct can be reused to perform the next modification. + /// + /// Note that any pending modifications you make through the + /// modification object won't be visible to calls to the 'get' and list + /// functions of the timeline until you finish! And if you update the + /// same page twice, the last update wins. 
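[Editor's note] To make the call pattern described in the comment above concrete, here is a minimal sketch of a caller, not part of this patch; `rel`, `blkno`, `img` and `lsn` are assumed to come from the caller, e.g. from a decoded WAL record:

fn ingest_one_page(
    timeline: &Timeline,
    rel: RelTag,
    blkno: BlockNumber,
    img: Bytes,
    lsn: Lsn,
) -> anyhow::Result<()> {
    // Start an atomic modification stamped with `lsn`.
    let mut modification = timeline.begin_modification(lsn);
    // Stage the page image; it is not visible to readers yet.
    modification.put_rel_page_image(rel, blkno, img)?;
    // Write out the staged updates and advance last_record_lsn.
    modification.commit()?;
    Ok(())
}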
+ /// + pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification + where + Self: Sized, + { + DatadirModification { + tline: self, + pending_updates: HashMap::new(), + pending_deletions: Vec::new(), + pending_nblocks: 0, + lsn, + } + } + + //------------------------------------------------------------------------------ + // Public GET functions + //------------------------------------------------------------------------------ + + /// Look up given page version. + pub fn get_rel_page_at_lsn( + &self, + tag: RelTag, + blknum: BlockNumber, + lsn: Lsn, + latest: bool, + ) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + let nblocks = self.get_rel_size(tag, lsn, latest)?; + if blknum >= nblocks { + debug!( + "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", + tag, blknum, lsn, nblocks + ); + return Ok(ZERO_PAGE.clone()); + } + + let key = rel_block_to_key(tag, blknum); + self.get(key, lsn) + } + + // Get size of a database in blocks + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { + let mut total_blocks = 0; + + let rels = self.list_rels(spcnode, dbnode, lsn)?; + + for rel in rels { + let n_blocks = self.get_rel_size(rel, lsn, latest)?; + total_blocks += n_blocks as usize; + } + Ok(total_blocks) + } + + /// Get size of a relation file + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + return Ok(nblocks); + } + + if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) + && !self.get_rel_exists(tag, lsn, latest)? + { + // FIXME: Postgres sometimes calls smgrcreate() to create + // FSM, and smgrnblocks() on it immediately afterwards, + // without extending it. Tolerate that by claiming that + // any non-existent FSM fork has size 0. + return Ok(0); + } + + let key = rel_size_to_key(tag); + let mut buf = self.get(key, lsn)?; + let nblocks = buf.get_u32_le(); + + if latest { + // Update relation size cache only if "latest" flag is set. + // This flag is set by compute when it is working with most recent version of relation. + // Typically master compute node always set latest=true. + // Please notice, that even if compute node "by mistake" specifies old LSN but set + // latest=true, then it can not cause cache corruption, because with latest=true + // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be + // associated with most recent value of LSN. + self.update_cached_rel_size(tag, lsn, nblocks); + } + Ok(nblocks) + } + + /// Does relation exist? + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { + ensure!(tag.relnode != 0, "invalid relnode"); + + // first try to lookup relation in cache + if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + return Ok(true); + } + // fetch directory listing + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); + let buf = self.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + + Ok(exists) + } + + /// Get a list of all existing relations in given tablespace and database. 
+ pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + // fetch directory listing + let key = rel_dir_to_key(spcnode, dbnode); + let buf = self.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); + + Ok(rels) + } + + /// Look up given SLRU page version. + pub fn get_slru_page_at_lsn( + &self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + lsn: Lsn, + ) -> Result { + let key = slru_block_to_key(kind, segno, blknum); + self.get(key, lsn) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_size( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> Result { + let key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(key, lsn)?; + Ok(buf.get_u32_le()) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + // fetch directory listing + let key = slru_dir_to_key(kind); + let buf = self.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + let exists = dir.segments.get(&segno).is_some(); + Ok(exists) + } + + /// Locate LSN, such that all transactions that committed before + /// 'search_timestamp' are visible, but nothing newer is. + /// + /// This is not exact. Commit timestamps are not guaranteed to be ordered, + /// so it's not well defined which LSN you get if there were multiple commits + /// "in flight" at that point in time. + /// + pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); + let min_lsn = *gc_cutoff_lsn_guard; + let max_lsn = self.get_last_record_lsn(); + + // LSNs are always 8-byte aligned. low/mid/high represent the + // LSN divided by 8. + let mut low = min_lsn.0 / 8; + let mut high = max_lsn.0 / 8 + 1; + + let mut found_smaller = false; + let mut found_larger = false; + while low < high { + // cannot overflow, high and low are both smaller than u64::MAX / 2 + let mid = (high + low) / 2; + + let cmp = self.is_latest_commit_timestamp_ge_than( + search_timestamp, + Lsn(mid * 8), + &mut found_smaller, + &mut found_larger, + )?; + + if cmp { + high = mid; + } else { + low = mid + 1; + } + } + match (found_smaller, found_larger) { + (false, false) => { + // This can happen if no commit records have been processed yet, e.g. + // just after importing a cluster. + Ok(LsnForTimestamp::NoData(max_lsn)) + } + (true, false) => { + // Didn't find any commit timestamps larger than the request + Ok(LsnForTimestamp::Future(max_lsn)) + } + (false, true) => { + // Didn't find any commit timestamps smaller than the request + Ok(LsnForTimestamp::Past(max_lsn)) + } + (true, true) => { + // low is the LSN of the first commit record *after* the search_timestamp, + // Back off by one to get to the point just before the commit. + // + // FIXME: it would be better to get the LSN of the previous commit. + // Otherwise, if you restore to the returned LSN, the database will + // include physical changes from later commits that will be marked + // as aborted, and will need to be vacuumed away. + Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + } + } + } + + /// + /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any + /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'. 
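[Editor's note] For context, a sketch of how a caller might act on the LsnForTimestamp variants returned above; this helper is hypothetical and the error policy is an assumption, not something defined in this patch:

fn resolve_pitr_lsn(timeline: &Timeline, ts: TimestampTz) -> anyhow::Result<Lsn> {
    match timeline.find_lsn_for_timestamp(ts)? {
        // A commit boundary bracketing `ts` was found.
        LsnForTimestamp::Present(lsn) => Ok(lsn),
        // Nothing committed after `ts`; the end of the timeline is a reasonable answer.
        LsnForTimestamp::Future(lsn) => Ok(lsn),
        // `ts` predates the retained history.
        LsnForTimestamp::Past(_) => anyhow::bail!("timestamp is older than the retained history"),
        // No commit records processed yet, e.g. right after an import.
        LsnForTimestamp::NoData(_) => anyhow::bail!("no commit timestamps available yet"),
    }
}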
+ /// + /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits + /// with a smaller/larger timestamp. + /// + pub fn is_latest_commit_timestamp_ge_than( + &self, + search_timestamp: TimestampTz, + probe_lsn: Lsn, + found_smaller: &mut bool, + found_larger: &mut bool, + ) -> Result { + for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? { + let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + for blknum in (0..nblocks).rev() { + let clog_page = + self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + + if clog_page.len() == BLCKSZ as usize + 8 { + let mut timestamp_bytes = [0u8; 8]; + timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]); + let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); + + if timestamp >= search_timestamp { + *found_larger = true; + return Ok(true); + } else { + *found_smaller = true; + } + } + } + } + Ok(false) + } + + /// Get a list of SLRU segments + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + // fetch directory entry + let key = slru_dir_to_key(kind); + + let buf = self.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + Ok(dir.segments) + } + + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + let key = relmap_file_key(spcnode, dbnode); + + let buf = self.get(key, lsn)?; + Ok(buf) + } + + pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.get(DBDIR_KEY, lsn)?; + let dir = DbDirectory::des(&buf)?; + + Ok(dir.dbdirs) + } + + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + let key = twophase_file_key(xid); + let buf = self.get(key, lsn)?; + Ok(buf) + } + + pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; + let dir = TwoPhaseDirectory::des(&buf)?; + + Ok(dir.xids) + } + + pub fn get_control_file(&self, lsn: Lsn) -> Result { + self.get(CONTROLFILE_KEY, lsn) + } + + pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.get(CHECKPOINT_KEY, lsn) + } + + /// Does the same as get_current_logical_size but counted on demand. + /// Used to initialize the logical size tracking on startup. + /// + /// Only relation blocks are counted currently. That excludes metadata, + /// SLRUs, twophase files etc. + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + // Fetch list of database dirs and iterate them + let buf = self.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut total_size: u64 = 0; + for (spcnode, dbnode) in dbdir.dbdirs.keys() { + for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + total_size += relsize as u64; + } + } + Ok(total_size * BLCKSZ as u64) + } + + /// + /// Get a KeySpace that covers all the Keys that are in use at the given LSN. + /// Anything that's not listed maybe removed from the underlying storage (from + /// that LSN forwards). 
+ pub fn collect_keyspace(&self, lsn: Lsn) -> Result { + // Iterate through key ranges, greedily packing them into partitions + let mut result = KeySpaceAccum::new(); + + // The dbdir metadata always exists + result.add_key(DBDIR_KEY); + + // Fetch list of database dirs and iterate them + let buf = self.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); + dbs.sort_unstable(); + for (spcnode, dbnode) in dbs { + result.add_key(relmap_file_key(spcnode, dbnode)); + result.add_key(rel_dir_to_key(spcnode, dbnode)); + + let mut rels: Vec = self + .list_rels(spcnode, dbnode, lsn)? + .iter() + .cloned() + .collect(); + rels.sort_unstable(); + for rel in rels { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); + result.add_key(relsize_key); + } + } + + // Iterate SLRUs next + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn)?; + let segsize = buf.get_u32_le(); + + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } + } + + // Then pg_twophase + result.add_key(TWOPHASEDIR_KEY); + let buf = self.get(TWOPHASEDIR_KEY, lsn)?; + let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + xids.sort_unstable(); + for xid in xids { + result.add_key(twophase_file_key(xid)); + } + + result.add_key(CONTROLFILE_KEY); + result.add_key(CHECKPOINT_KEY); + + Ok(result.to_keyspace()) + } + + /// Get cached size of relation if it not updated after specified LSN + pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { + let rel_size_cache = self.rel_size_cache.read().unwrap(); + if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if lsn >= *cached_lsn { + return Some(*nblocks); + } + } + None + } + + /// Update cached relation size if there is no more recent update + pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + match rel_size_cache.entry(tag) { + hash_map::Entry::Occupied(mut entry) => { + let cached_lsn = entry.get_mut(); + if lsn >= cached_lsn.0 { + *cached_lsn = (lsn, nblocks); + } + } + hash_map::Entry::Vacant(entry) => { + entry.insert((lsn, nblocks)); + } + } + } + + /// Store cached relation size + pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.insert(tag, (lsn, nblocks)); + } + + /// Remove cached relation size + pub fn remove_cached_rel_size(&self, tag: &RelTag) { + let mut rel_size_cache = self.rel_size_cache.write().unwrap(); + rel_size_cache.remove(tag); + } +} + +/// DatadirModification represents an operation to ingest an atomic set of +/// updates to the repository. It is created by the 'begin_record' +/// function. 
It is called for each WAL record, so that all the modifications +/// by a one WAL record appear atomic. +pub struct DatadirModification<'a> { + /// The timeline this modification applies to. You can access this to + /// read the state, but note that any pending updates are *not* reflected + /// in the state in 'tline' yet. + pub tline: &'a Timeline, + + /// Lsn assigned by begin_modification + pub lsn: Lsn, + + // The modifications are not applied directly to the underlying key-value store. + // The put-functions add the modifications here, and they are flushed to the + // underlying key-value store by the 'finish' function. + pending_updates: HashMap, + pending_deletions: Vec>, + pending_nblocks: i64, +} + +impl<'a> DatadirModification<'a> { + /// Initialize a completely new repository. + /// + /// This inserts the directory metadata entries that are assumed to + /// always exist. + pub fn init_empty(&mut self) -> Result<()> { + let buf = DbDirectory::ser(&DbDirectory { + dbdirs: HashMap::new(), + })?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); + + let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); + let empty_dir = Value::Image(buf); + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + + Ok(()) + } + + /// Put a new page version that can be constructed from a WAL record + /// + /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the + /// current end-of-file. It's up to the caller to check that the relation size + /// matches the blocks inserted! + pub fn put_rel_wal_record( + &mut self, + rel: RelTag, + blknum: BlockNumber, + rec: NeonWalRecord, + ) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); + Ok(()) + } + + // Same, but for an SLRU. + pub fn put_slru_wal_record( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + rec: NeonWalRecord, + ) -> Result<()> { + self.put( + slru_block_to_key(kind, segno, blknum), + Value::WalRecord(rec), + ); + Ok(()) + } + + /// Like put_wal_record, but with ready-made image of the page. + pub fn put_rel_page_image( + &mut self, + rel: RelTag, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + self.put(rel_block_to_key(rel, blknum), Value::Image(img)); + Ok(()) + } + + pub fn put_slru_page_image( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + Ok(()) + } + + /// Store a relmapper file (pg_filenode.map) in the repository + pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + // Add it to the directory (if it doesn't exist already) + let buf = self.get(DBDIR_KEY)?; + let mut dbdir = DbDirectory::des(&buf)?; + + let r = dbdir.dbdirs.insert((spcnode, dbnode), true); + if r == None || r == Some(false) { + // The dbdir entry didn't exist, or it contained a + // 'false'. The 'insert' call already updated it with + // 'true', now write the updated 'dbdirs' map back. 
+ let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } + if r == None { + // Create RelDirectory + let buf = RelDirectory::ser(&RelDirectory { + rels: HashSet::new(), + })?; + self.put( + rel_dir_to_key(spcnode, dbnode), + Value::Image(Bytes::from(buf)), + ); + } + + self.put(relmap_file_key(spcnode, dbnode), Value::Image(img)); + Ok(()) + } + + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + // Add it to the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + if !dir.xids.insert(xid) { + bail!("twophase file for xid {} already exists", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + self.put(twophase_file_key(xid), Value::Image(img)); + Ok(()) + } + + pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + self.put(CONTROLFILE_KEY, Value::Image(img)); + Ok(()) + } + + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + self.put(CHECKPOINT_KEY, Value::Image(img)); + Ok(()) + } + + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + let req_lsn = self.tline.get_last_record_lsn(); + + let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + + // Remove entry from dbdir + let buf = self.get(DBDIR_KEY)?; + let mut dir = DbDirectory::des(&buf)?; + if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { + let buf = DbDirectory::ser(&dir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } else { + warn!( + "dropped dbdir for spcnode {} dbnode {} did not exist in db directory", + spcnode, dbnode + ); + } + + // Update logical database size. + self.pending_nblocks -= total_blocks as i64; + + // Delete all relations and metadata files for the spcnode/dnode + self.delete(dbdir_key_range(spcnode, dbnode)); + Ok(()) + } + + /// Create a relation fork. + /// + /// 'nblocks' is the initial size. + pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + // It's possible that this is the first rel for this db in this + // tablespace. Create the reldir entry for it if so. + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { + // Didn't exist. Update dbdir + dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key)?)? + }; + + // Add the new relation to the rel directory entry, and write it back + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { + bail!("rel {} already exists", rel); + } + self.put( + rel_dir_key, + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), + ); + + // Put size + let size_key = rel_size_to_key(rel); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + self.pending_nblocks += nblocks as i64; + + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + + // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the + // caller. 
+ Ok(()) + } + + /// Truncate relation + pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + let last_lsn = self.tline.get_last_record_lsn(); + if self.tline.get_rel_exists(rel, last_lsn, true)? { + let size_key = rel_size_to_key(rel); + // Fetch the old size first + let old_size = self.get(size_key)?.get_u32_le(); + + // Update the entry with the new size. + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + + // Update logical database size. + self.pending_nblocks -= old_size as i64 - nblocks as i64; + } + Ok(()) + } + + /// Extend relation + /// If new size is smaller, do nothing. + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + + // Put size + let size_key = rel_size_to_key(rel); + let old_size = self.get(size_key)?.get_u32_le(); + + // only extend relation here. never decrease the size + if nblocks > old_size { + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // Update relation size cache + self.tline.set_cached_rel_size(rel, self.lsn, nblocks); + + self.pending_nblocks += nblocks as i64 - old_size as i64; + } + Ok(()) + } + + /// Drop a relation. + pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { + ensure!(rel.relnode != 0, "invalid relnode"); + + // Remove it from the directory entry + let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let buf = self.get(dir_key)?; + let mut dir = RelDirectory::des(&buf)?; + + if dir.rels.remove(&(rel.relnode, rel.forknum)) { + self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + } else { + warn!("dropped rel {} did not exist in rel directory", rel); + } + + // update logical size + let size_key = rel_size_to_key(rel); + let old_size = self.get(size_key)?.get_u32_le(); + self.pending_nblocks -= old_size as i64; + + // Remove enty from relation size cache + self.tline.remove_cached_rel_size(&rel); + + // Delete size entry, as well as all blocks + self.delete(rel_key_range(rel)); + + Ok(()) + } + + pub fn put_slru_segment_creation( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Add it to the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.insert(segno) { + bail!("slru segment {:?}/{} already exists", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // even if nblocks > 0, we don't insert any actual blocks here + + Ok(()) + } + + /// Extend SLRU segment + pub fn put_slru_extend( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { 
+ // Remove it from the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.remove(&segno) { + warn!("slru segment {:?}/{} does not exist", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Delete size entry, as well as all blocks + self.delete(slru_segment_key_range(kind, segno)); + + Ok(()) + } + + /// Drop a relmapper file (pg_filenode.map) + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + // TODO + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + // Remove it from the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + // Delete it + self.delete(twophase_key_range(xid)); + + Ok(()) + } + + /// + /// Flush changes accumulated so far to the underlying repository. + /// + /// Usually, changes made in DatadirModification are atomic, but this allows + /// you to flush them to the underlying repository before the final `commit`. + /// That allows to free up the memory used to hold the pending changes. + /// + /// Currently only used during bulk import of a data directory. In that + /// context, breaking the atomicity is OK. If the import is interrupted, the + /// whole import fails and the timeline will be deleted anyway. + /// (Or to be precise, it will be left behind for debugging purposes and + /// ignored, see https://github.com/neondatabase/neon/pull/1809) + /// + /// Note: A consequence of flushing the pending operations is that they + /// won't be visible to subsequent operations until `commit`. The function + /// retains all the metadata, but data pages are flushed. That's again OK + /// for bulk import, where you are just loading data pages and won't try to + /// modify the same pages twice. + pub fn flush(&mut self) -> Result<()> { + // Unless we have accumulated a decent amount of changes, it's not worth it + // to scan through the pending_updates list. + let pending_nblocks = self.pending_nblocks; + if pending_nblocks < 10000 { + return Ok(()); + } + + let writer = self.tline.writer(); + + // Flush relation and SLRU data blocks, keep metadata. + let mut result: Result<()> = Ok(()); + self.pending_updates.retain(|&key, value| { + if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { + result = writer.put(key, self.lsn, value); + false + } else { + true + } + }); + result?; + + if pending_nblocks != 0 { + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); + self.pending_nblocks = 0; + } + + Ok(()) + } + + /// + /// Finish this atomic update, writing all the updated keys to the + /// underlying timeline. + /// All the modifications in this atomic update are stamped by the specified LSN. + /// + pub fn commit(&mut self) -> anyhow::Result<()> { + let writer = self.tline.writer(); + let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; + self.pending_nblocks = 0; + + for (key, value) in self.pending_updates.drain() { + writer.put(key, lsn, &value)?; + } + for key_range in self.pending_deletions.drain(..) 
{ + writer.delete(key_range, lsn)?; + } + + writer.finish_write(lsn); + + if pending_nblocks != 0 { + writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); + } + + Ok(()) + } + + // Internal helper functions to batch the modifications + + fn get(&self, key: Key) -> Result { + // Have we already updated the same key? Read the pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(value) = self.pending_updates.get(&key) { + if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + bail!("unexpected pending WAL record"); + } + } else { + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn) + } + } + + fn put(&mut self, key: Key, val: Value) { + self.pending_updates.insert(key, val); + } + + fn delete(&mut self, key_range: Range) { + trace!("DELETE {}-{}", key_range.start, key_range.end); + self.pending_deletions.push(key_range); + } +} + +//--- Metadata structs stored in key-value pairs in the repository. + +#[derive(Debug, Serialize, Deserialize)] +struct DbDirectory { + // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) + dbdirs: HashMap<(Oid, Oid), bool>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectory { + xids: HashSet, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + rels: HashSet<(Oid, u8)>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct RelSizeEntry { + nblocks: u32, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct SlruSegmentDirectory { + // Set of SLRU segments that exist. + segments: HashSet, +} + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); + +// Layout of the Key address space +// +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. +// +// Principles for the mapping: +// +// - Things that are often accessed or modified together, should be close to +// each other in the key space. For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. 
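[Editor's note] As a concrete illustration of the mapping laid out in this comment, using arbitrary example OIDs (the values are not taken from this patch), the helper functions defined further down would produce:

// rel_block_to_key(RelTag { spcnode: 1663, dbnode: 13008, relnode: 16384, forknum: 0 }, 7)
//   -> Key { field1: 0x00, field2: 1663, field3: 13008, field4: 16384, field5: 0, field6: 7 }
// rel_size_to_key(RelTag { spcnode: 1663, dbnode: 13008, relnode: 16384, forknum: 0 })
//   -> Key { field1: 0x00, field2: 1663, field3: 13008, field4: 16384, field5: 0, field6: 0xffffffff }
// so the size entry sorts immediately after the last block of that relation fork, as described above.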
+// +// The key space is divided into four major sections, identified by the first +// byte, and the form a hierarchy: +// +// 00 Relation data and metadata +// +// DbDir () -> (dbnode, spcnode) +// Filenodemap +// RelDir -> relnode forknum +// RelBlocks +// RelSize +// +// 01 SLRUs +// +// SlruDir kind +// SlruSegBlocks segno +// SlruSegSize +// +// 02 pg_twophase +// +// 03 misc +// controlfile +// checkpoint +// pg_version +// +// Below is a full list of the keyspace allocation: +// +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00 FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 +//-- Section 01: relation data and metadata + +const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + } +} + +fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0xffffffff, + } +} + +fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +//-- Section 02: SLRUs + +fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + 
SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffffffff, + } +} + +fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 1, + field6: 0, + } +} + +//-- Section 03: pg_twophase + +const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: if overflowed { 1 } else { 0 }, + field6: next_xid, + } +} + +//-- Section 03: Control file +const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +// Reverse mappings for a few Keys. +// These are needed by WAL redo manager. + +pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { + Ok(match key.field1 { + 0x00 => ( + RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }, + key.field6, + ), + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +fn is_rel_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 +} + +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + +pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { + Ok(match key.field1 { + 0x01 => { + let kind = match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + }; + let segno = key.field4; + let blknum = key.field6; + + (kind, segno, blknum) + } + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +fn is_slru_block_key(key: Key) -> bool { + key.field1 == 0x01 // SLRU-related + && key.field3 == 0x00000001 // but not SlruDir + && key.field6 != 0xffffffff // and not SlruSegSize +} + +#[cfg(test)] +pub fn create_test_timeline( + tenant: &crate::tenant::Tenant, + timeline_id: utils::id::TimelineId, + pg_version: u32, +) -> Result> { + let tline = tenant + .create_empty_timeline(timeline_id, Lsn(8), pg_version)? 
+ .initialize()?; + let mut m = tline.begin_modification(Lsn(8)); + m.init_empty()?; + m.commit()?; + Ok(tline) +} + +#[allow(clippy::bool_assert_comparison)] +#[cfg(test)] +mod tests { + //use super::repo_harness::*; + //use super::*; + + /* + fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { + let incremental = timeline.get_current_logical_size(); + let non_incremental = timeline + .get_current_logical_size_non_incremental(lsn) + .unwrap(); + assert_eq!(incremental, non_incremental); + } + */ + + /* + /// + /// Test list_rels() function, with branches and dropped relations + /// + #[test] + fn test_list_rels_drop() -> Result<()> { + let repo = RepoHarness::create("test_list_rels_drop")?.load(); + let tline = create_empty_timeline(repo, TIMELINE_ID)?; + const TESTDB: u32 = 111; + + // Import initial dummy checkpoint record, otherwise the get_timeline() call + // after branching fails below + let mut writer = tline.begin_record(Lsn(0x10)); + writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; + writer.finish()?; + + // Create a relation on the timeline + let mut writer = tline.begin_record(Lsn(0x20)); + writer.put_rel_page_image(TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + writer.finish()?; + + let writer = tline.begin_record(Lsn(0x00)); + writer.finish()?; + + // Check that list_rels() lists it after LSN 2, but no before it + assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); + + // Create a branch, check that the relation is visible there + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; + let newtline = DatadirTimelineImpl::new(newtline); + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? + .contains(&TESTREL_A)); + + // Drop it on the branch + let mut new_writer = newtline.begin_record(Lsn(0x40)); + new_writer.drop_relation(TESTREL_A)?; + new_writer.finish()?; + + // Check that it's no longer listed on the branch after the point where it was dropped + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? + .contains(&TESTREL_A)); + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + // Run checkpoint and garbage collection and check that it's still not visible + newtline.checkpoint(CheckpointConfig::Forced)?; + repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; + + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + Ok(()) + } + */ + + /* + #[test] + fn test_read_beyond_eof() -> Result<()> { + let repo = RepoHarness::create("test_read_beyond_eof")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + + make_some_layers(&tline, Lsn(0x20))?; + let mut writer = tline.begin_record(Lsn(0x60)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_A, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x60))), + )?; + writer.finish()?; + + // Test read before rel creation. Should error out. + assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err()); + + // Read block beyond end of relation at different points in time. + // These reads should fall into different delta, image, and in-memory layers. 
+ assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE); + + // Test on an in-memory layer with no preceding layer + let mut writer = tline.begin_record(Lsn(0x70)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_B, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), + )?; + writer.finish()?; + + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?6, ZERO_PAGE); + + Ok(()) + } + */ +} diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs new file mode 100644 index 0000000000..ad896cfa30 --- /dev/null +++ b/pageserver/src/profiling.rs @@ -0,0 +1,107 @@ +//! +//! Support for profiling +//! +//! This relies on a modified version of the 'pprof-rs' crate. That's not very +//! nice, so to avoid a hard dependency on that, this is an optional feature. +//! +use crate::config::{PageServerConf, ProfilingConfig}; + +/// The actual implementation is in the `profiling_impl` submodule. If the profiling +/// feature is not enabled, it's just a dummy implementation that panics if you +/// try to enabled profiling in the configuration. +pub use profiling_impl::*; + +#[cfg(feature = "profiling")] +mod profiling_impl { + use super::*; + use pprof; + use std::marker::PhantomData; + + /// Start profiling the current thread. Returns a guard object; + /// the profiling continues until the guard is dropped. + /// + /// Note: profiling is not re-entrant. If you call 'profpoint_start' while + /// profiling is already started, nothing happens, and the profiling will be + /// stopped when either guard object is dropped. + #[inline] + pub fn profpoint_start( + conf: &crate::config::PageServerConf, + point: ProfilingConfig, + ) -> Option { + if conf.profiling == point { + pprof::start_profiling(); + Some(ProfilingGuard(PhantomData)) + } else { + None + } + } + + /// A hack to remove Send and Sync from the ProfilingGuard. Because the + /// profiling is attached to current thread. + //// + /// See comments in https://github.com/rust-lang/rust/issues/68318 + type PhantomUnsend = std::marker::PhantomData<*mut u8>; + + pub struct ProfilingGuard(PhantomUnsend); + + impl Drop for ProfilingGuard { + fn drop(&mut self) { + pprof::stop_profiling(); + } + } + + /// Initialize the profiler. This must be called before any 'profpoint_start' calls. + pub fn init_profiler(conf: &PageServerConf) -> Option { + if conf.profiling != ProfilingConfig::Disabled { + Some(pprof::ProfilerGuardBuilder::default().build().unwrap()) + } else { + None + } + } + + /// Exit the profiler. Writes the flamegraph to current workdir. 
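[Editor's note] The guard-based API above would typically be used along these lines (a sketch only; `ProfilingConfig::PageRequests` is a guessed variant name, not something shown in this diff):

// let _profiling_guard = profpoint_start(conf, ProfilingConfig::PageRequests);
// // ... handle the request; profiling of this thread stops when the guard is dropped,
// // which happens implicitly at the end of the enclosing scope.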
+ pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option) { + // Write out the flamegraph + if let Some(profiler_guard) = profiler_guard { + if let Ok(report) = profiler_guard.report().build() { + // this gets written under the workdir + let file = std::fs::File::create("flamegraph.svg").unwrap(); + let mut options = pprof::flamegraph::Options::default(); + options.image_width = Some(2500); + report.flamegraph_with_options(file, &mut options).unwrap(); + } + } + } +} + +/// Dummy implementation when compiling without profiling feature or for non-linux OSes. +#[cfg(not(feature = "profiling"))] +mod profiling_impl { + use super::*; + + pub struct DummyProfilerGuard; + + impl Drop for DummyProfilerGuard { + fn drop(&mut self) { + // do nothing, this exists to calm Clippy down + } + } + + pub fn profpoint_start( + _conf: &PageServerConf, + _point: ProfilingConfig, + ) -> Option { + None + } + + pub fn init_profiler(conf: &PageServerConf) -> Option { + if conf.profiling != ProfilingConfig::Disabled { + // shouldn't happen, we don't allow profiling in the config if the support + // for it is disabled. + panic!("profiling enabled but the binary was compiled without profiling support"); + } + None + } + + pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option) {} +} diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs deleted file mode 100644 index 9228829aef..0000000000 --- a/pageserver/src/relish.rs +++ /dev/null @@ -1,226 +0,0 @@ -//! -//! Zenith stores PostgreSQL relations, and some other files, in the -//! repository. The relations (i.e. tables and indexes) take up most -//! of the space in a typical installation, while the other files are -//! small. We call each relation and other file that is stored in the -//! repository a "relish". It comes from "rel"-ish, as in "kind of a -//! rel", because it covers relations as well as other things that are -//! not relations, but are treated similarly for the purposes of the -//! storage layer. -//! -//! This source file contains the definition of the RelishTag struct, -//! which uniquely identifies a relish. -//! -//! Relishes come in two flavors: blocky and non-blocky. Relations and -//! SLRUs are blocky, that is, they are divided into 8k blocks, and -//! the repository tracks their size. Other relishes are non-blocky: -//! the content of the whole relish is stored as one blob. Block -//! number must be passed as 0 for all operations on a non-blocky -//! relish. The one "block" that you store in a non-blocky relish can -//! have arbitrary size, but they are expected to be small, or you -//! will have performance issues. -//! -//! All relishes are versioned by LSN in the repository. -//! - -use serde::{Deserialize, Serialize}; -use std::fmt; - -use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{Oid, TransactionId}; - -/// -/// RelishTag identifies one relish. -/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub enum RelishTag { - // Relations correspond to PostgreSQL relation forks. Each - // PostgreSQL relation fork is considered a separate relish. - Relation(RelTag), - - // SLRUs include pg_clog, pg_multixact/members, and - // pg_multixact/offsets. There are other SLRUs in PostgreSQL, but - // they don't need to be stored permanently (e.g. pg_subtrans), - // or we do not support them in zenith yet (pg_commit_ts). - // - // These are currently never requested directly by the compute - // nodes, although in principle that would be possible. 
However, - // when a new compute node is created, these are included in the - // tarball that we send to the compute node to initialize the - // PostgreSQL data directory. - // - // Each SLRU segment in PostgreSQL is considered a separate - // relish. For example, pg_clog/0000, pg_clog/0001, and so forth. - // - // SLRU segments are divided into blocks, like relations. - Slru { slru: SlruKind, segno: u32 }, - - // Miscellaneous other files that need to be included in the - // tarball at compute node creation. These are non-blocky, and are - // expected to be small. - - // - // FileNodeMap represents PostgreSQL's 'pg_filenode.map' - // files. They are needed to map catalog table OIDs to filenode - // numbers. Usually the mapping is done by looking up a relation's - // 'relfilenode' field in the 'pg_class' system table, but that - // doesn't work for 'pg_class' itself and a few other such system - // relations. See PostgreSQL relmapper.c for details. - // - // Each database has a map file for its local mapped catalogs, - // and there is a separate map file for shared catalogs. - // - // These files are always 512 bytes long (although we don't check - // or care about that in the page server). - // - FileNodeMap { spcnode: Oid, dbnode: Oid }, - - // - // State files for prepared transactions (e.g pg_twophase/1234) - // - TwoPhase { xid: TransactionId }, - - // The control file, stored in global/pg_control - ControlFile, - - // Special entry that represents PostgreSQL checkpoint. It doesn't - // correspond to to any physical file in PostgreSQL, but we use it - // to track fields needed to restore the checkpoint data in the - // control file, when a compute node is created. - Checkpoint, -} - -impl RelishTag { - pub const fn is_blocky(&self) -> bool { - match self { - // These relishes work with blocks - RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true, - - // and these don't - RelishTag::FileNodeMap { - spcnode: _, - dbnode: _, - } - | RelishTag::TwoPhase { xid: _ } - | RelishTag::ControlFile - | RelishTag::Checkpoint => false, - } - } - - // Physical relishes represent files and use - // RelationSizeEntry to track existing and dropped files. - // They can be both blocky and non-blocky. - pub const fn is_physical(&self) -> bool { - match self { - // These relishes represent physical files - RelishTag::Relation(_) - | RelishTag::Slru { .. } - | RelishTag::FileNodeMap { .. } - | RelishTag::TwoPhase { .. } => true, - - // and these don't - RelishTag::ControlFile | RelishTag::Checkpoint => false, - } - } - - // convenience function to check if this relish is a normal relation. - pub const fn is_relation(&self) -> bool { - matches!(self, RelishTag::Relation(_)) - } -} - -/// -/// Relation data file segment id throughout the Postgres cluster. -/// -/// Every data file in Postgres is uniquely identified by 4 numbers: -/// - relation id / node (`relnode`) -/// - database id (`dbnode`) -/// - tablespace id (`spcnode`), in short this is a unique id of a separate -/// directory to store data files. -/// - forknumber (`forknum`) is used to split different kinds of data of the same relation -/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`). -/// -/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value -/// are used for the same purpose. -/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). 
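A small sketch of how these identifiers compose, using the types from this (now removed) file; the OID values are arbitrary examples:

// Arbitrary example values; only the structure matters.
let rel = RelishTag::Relation(RelTag {
    spcnode: 1663,  // pg_default tablespace
    dbnode: 13010,  // a database OID
    relnode: 16384, // the relation's relfilenode
    forknum: 0,     // main fork
});
assert!(rel.is_blocky());   // relations are tracked as 8k blocks
assert!(rel.is_relation());

let clog_segment = RelishTag::Slru { slru: SlruKind::Clog, segno: 1 };
assert_eq!(clog_segment.to_string(), "pg_xact/0001");

// Non-blocky relishes are stored as one blob; block number must be passed as 0.
assert!(!RelishTag::Checkpoint.is_blocky());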
-/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] -pub struct RelTag { - pub forknum: u8, - pub spcnode: Oid, - pub dbnode: Oid, - pub relnode: Oid, -} - -/// Display RelTag in the same format that's used in most PostgreSQL debug messages: -/// -/// //[_fsm|_vm|_init] -/// -impl fmt::Display for RelTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(forkname) = forknumber_to_name(self.forknum) { - write!( - f, - "{}/{}/{}_{}", - self.spcnode, self.dbnode, self.relnode, forkname - ) - } else { - write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode) - } - } -} - -/// Display RelTag in the same format that's used in most PostgreSQL debug messages: -/// -/// //[_fsm|_vm|_init] -/// -impl fmt::Display for RelishTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - RelishTag::Relation(rel) => rel.fmt(f), - RelishTag::Slru { slru, segno } => { - // e.g. pg_clog/0001 - write!(f, "{}/{:04X}", slru.to_str(), segno) - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - write!(f, "relmapper file for spc {} db {}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => { - write!(f, "pg_twophase/{:08X}", xid) - } - RelishTag::ControlFile => { - write!(f, "control file") - } - RelishTag::Checkpoint => { - write!(f, "checkpoint") - } - } - } -} - -/// -/// Non-relation transaction status files (clog (a.k.a. pg_xact) and -/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, -/// hence the name. -/// -/// These files are global for a postgres instance. -/// -/// These files are divided into segments, which are divided into -/// pages of the same BLCKSZ as used for relation files. -/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub enum SlruKind { - Clog, - MultiXactMembers, - MultiXactOffsets, -} - -impl SlruKind { - pub fn to_str(&self) -> &'static str { - match self { - Self::Clog => "pg_xact", - Self::MultiXactMembers => "pg_multixact/members", - Self::MultiXactOffsets => "pg_multixact/offsets", - } - } -} diff --git a/pageserver/src/remote_storage.rs b/pageserver/src/remote_storage.rs deleted file mode 100644 index 4af1f8ed56..0000000000 --- a/pageserver/src/remote_storage.rs +++ /dev/null @@ -1,358 +0,0 @@ -//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage. -//! This particular module serves as a public API border between pageserver and the internal storage machinery. -//! No other modules from this tree are supposed to be used directly by the external code. -//! -//! There are a few components the storage machinery consists of: -//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: -//! * [`local_fs`] allows to use local file system as an external storage -//! * [`rust_s3`] uses AWS S3 bucket as an external storage -//! -//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. -//! Synchronization internals are split into submodules -//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives -//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files -//! 
* [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively -//! -//! * public API via to interact with the external world: -//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization -//! * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks, -//! to be processed by the async loop -//! -//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: -//! -//! +------------------------+ +--------->-------+ -//! | | - - - (init async loop) - - - -> | | -//! | | | | -//! | | -------------------------------> | async | -//! | pageserver | (enqueue timeline sync task) | upload/download | -//! | | | loop | -//! | | <------------------------------- | | -//! | | (apply new timeline sync states) | | -//! +------------------------+ +---------<-------+ -//! | -//! | -//! CRUD layer file operations | -//! (upload/download/delete/list, etc.) | -//! V -//! +------------------------+ -//! | | -//! | [`RemoteStorage`] impl | -//! | | -//! | pageserver assumes it | -//! | owns exclusive write | -//! | access to this storage | -//! +------------------------+ -//! -//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so. -//! The loop inits the storage connection and checks the remote files stored. -//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). -//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can -//! query their downloads later if they are accessed. -//! -//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. -//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint. -//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either). -//! See [`crate::layered_repository`] for the upload calls and the adjacent logic. -//! -//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`], -//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state. -//! Such submissions happen in two cases: -//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future -//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory -//! -//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits. -//! -//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`). -//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed -//! by the storage upload, if enabled. -//! Yet timeline cannot alter already existing files, and normally cannot remote those too: only a GC process is capable of removing unused files. -//! 
This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": -//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state -//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten -//! when the newer image is downloaded -//! -//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata. -//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues. -//! -//! NOTES: -//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage -//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API. -//! -//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast. -//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. - -mod local_fs; -mod rust_s3; -mod storage_sync; - -use std::{ - collections::HashMap, - ffi, fs, - path::{Path, PathBuf}, -}; - -use anyhow::{bail, Context}; -use tokio::io; -use tracing::{error, info}; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; - -pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; -use self::{local_fs::LocalFs, rust_s3::S3}; -use crate::{ - config::{PageServerConf, RemoteStorageKind}, - layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}, - repository::TimelineSyncState, -}; - -pub use storage_sync::compression; - -/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. -/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, -/// to simplify the received code. -pub struct SyncStartupData { - /// A sync state, derived from initial comparison of local timeline files and the remote archives, - /// before any sync tasks are executed. - /// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init: - /// in this case, no remote files exist and all local timelines with correct metadata files are considered ready. - pub initial_timeline_states: HashMap>, -} - -/// Based on the config, initiates the remote storage connection and starts a separate thread -/// that ensures that pageserver and the remote storage are in sync with each other. -/// If no external configuration connection given, no thread or storage initialization is done. -/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states. 
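A sketch of how this entry point is meant to be consumed at pageserver startup; the actual call site lives outside this module and may differ:

// Illustrative startup flow only.
let startup_data = remote_storage::start_local_timeline_sync(conf)?;
for (tenant_id, timelines) in &startup_data.initial_timeline_states {
    for (timeline_id, state) in timelines {
        // Ready(lsn) timelines are fully local and operational right away;
        // the others are downloaded on demand by the sync loop.
        let ready = matches!(state, TimelineSyncState::Ready(_));
        tracing::info!("tenant {} timeline {} ready: {}", tenant_id, timeline_id, ready);
    }
}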
-pub fn start_local_timeline_sync( - config: &'static PageServerConf, -) -> anyhow::Result { - let local_timeline_files = local_tenant_timeline_files(config) - .context("Failed to collect local tenant timeline files")?; - - match &config.remote_storage_config { - Some(storage_config) => match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{}' as a remote storage", root.display()); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - LocalFs::new(root.clone(), &config.workdir)?, - storage_config.max_concurrent_sync, - storage_config.max_sync_errors, - ) - }, - RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", - s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - storage_sync::spawn_storage_sync_thread( - config, - local_timeline_files, - S3::new(s3_config, &config.workdir)?, - storage_config.max_concurrent_sync, - storage_config.max_sync_errors, - ) - }, - } - .context("Failed to spawn the storage sync thread"), - None => { - info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); - let mut initial_timeline_states: HashMap< - ZTenantId, - HashMap, - > = HashMap::new(); - for (ZTenantTimelineId{tenant_id, timeline_id}, (timeline_metadata, _)) in - local_timeline_files - { - initial_timeline_states - .entry(tenant_id) - .or_default() - .insert( - timeline_id, - TimelineSyncState::Ready(timeline_metadata.disk_consistent_lsn()), - ); - } - Ok(SyncStartupData { - initial_timeline_states, - }) - } - } -} - -fn local_tenant_timeline_files( - config: &'static PageServerConf, -) -> anyhow::Result)>> { - let mut local_tenant_timeline_files = HashMap::new(); - let tenants_dir = config.tenants_path(); - for tenants_dir_entry in fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? - { - match &tenants_dir_entry { - Ok(tenants_dir_entry) => { - match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) { - Ok(collected_files) => { - local_tenant_timeline_files.extend(collected_files.into_iter()) - } - Err(e) => error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - tenants_dir_entry, - e - ), - } - } - Err(e) => error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - tenants_dir_entry, - tenants_dir.display(), - e - ), - } - } - - Ok(local_tenant_timeline_files) -} - -fn collect_timelines_for_tenant( - config: &'static PageServerConf, - tenant_path: &Path, -) -> anyhow::Result)>> { - let mut timelines: HashMap)> = - HashMap::new(); - let tenant_id = tenant_path - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - let timelines_dir = config.timelines_path(&tenant_id); - - for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines dir entry for tenant {}", - tenant_id - ) - })? 
{ - match timelines_dir_entry { - Ok(timelines_dir_entry) => { - let timeline_path = timelines_dir_entry.path(); - match collect_timeline_files(&timeline_path) { - Ok((timeline_id, metadata, timeline_files)) => { - timelines.insert( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - (metadata, timeline_files), - ); - } - Err(e) => error!( - "Failed to process timeline dir contents at '{}', reason: {:?}", - timeline_path.display(), - e - ), - } - } - Err(e) => error!( - "Failed to list timelines for entry tenant {}, reason: {:?}", - tenant_id, e - ), - } - } - - Ok(timelines) -} - -fn collect_timeline_files( - timeline_dir: &Path, -) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec)> { - let mut timeline_files = Vec::new(); - let mut timeline_metadata_path = None; - - let timeline_id = timeline_dir - .file_name() - .and_then(ffi::OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse timeline id out of the timeline dir name")?; - let timeline_dir_entries = - fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; - for entry in timeline_dir_entries { - let entry_path = entry.context("Failed to list timeline dir entry")?.path(); - if entry_path.is_file() { - if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) { - timeline_metadata_path = Some(entry_path); - } else { - timeline_files.push(entry_path); - } - } - } - - let timeline_metadata_path = match timeline_metadata_path { - Some(path) => path, - None => bail!("No metadata file found in the timeline directory"), - }; - let metadata = TimelineMetadata::from_bytes( - &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, - ) - .context("Failed to parse timeline metadata file bytes")?; - - Ok((timeline_id, metadata, timeline_files)) -} - -/// Storage (potentially remote) API to manage its state. -/// This storage tries to be unaware of any layered repository context, -/// providing basic CRUD operations for storage files. -#[async_trait::async_trait] -trait RemoteStorage: Send + Sync { - /// A way to uniquely reference a file in the remote storage. - type StoragePath; - - /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn storage_path(&self, local_path: &Path) -> anyhow::Result; - - /// Gets the download path of the given storage file. - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result; - - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; - - /// Streams the local file contents into remote into the remote storage entry. - async fn upload( - &self, - from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - to: &Self::StoragePath, - ) -> anyhow::Result<()>; - - /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. - async fn download( - &self, - from: &Self::StoragePath, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()>; - - /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. 
- async fn download_range( - &self, - from: &Self::StoragePath, - start_inclusive: u64, - end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()>; - - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>; -} - -fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { - if prefix == path { - anyhow::bail!( - "Prefix and the path are equal, cannot strip: '{}'", - prefix.display() - ) - } else { - path.strip_prefix(prefix).with_context(|| { - format!( - "Path '{}' is not prefixed with '{}'", - path.display(), - prefix.display(), - ) - }) - } -} diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md deleted file mode 100644 index 1c718acf06..0000000000 --- a/pageserver/src/remote_storage/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# Non-implementation details - -This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans. -Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules. -Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs. - -## Approach - -Backup functionality is a new component, appeared way after the core DB functionality was implemented. -Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time. - -To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop. -This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver. - -## What's done - -Current implementation -* provides remote storage wrappers for AWS S3 and local FS -* synchronizes the differences with local timelines and remote states as fast as possible -* uploads new relishes, frozen by pageserver checkpoint thread -* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc) -* uses compression when deals with files, for better S3 usage -* maintains an index of what's stored remotely -* evicts failing tasks and stops the corresponding timelines - -The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks. -After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states fot the same timeline), -the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change. - -No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time. -It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments. - -### Peculiarities - -As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start. 
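The `RemoteStorage` trait removed above keeps the sync code generic over the concrete backend (local FS or S3). A minimal sketch of a caller, assuming only the trait methods shown in this diff:

// Backend-agnostic helper sketch; not part of the original code.
async fn backup_one_file<S: RemoteStorage>(
    storage: &S,
    local_path: &std::path::Path,
) -> anyhow::Result<()> {
    // Map the pageserver-workdir path to the backend-specific location.
    let remote_path = storage.storage_path(local_path)?;
    // Stream the file contents into the remote storage entry.
    let file = tokio::fs::File::open(local_path).await?;
    storage.upload(file, &remote_path).await
}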
-Here's the list of known compromises with comments: - -* Remote storage file model is currently a custom archive format, that's not possible to deserialize without a particular Rust code of ours (including `serde`). -We also don't optimize the archivation and pack every timeline checkpoint separately, so the resulting blob's size that gets on S3 could be arbitrary. -But, it's a single blob, which is way better than storing ~780 small files separately. - -* Archive index restoration requires reading every blob's head. -This could be avoided by a background thread/future storing the serialized index in the remote storage. - -* no proper file comparison - -No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation) - -* sad rust-s3 api - -rust-s3 is not very pleasant to use: -1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance -2. at least one function it its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091) -3. it's a prerelease library with unclear maintenance status -4. noisy on debug level - -But it's already used in the project, so for now it's reused to avoid bloating the dependency tree. -Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking. - - -* gc is ignored - -So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage. -Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives. - -* bracnhes implementaion could be improved - -Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded, -on the timeline download, missing remote branch files are downlaoded. - -A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally. -Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated. diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs deleted file mode 100644 index 01f6028d17..0000000000 --- a/pageserver/src/remote_storage/local_fs.rs +++ /dev/null @@ -1,689 +0,0 @@ -//! Local filesystem acting as a remote storage. -//! Multiple pageservers can use the same "storage" of this kind by using different storage roots. -//! -//! This storage used in pageserver tests, but can also be used in cases when a certain persistent -//! volume is mounted to the local FS. - -use std::{ - future::Future, - path::{Path, PathBuf}, - pin::Pin, -}; - -use anyhow::{bail, ensure, Context}; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, -}; -use tracing::*; - -use super::{strip_path_prefix, RemoteStorage}; - -pub struct LocalFs { - pageserver_workdir: &'static Path, - root: PathBuf, -} - -impl LocalFs { - /// Attempts to create local FS storage, along with its root directory. 
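The path mapping used by `LocalFs` is: strip the pageserver workdir prefix, then re-root the remainder under the storage root. A sketch with made-up paths (the real logic is in `storage_path` below):

use std::path::{Path, PathBuf};

// e.g. workdir=/data/pageserver, root=/mnt/backup:
//   /data/pageserver/tenants/<t>/timelines/<tl>/<file>
//     -> /mnt/backup/tenants/<t>/timelines/<tl>/<file>
fn to_storage_path(workdir: &Path, root: &Path, local: &Path) -> anyhow::Result<PathBuf> {
    Ok(root.join(local.strip_prefix(workdir)?))
}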
- pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result { - if !root.exists() { - std::fs::create_dir_all(&root).with_context(|| { - format!( - "Failed to create all directories in the given root path '{}'", - root.display(), - ) - })?; - } - Ok(Self { - pageserver_workdir, - root, - }) - } - - fn resolve_in_storage(&self, path: &Path) -> anyhow::Result { - if path.is_relative() { - Ok(self.root.join(path)) - } else if path.starts_with(&self.root) { - Ok(path.to_path_buf()) - } else { - bail!( - "Path '{}' does not belong to the current storage", - path.display() - ) - } - } -} - -#[async_trait::async_trait] -impl RemoteStorage for LocalFs { - type StoragePath = PathBuf; - - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - Ok(self.root.join( - strip_path_prefix(self.pageserver_workdir, local_path) - .context("local path does not belong to this storage")?, - )) - } - - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.root, storage_path) - .context("local path does not belong to this storage")?; - Ok(self.pageserver_workdir.join(relative_path)) - } - - async fn list(&self) -> anyhow::Result> { - get_all_files(&self.root).await - } - - async fn upload( - &self, - mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - to: &Self::StoragePath, - ) -> anyhow::Result<()> { - let target_file_path = self.resolve_in_storage(to)?; - create_target_directory(&target_file_path).await?; - let mut destination = io::BufWriter::new( - fs::OpenOptions::new() - .write(true) - .create(true) - .open(&target_file_path) - .await - .with_context(|| { - format!( - "Failed to open target fs destination at '{}'", - target_file_path.display() - ) - })?, - ); - - io::copy(&mut from, &mut destination) - .await - .with_context(|| { - format!( - "Failed to upload file to the local storage at '{}'", - target_file_path.display() - ) - })?; - destination.flush().await.with_context(|| { - format!( - "Failed to upload file to the local storage at '{}'", - target_file_path.display() - ) - })?; - Ok(()) - } - - async fn download( - &self, - from: &Self::StoragePath, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { - let mut source = io::BufReader::new( - fs::OpenOptions::new() - .read(true) - .open(&file_path) - .await - .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) - })?, - ); - io::copy(&mut source, to).await.with_context(|| { - format!( - "Failed to download file '{}' from the local storage", - file_path.display() - ) - })?; - source.flush().await?; - Ok(()) - } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) - } - } - - async fn download_range( - &self, - from: &Self::StoragePath, - start_inclusive: u64, - end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { - if let Some(end_exclusive) = end_exclusive { - ensure!( - end_exclusive > start_inclusive, - "Invalid range, start ({}) is bigger then end ({:?})", - start_inclusive, - end_exclusive - ); - if start_inclusive == end_exclusive.saturating_sub(1) { - return Ok(()); - } - } - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { - let mut source = io::BufReader::new( - fs::OpenOptions::new() - .read(true) - 
.open(&file_path) - .await - .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) - })?, - ); - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file")?; - match end_exclusive { - Some(end_exclusive) => { - io::copy(&mut source.take(end_exclusive - start_inclusive), to).await - } - None => io::copy(&mut source, to).await, - } - .with_context(|| { - format!( - "Failed to download file '{}' range from the local storage", - file_path.display() - ) - })?; - Ok(()) - } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) - } - } - - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { - let file_path = self.resolve_in_storage(path)?; - if file_path.exists() && file_path.is_file() { - Ok(fs::remove_file(file_path).await?) - } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) - } - } -} - -fn get_all_files<'a, P>( - directory_path: P, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? { - let file_type = dir_entry.file_type().await?; - let entry_path = dir_entry.path(); - if file_type.is_symlink() { - debug!("{:?} us a symlink, skipping", entry_path) - } else if file_type.is_dir() { - paths.extend(get_all_files(entry_path).await?.into_iter()) - } else { - paths.push(dir_entry.path()); - } - } - Ok(paths) - } else { - bail!("Path '{}' is not a directory", directory_path.display()) - } - } else { - Ok(Vec::new()) - } - }) -} - -async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> { - let target_dir = match target_file_path.parent() { - Some(parent_dir) => parent_dir, - None => bail!( - "File path '{}' has no parent directory", - target_file_path.display() - ), - }; - if !target_dir.exists() { - fs::create_dir_all(target_dir).await?; - } - Ok(()) -} - -#[cfg(test)] -mod pure_tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), - }; - - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name"); - let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?); - - assert_eq!( - expected_path, - storage.storage_path(&local_path).expect("Matching path should map to storage path normally"), - "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { - match storage.storage_path(mismatching_path) { - Ok(wrong_path) => panic!( - "Expected path '{}' to error, but got storage path: {:?}", - mismatching_path.display(), - wrong_path, - ), - 
Err(e) => format!("{:?}", e), - } - } - - let repo_harness = RepoHarness::create("storage_path_negatives")?; - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, - }; - - let error_string = storage_path_error(&storage, &repo_harness.conf.workdir); - assert!(error_string.contains("does not belong to this storage")); - assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap())); - - let mismatching_path_str = "/something/else"; - let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); - assert!( - error_message.contains(mismatching_path_str), - "Error should mention wrong path" - ); - assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!(error_message.contains("does not belong to this storage")); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root.clone(), - }; - - let name = "not a metadata"; - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name); - assert_eq!( - local_path, - storage - .local_path( - &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?) - ) - .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let local_metadata_path = repo_harness - .timeline_path(&TIMELINE_ID) - .join(METADATA_FILE_NAME); - let remote_metadata_path = storage.storage_path(&local_metadata_path)?; - assert_eq!( - local_metadata_path, - storage - .local_path(&remote_metadata_path) - .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn local_path_negatives() -> anyhow::Result<()> { - #[track_caller] - #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String { - match storage.local_path(storage_path) { - Ok(wrong_path) => panic!( - "Expected local path input {:?} to cause an error, but got file path: {:?}", - storage_path, wrong_path, - ), - Err(e) => format!("{:?}", e), - } - } - - let repo_harness = RepoHarness::create("local_path_negatives")?; - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, - }; - - let totally_wrong_path = "wrong_wrong_wrong"; - let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path)); - assert!(error_message.contains(totally_wrong_path)); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); - - let storage_root = PathBuf::from("somewhere").join("else"); - let dummy_storage = LocalFs { - pageserver_workdir: &repo_harness.conf.workdir, - root: storage_root, - }; - - let storage_path = dummy_storage.storage_path(&original_path)?; - let download_destination = 
dummy_storage.local_path(&storage_path)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } -} - -#[cfg(test)] -mod fs_tests { - use super::*; - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; - - use std::io::Write; - use tempfile::tempdir; - - #[tokio::test] - async fn upload_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("upload_file")?; - let storage = create_storage()?; - - let source = create_file_for_upload( - &storage.pageserver_workdir.join("whatever"), - "whatever_contents", - ) - .await?; - let target_path = PathBuf::from("/").join("somewhere").join("else"); - match storage.upload(source, &target_path).await { - Ok(()) => panic!("Should not allow storing files with wrong target path"), - Err(e) => { - let message = format!("{:?}", e); - assert!(message.contains(&target_path.display().to_string())); - assert!(message.contains("does not belong to the current storage")); - } - } - assert!(storage.list().await?.is_empty()); - - let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?; - assert_eq!( - storage.list().await?, - vec![target_path_1.clone()], - "Should list a single file after first upload" - ); - - let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?; - assert_eq!( - list_files_sorted(&storage).await?, - vec![target_path_1.clone(), target_path_2.clone()], - "Should list a two different files after second upload" - ); - - Ok(()) - } - - fn create_storage() -> anyhow::Result { - let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned())); - let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?; - Ok(storage) - } - - #[tokio::test] - async fn download_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file")?; - let storage = create_storage()?; - let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; - - let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage.download(&upload_target, &mut content_bytes).await?; - content_bytes.flush().await?; - - let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; - assert_eq!( - dummy_contents(upload_name), - contents, - "We should upload and download the same contents" - ); - - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage.download(&non_existing_path, &mut io::sink()).await { - Ok(_) => panic!("Should not allow downloading non-existing storage files"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } - } - Ok(()) - } - - #[tokio::test] - async fn download_file_range_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_positive")?; - let storage = create_storage()?; - let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; - - let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage - .download_range(&upload_target, 0, None, &mut full_range_bytes) - .await?; - full_range_bytes.flush().await?; - assert_eq!( - dummy_contents(upload_name), - String::from_utf8(full_range_bytes.into_inner().into_inner())?, 
- "Download full range should return the whole upload" - ); - - let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let same_byte = 1_000_000_000; - storage - .download_range( - &upload_target, - same_byte, - Some(same_byte + 1), // exclusive end - &mut zero_range_bytes, - ) - .await?; - zero_range_bytes.flush().await?; - assert!( - zero_range_bytes.into_inner().into_inner().is_empty(), - "Zero byte range should not download any part of the file" - ); - - let uploaded_bytes = dummy_contents(upload_name).into_bytes(); - let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); - - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage - .download_range( - &upload_target, - 0, - Some(first_part_local.len() as u64), - &mut first_part_remote, - ) - .await?; - first_part_remote.flush().await?; - let first_part_remote = first_part_remote.into_inner().into_inner(); - assert_eq!( - first_part_local, - first_part_remote.as_slice(), - "First part bytes should be returned when requested" - ); - - let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - storage - .download_range( - &upload_target, - first_part_local.len() as u64, - Some((first_part_local.len() + second_part_local.len()) as u64), - &mut second_part_remote, - ) - .await?; - second_part_remote.flush().await?; - let second_part_remote = second_part_remote.into_inner().into_inner(); - assert_eq!( - second_part_local, - second_part_remote.as_slice(), - "Second part bytes should be returned when requested" - ); - - Ok(()) - } - - #[tokio::test] - async fn download_file_range_negative() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_file_range_negative")?; - let storage = create_storage()?; - let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; - - let start = 10000; - let end = 234; - assert!(start > end, "Should test an incorrect range"); - match storage - .download_range(&upload_target, start, Some(end), &mut io::sink()) - .await - { - Ok(_) => panic!("Should not allow downloading wrong ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("Invalid range")); - assert!(error_string.contains(&start.to_string())); - assert!(error_string.contains(&end.to_string())); - } - } - - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage - .download_range(&non_existing_path, 1, Some(3), &mut io::sink()) - .await - { - Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } - } - Ok(()) - } - - #[tokio::test] - async fn delete_file() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("delete_file")?; - let storage = create_storage()?; - let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?; - - storage.delete(&upload_target).await?; - assert!(storage.list().await?.is_empty()); - - match storage.delete(&upload_target).await { - Ok(()) => panic!("Should not allow deleting non-existing storage files"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&upload_target.display().to_string())); - } - } - Ok(()) - } - - async fn 
upload_dummy_file( - harness: &RepoHarness, - storage: &LocalFs, - name: &str, - ) -> anyhow::Result { - let timeline_path = harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?; - let storage_path = storage.root.join(relative_timeline_path).join(name); - storage - .upload( - create_file_for_upload( - &storage.pageserver_workdir.join(name), - &dummy_contents(name), - ) - .await?, - &storage_path, - ) - .await?; - Ok(storage_path) - } - - async fn create_file_for_upload( - path: &Path, - contents: &str, - ) -> anyhow::Result> { - std::fs::create_dir_all(path.parent().unwrap())?; - let mut file_for_writing = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(path)?; - write!(file_for_writing, "{}", contents)?; - drop(file_for_writing); - Ok(io::BufReader::new( - fs::OpenOptions::new().read(true).open(&path).await?, - )) - } - - fn dummy_contents(name: &str) -> String { - format!("contents for {}", name) - } - - async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { - let mut files = storage.list().await?; - files.sort(); - Ok(files) - } -} diff --git a/pageserver/src/remote_storage/rust_s3.rs b/pageserver/src/remote_storage/rust_s3.rs deleted file mode 100644 index 527bdf48ff..0000000000 --- a/pageserver/src/remote_storage/rust_s3.rs +++ /dev/null @@ -1,438 +0,0 @@ -//! AWS S3 storage wrapper around `rust_s3` library. -//! -//! Respects `prefix_in_bucket` property from [`S3Config`], -//! allowing multiple pageservers to independently work with the same S3 bucket, if -//! their bucket prefixes are both specified and different. - -use std::path::{Path, PathBuf}; - -use anyhow::Context; -use s3::{bucket::Bucket, creds::Credentials, region::Region}; -use tokio::io::{self, AsyncWriteExt}; -use tracing::debug; - -use crate::{ - config::S3Config, - remote_storage::{strip_path_prefix, RemoteStorage}, -}; - -const S3_FILE_SEPARATOR: char = '/'; - -#[derive(Debug, Eq, PartialEq)] -pub struct S3ObjectKey(String); - -impl S3ObjectKey { - fn key(&self) -> &str { - &self.0 - } - - fn download_destination( - &self, - pageserver_workdir: &Path, - prefix_to_strip: Option<&str>, - ) -> PathBuf { - let path_without_prefix = match prefix_to_strip { - Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| { - panic!( - "Could not strip prefix '{}' from S3 object key '{}'", - prefix, self.0 - ) - }), - None => &self.0, - }; - - pageserver_workdir.join( - path_without_prefix - .split(S3_FILE_SEPARATOR) - .collect::(), - ) - } -} - -/// AWS S3 storage. -pub struct S3 { - pageserver_workdir: &'static Path, - bucket: Bucket, - prefix_in_bucket: Option, -} - -impl S3 { - /// Creates the storage, errors if incorrect AWS S3 configuration provided. 
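The key layout follows directly from `prefix_in_bucket`: the optional prefix, then the workdir-relative path joined with '/'. A sketch mirroring `storage_path` below (the prefix and segment names are made up):

// Illustration only.
fn s3_key(prefix_in_bucket: Option<&str>, relative_segments: &[&str]) -> String {
    let mut key = prefix_in_bucket.unwrap_or_default().to_string();
    for segment in relative_segments {
        key.push('/');
        key.push_str(segment);
    }
    key
}

// s3_key(Some("pageserver-1"), &["tenants", "t", "timelines", "tl", "layer"])
//   == "pageserver-1/tenants/t/timelines/tl/layer"
// A second pageserver with prefix "pageserver-2" writes under a disjoint key space,
// which is how several pageservers can share one bucket.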
- pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result { - debug!( - "Creating s3 remote storage around bucket {}", - aws_config.bucket_name - ); - let region = match aws_config.endpoint.clone() { - Some(endpoint) => Region::Custom { - endpoint, - region: aws_config.bucket_region.clone(), - }, - None => aws_config - .bucket_region - .parse::() - .context("Failed to parse the s3 region from config")?, - }; - - let credentials = Credentials::new( - aws_config.access_key_id.as_deref(), - aws_config.secret_access_key.as_deref(), - None, - None, - None, - ) - .context("Failed to create the s3 credentials")?; - - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(S3_FILE_SEPARATOR) { - prefix = &prefix[1..] - } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(S3_FILE_SEPARATOR) { - prefix.pop(); - } - prefix - }); - - Ok(Self { - bucket: Bucket::new_with_path_style( - aws_config.bucket_name.as_str(), - region, - credentials, - ) - .context("Failed to create the s3 bucket")?, - pageserver_workdir, - prefix_in_bucket, - }) - } -} - -#[async_trait::async_trait] -impl RemoteStorage for S3 { - type StoragePath = S3ObjectKey; - - fn storage_path(&self, local_path: &Path) -> anyhow::Result { - let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?; - let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); - for segment in relative_path { - key.push(S3_FILE_SEPARATOR); - key.push_str(&segment.to_string_lossy()); - } - Ok(S3ObjectKey(key)) - } - - fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result { - Ok(storage_path - .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref())) - } - - async fn list(&self) -> anyhow::Result> { - let list_response = self - .bucket - .list(self.prefix_in_bucket.clone().unwrap_or_default(), None) - .await - .context("Failed to list s3 objects")?; - - Ok(list_response - .into_iter() - .flat_map(|response| response.contents) - .map(|s3_object| S3ObjectKey(s3_object.key)) - .collect()) - } - - async fn upload( - &self, - mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static, - to: &Self::StoragePath, - ) -> anyhow::Result<()> { - let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - io::copy(&mut from, &mut upload_contents) - .await - .context("Failed to read the upload contents")?; - upload_contents - .flush() - .await - .context("Failed to read the upload contents")?; - let upload_contents = upload_contents.into_inner().into_inner(); - - let (_, code) = self - .bucket - .put_object(to.key(), &upload_contents) - .await - .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?; - if code != 200 { - Err(anyhow::format_err!( - "Received non-200 exit code during creating object with key '{}', code: {}", - to.key(), - code - )) - } else { - Ok(()) - } - } - - async fn download( - &self, - from: &Self::StoragePath, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { - let (data, code) = self - .bucket - .get_object(from.key()) - .await - .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?; - if code != 200 { - Err(anyhow::format_err!( - "Received non-200 exit code during downloading object, code: {}", - code - )) - } else { - // we don't have to write vector into the destination this way, `to_write_all` would be enough. 
- // but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with - // which it makes more sense to use `io::copy`. - io::copy(&mut data.as_slice(), to) - .await - .context("Failed to write downloaded data into the destination buffer")?; - Ok(()) - } - } - - async fn download_range( - &self, - from: &Self::StoragePath, - start_inclusive: u64, - end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result<()> { - // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - // and needs both ends to be exclusive - let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); - let (data, code) = self - .bucket - .get_object_range(from.key(), start_inclusive, end_inclusive) - .await - .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?; - if code != 206 { - Err(anyhow::format_err!( - "Received non-206 exit code during downloading object range, code: {}", - code - )) - } else { - // see `download` function above for the comment on why `Vec` buffer is copied this way - io::copy(&mut data.as_slice(), to) - .await - .context("Failed to write downloaded range into the destination buffer")?; - Ok(()) - } - } - - async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> { - let (_, code) = self - .bucket - .delete_object(path.key()) - .await - .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?; - if code != 204 { - Err(anyhow::format_err!( - "Received non-204 exit code during deleting object with key '{}', code: {}", - path.key(), - code - )) - } else { - Ok(()) - } - } -} - -#[cfg(test)] -mod tests { - use crate::{ - layered_repository::metadata::METADATA_FILE_NAME, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[test] - fn download_destination() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination")?; - - let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name"); - let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?; - - let key = S3ObjectKey(format!( - "{}{}", - S3_FILE_SEPARATOR, - relative_path - .iter() - .map(|segment| segment.to_str().unwrap()) - .collect::>() - .join(&S3_FILE_SEPARATOR.to_string()), - )); - - assert_eq!( - local_path, - key.download_destination(&repo_harness.conf.workdir, None), - "Download destination should consist of s3 path joined with the pageserver workdir prefix" - ); - - Ok(()) - } - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("storage_path_positive")?; - - let segment_1 = "matching"; - let segment_2 = "file"; - let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2); - - let storage = dummy_storage(&repo_harness.conf.workdir); - - let expected_key = S3ObjectKey(format!( - "{}{SEPARATOR}{}{SEPARATOR}{}", - storage.prefix_in_bucket.as_deref().unwrap_or_default(), - segment_1, - segment_2, - SEPARATOR = S3_FILE_SEPARATOR, - )); - - let actual_key = storage - .storage_path(local_path) - .expect("Matching path should map to S3 path normally"); - assert_eq!( - expected_key, - actual_key, - "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String { - match 
storage.storage_path(mismatching_path) { - Ok(wrong_key) => panic!( - "Expected path '{}' to error, but got S3 key: {:?}", - mismatching_path.display(), - wrong_key, - ), - Err(e) => e.to_string(), - } - } - - let repo_harness = RepoHarness::create("storage_path_negatives")?; - let storage = dummy_storage(&repo_harness.conf.workdir); - - let error_message = storage_path_error(&storage, &repo_harness.conf.workdir); - assert!( - error_message.contains("Prefix and the path are equal"), - "Message '{}' does not contain the required string", - error_message - ); - - let mismatching_path = PathBuf::from("somewhere").join("else"); - let error_message = storage_path_error(&storage, &mismatching_path); - assert!( - error_message.contains(mismatching_path.to_str().unwrap()), - "Error should mention wrong path" - ); - assert!( - error_message.contains(repo_harness.conf.workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!( - error_message.contains("is not prefixed with"), - "Message '{}' does not contain a required string", - error_message - ); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("local_path_positive")?; - let storage = dummy_storage(&repo_harness.conf.workdir); - let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); - let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?; - - let s3_key = create_s3_key( - &relative_timeline_path.join("not a metadata"), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let s3_key = create_s3_key( - &relative_timeline_path.join(METADATA_FILE_NAME), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - s3_key.download_destination( - &repo_harness.conf.workdir, - storage.prefix_in_bucket.as_deref() - ), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("download_destination_matches_original_path")?; - let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name"); - - let dummy_storage = dummy_storage(&repo_harness.conf.workdir); - - let key = dummy_storage.storage_path(&original_path)?; - let download_destination = dummy_storage.local_path(&key)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } - - fn dummy_storage(pageserver_workdir: &'static Path) -> S3 { - S3 { - pageserver_workdir, - bucket: Bucket::new( - "dummy-bucket", - "us-east-1".parse().unwrap(), - Credentials::anonymous().unwrap(), - ) - .unwrap(), - prefix_in_bucket: Some("dummy_prefix/".to_string()), - } - } - - fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey { - S3ObjectKey(relative_file_path.iter().fold( - prefix.unwrap_or_default().to_string(), - |mut path_string, segment| { - path_string.push(S3_FILE_SEPARATOR); - path_string.push_str(segment.to_str().unwrap()); - 
path_string - }, - )) - } -} diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs deleted file mode 100644 index 6b588c8e5f..0000000000 --- a/pageserver/src/remote_storage/storage_sync.rs +++ /dev/null @@ -1,1140 +0,0 @@ -//! A synchronization logic for the [`RemoteStorage`] and pageserver in-memory state to ensure correct synchronizations -//! between local tenant files and their counterparts from the remote storage. -//! -//! The synchronization does not aim to be immediate, yet eventually consistent. -//! Synchronization is done with the queue being emptied via separate thread asynchronously, -//! attempting to fully store pageserver's local data on the remote storage in a custom format, beneficial for storing. -//! -//! A queue is implemented in the [`sync_queue`] module as a pair of sender and receiver channels, to block on zero tasks instead of checking the queue. -//! The pair's shared buffer of a fixed size serves as an implicit queue, holding [`SyncTask`] for local files upload/download operations. -//! -//! The queue gets emptied by a single thread with the loop, that polls the tasks in batches of deduplicated tasks (size configurable). -//! Every task in a batch processed concurrently, which is possible due to incremental nature of the timelines: -//! it's not asserted, but assumed that timeline's checkpoints only add the files locally, not removing or amending the existing ones. -//! Only GC removes local timeline files, the GC support is not added to sync currently, -//! yet downloading extra files is not critically bad at this stage, GC can remove those again. -//! -//! Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed. -//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done. -//! Also, the branches are copied as separate files, with no extra compressions done. -//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know -//! the branch sync state outside of the sync loop. -//! This implementation is currently considered as temporary and is a subjec to change later. -//! -//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents. -//! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has -//! an exclusive write access to the remote storage: new files appear in the storage only after the same -//! pageserver writes them. -//! It's important to do so, since storages like S3 can get slower and more expensive as the number of files grows. -//! The index state is used to issue initial sync tasks, if needed: -//! * all timelines with local state behind the remote gets download tasks scheduled. -//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable. -//! * all never local state gets scheduled for upload, such timelines are "local" and fully operational -//! * the rest of the remote timelines are reported to pageserver, but not downloaded before they are actually accessed in pageserver, -//! it may schedule the download on such occasions. -//! -//! 
The synchronization unit is an archive: a set of timeline files (or relishes) and a special metadata file, all compressed into a blob.
-//! Currently, there's no way to process an archive partially; if the archive processing fails, it has to be restarted from scratch next time.
-//! An archive contains a set of files of a certain timeline, added during checkpoint(s), and the timeline metadata at that moment.
-//! The archive carries that metadata's `disk_consistent_lsn` in its name, so that partial index information can be restored from a remote storage file list alone.
-//! The index is created at startup (possible due to exclusive ownership over the remote storage by the pageserver) and keeps track of which files were stored
-//! in what remote archives.
-//! Among other tasks, the index is used to prevent invalid uploads and downloads of non-existent data on demand.
-//! Refer to [`compression`] and [`index`] for more details on the archives and index respectively.
-//!
-//! The list construction is currently the only place where the storage sync can return an [`Err`] to the user.
-//! New sync tasks are accepted via the [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] functions,
-//! regardless of whether the corresponding loop has started.
-//! It's up to the caller to avoid scheduling synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored.
-//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather
-//! reschedule the same task, with possibly fewer files to sync:
-//! * download tasks currently never replace existing local files, with the metadata file as the only exception
-//! (this is subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
-//! * download tasks carry the information about skipped archives, so resubmissions do not download successfully processed archives again
-//!
-//! Not every upload of the same timeline gets processed: if a checkpoint with the same `disk_consistent_lsn` was already uploaded, no reupload happens, as checkpoints
-//! are considered to be immutable. The order of `lsn` during upload submissions is allowed to be arbitrary and not required to be ascending.
-//! Refer to [`upload`] and [`download`] for more details.
-//!
-//! Current uploads are per-checkpoint and don't accumulate data into archives of a size optimal for storing on S3.
-//! The downloaded archives get processed sequentially, from smaller `disk_consistent_lsn` to larger, with the metadata file being written last.
-//! The archive unpacking is designed to extract the metadata as the last file, so the risk of leaving a corrupt timeline behind due to a decompression error is small (while not eliminated entirely, which should be improved).
-//! There's a reschedule threshold that evicts tasks that fail too many times and stops the corresponding timeline so it does not diverge from the state on the remote storage.
-//! Among other pageserver-specific changes to such evicted timelines, no uploads are expected to come from them, to ensure the remote storage state does not get corrupted.
-//!
-//! Synchronization never removes any local files from the pageserver workdir or remote files from the remote storage, yet the same files may be overwritten (metadata file updates; future checksum mismatch fixes).
-//! NOTE: No real content or checksum check happens right now; this is subject to improvement later.
-//!
-//!
After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed. -//! No extra branch registration is done. -//! -//! When pageserver signals shutdown, current sync task gets finished and the loop exists. - -/// Expose the module for a binary CLI tool that deals with the corresponding blobs. -pub mod compression; -mod download; -pub mod index; -mod upload; - -use std::{ - collections::{BTreeSet, HashMap, HashSet, VecDeque}, - num::{NonZeroU32, NonZeroUsize}, - path::{Path, PathBuf}, - sync::Arc, -}; - -use anyhow::{bail, Context}; -use futures::stream::{FuturesUnordered, StreamExt}; -use lazy_static::lazy_static; -use tokio::{ - fs, - runtime::Runtime, - sync::{ - mpsc::{self, UnboundedReceiver}, - RwLock, - }, - time::{Duration, Instant}, -}; -use tracing::*; - -use self::{ - compression::ArchiveHeader, - download::{download_timeline, DownloadedTimeline}, - index::{ - ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex, - TimelineIndexEntry, - }, - upload::upload_timeline_checkpoint, -}; -use super::{RemoteStorage, SyncStartupData, ZTenantTimelineId}; -use crate::{ - config::PageServerConf, layered_repository::metadata::TimelineMetadata, - remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState, - tenant_mgr::set_timeline_states, thread_mgr, thread_mgr::ThreadKind, -}; - -use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -lazy_static! { - static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!( - "pageserver_remote_storage_remaining_sync_items", - "Number of storage sync items left in the queue" - ) - .expect("failed to register pageserver remote storage remaining sync items int gauge"); - static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( - "pageserver_remote_storage_image_sync_time", - "Time took to synchronize (download or upload) a whole pageserver image. \ - Grouped by `operation_kind` (upload|download) and `status` (success|failure)", - &["operation_kind", "status"], - vec![ - 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0 - ] - ) - .expect("failed to register pageserver image sync time histogram vec"); -} - -/// Wraps mpsc channel bits around into a queue interface. -/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning. -mod sync_queue { - use std::{ - collections::{BTreeSet, HashMap}, - sync::atomic::{AtomicUsize, Ordering}, - }; - - use anyhow::anyhow; - use once_cell::sync::OnceCell; - use tokio::sync::mpsc::{error::TryRecvError, UnboundedReceiver, UnboundedSender}; - use tracing::{debug, warn}; - - use super::SyncTask; - - static SENDER: OnceCell> = OnceCell::new(); - static LENGTH: AtomicUsize = AtomicUsize::new(0); - - /// Initializes the queue with the given sender channel that is used to put the tasks into later. - /// Errors if called more than once. - pub fn init(sender: UnboundedSender) -> anyhow::Result<()> { - SENDER - .set(sender) - .map_err(|_sender| anyhow!("sync queue was already initialized"))?; - Ok(()) - } - - /// Adds a new task to the queue, if the queue was initialized, returning `true` on success. - /// On any error, or if the queue was not initialized, the task gets dropped (not scheduled) and `false` is returned. 
- pub fn push(new_task: SyncTask) -> bool { - if let Some(sender) = SENDER.get() { - match sender.send(new_task) { - Err(e) => { - warn!( - "Failed to enqueue a sync task: the receiver is dropped: {}", - e - ); - false - } - Ok(()) => { - LENGTH.fetch_add(1, Ordering::Relaxed); - true - } - } - } else { - warn!("Failed to enqueue a sync task: the sender is not initialized"); - false - } - } - - /// Polls a new task from the queue, using its receiver counterpart. - /// Does not block if the queue is empty, returning [`None`] instead. - /// Needed to correctly track the queue length. - pub async fn next_task(receiver: &mut UnboundedReceiver) -> Option { - let task = receiver.recv().await; - if task.is_some() { - LENGTH.fetch_sub(1, Ordering::Relaxed); - } - task - } - - /// Fetches a task batch, not bigger than the given limit. - /// Not blocking, can return fewer tasks if the queue does not contain enough. - /// Duplicate entries are eliminated and not considered in batch size calculations. - pub async fn next_task_batch( - receiver: &mut UnboundedReceiver, - mut max_batch_size: usize, - ) -> BTreeSet { - if max_batch_size == 0 { - return BTreeSet::new(); - } - let mut tasks = HashMap::with_capacity(max_batch_size); - - loop { - match receiver.try_recv() { - Ok(new_task) => { - LENGTH.fetch_sub(1, Ordering::Relaxed); - if tasks.insert(new_task.sync_id, new_task).is_none() { - max_batch_size -= 1; - if max_batch_size == 0 { - break; - } - } - } - Err(TryRecvError::Disconnected) => { - debug!("Sender disconnected, batch collection aborted"); - break; - } - Err(TryRecvError::Empty) => { - debug!("No more data in the sync queue, task batch is not full"); - break; - } - } - } - - tasks.into_values().collect() - } - - /// Length of the queue, assuming that all receiver counterparts were only called using the queue api. - pub fn len() -> usize { - LENGTH.load(Ordering::Relaxed) - } -} - -/// A task to run in the async download/upload loop. -/// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -pub struct SyncTask { - sync_id: ZTenantTimelineId, - retries: u32, - kind: SyncKind, -} - -impl SyncTask { - fn new(sync_id: ZTenantTimelineId, retries: u32, kind: SyncKind) -> Self { - Self { - sync_id, - retries, - kind, - } - } -} - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -enum SyncKind { - /// A certain amount of images (archive files) to download. - Download(TimelineDownload), - /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. - /// Not necessary more fresh than the one already uploaded. - Upload(NewCheckpoint), -} - -impl SyncKind { - fn sync_name(&self) -> &'static str { - match self { - Self::Download(_) => "download", - Self::Upload(_) => "upload", - } - } -} - -/// Local timeline files for upload, appeared after the new checkpoint. -/// Current checkpoint design assumes new files are added only, no deletions or amendment happens. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -pub struct NewCheckpoint { - /// Relish file paths in the pageserver workdir, that were added for the corresponding checkpoint. - layers: Vec, - metadata: TimelineMetadata, -} - -/// Info about the remote image files. 
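The batching in `next_task_batch` above admits at most one task per `sync_id`: tasks are collected into a `HashMap` keyed by the id, and only first-time insertions count toward the batch size. A self-contained sketch of that deduplication pattern over a tokio unbounded channel (generic key/value types and names are illustrative):

```rust
use std::collections::HashMap;
use std::hash::Hash;
use tokio::sync::mpsc::{error::TryRecvError, unbounded_channel, UnboundedReceiver};

// Drain up to `max_batch_size` *distinct* keys from the channel without blocking.
// A later value for a key already in the batch replaces the earlier one, mirroring
// the deduplication by `sync_id` in `next_task_batch`.
fn drain_batch<K: Hash + Eq, V>(
    receiver: &mut UnboundedReceiver<(K, V)>,
    mut max_batch_size: usize,
) -> Vec<V> {
    let mut batch = HashMap::with_capacity(max_batch_size);
    while max_batch_size > 0 {
        match receiver.try_recv() {
            Ok((key, value)) => {
                if batch.insert(key, value).is_none() {
                    max_batch_size -= 1; // only new keys count toward the limit
                }
            }
            Err(TryRecvError::Empty) | Err(TryRecvError::Disconnected) => break,
        }
    }
    batch.into_values().collect()
}

#[tokio::main]
async fn main() {
    let (sender, mut receiver) = unbounded_channel();
    for (timeline, task) in [(1, "upload"), (2, "download"), (1, "download")] {
        sender.send((timeline, task)).unwrap();
    }
    // Three tasks were queued, but only two distinct timeline ids remain in the batch.
    assert_eq!(drain_batch(&mut receiver, 10).len(), 2);
}
```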
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] -struct TimelineDownload { - files_to_skip: Arc>, - archives_to_skip: BTreeSet, -} - -/// Adds the new checkpoint files as an upload sync task to the queue. -/// On task failure, it gets retried again from the start a number of times. -/// -/// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_checkpoint_upload( - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - layers: Vec, - metadata: TimelineMetadata, -) { - if layers.is_empty() { - debug!("Skipping empty layers upload task"); - return; - } - - if !sync_queue::push(SyncTask::new( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - 0, - SyncKind::Upload(NewCheckpoint { layers, metadata }), - )) { - warn!( - "Could not send an upload task for tenant {}, timeline {}", - tenant_id, timeline_id - ) - } else { - warn!( - "Could not send an upload task for tenant {}, timeline {}: the sync queue is not initialized", - tenant_id, timeline_id - ) - } -} - -/// Requests the download of the entire timeline for a given tenant. -/// No existing local files are currently owerwritten, except the metadata file. -/// The timeline downloads checkpoint archives, from the earliest `disc_consistent_lsn` to the latest, -/// replacing the metadata file as the lasat file in every archive uncompression result. -/// -/// On any failure, the task gets retried, omitting already downloaded archives and files -/// (yet requiring to download the entire archive even if it got partially extracted before the failure). -/// -/// Ensure that the loop is started otherwise the task is never processed. -pub fn schedule_timeline_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) { - debug!( - "Scheduling timeline download for tenant {}, timeline {}", - tenant_id, timeline_id - ); - sync_queue::push(SyncTask::new( - ZTenantTimelineId { - tenant_id, - timeline_id, - }, - 0, - SyncKind::Download(TimelineDownload { - files_to_skip: Arc::new(BTreeSet::new()), - archives_to_skip: BTreeSet::new(), - }), - )); -} - -/// Uses a remote storage given to start the storage sync loop. -/// See module docs for loop step description. -pub(super) fn spawn_storage_sync_thread< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - local_timeline_files: HashMap)>, - storage: S, - max_concurrent_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) -> anyhow::Result { - let (sender, receiver) = mpsc::unbounded_channel(); - sync_queue::init(sender)?; - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .context("Failed to create storage sync runtime")?; - - let download_paths = runtime - // TODO could take long time, consider [de]serializing [`RemoteTimelineIndex`] instead - .block_on(storage.list()) - .context("Failed to list remote storage files")? 
- .into_iter() - .filter_map(|remote_path| match storage.local_path(&remote_path) { - Ok(local_path) => Some(local_path), - Err(e) => { - error!( - "Failed to find local path for remote path {:?}: {:?}", - remote_path, e - ); - None - } - }); - let remote_index = RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths); - - let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files); - - thread_mgr::spawn( - ThreadKind::StorageSync, - None, - None, - "Remote storage sync thread", - move || { - storage_sync_loop( - runtime, - conf, - receiver, - remote_index, - storage, - max_concurrent_sync, - max_sync_errors, - ) - }, - ) - .context("Failed to spawn remote storage sync thread")?; - Ok(SyncStartupData { - initial_timeline_states, - }) -} - -enum LoopStep { - NewStates(HashMap>), - Shutdown, -} - -#[allow(clippy::too_many_arguments)] -fn storage_sync_loop< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - runtime: Runtime, - conf: &'static PageServerConf, - mut receiver: UnboundedReceiver, - index: RemoteTimelineIndex, - storage: S, - max_concurrent_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) -> anyhow::Result<()> { - let remote_assets = Arc::new((storage, RwLock::new(index))); - loop { - let loop_step = runtime.block_on(async { - tokio::select! { - new_timeline_states = loop_step( - conf, - &mut receiver, - Arc::clone(&remote_assets), - max_concurrent_sync, - max_sync_errors, - ) - .instrument(debug_span!("storage_sync_loop_step")) => LoopStep::NewStates(new_timeline_states), - _ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown, - } - }); - - match loop_step { - LoopStep::NewStates(new_timeline_states) => { - // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. 
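`storage_sync_loop` above drives one async step at a time on a current-thread runtime and races each step against the shutdown watcher with `tokio::select!`. A stripped-down, self-contained version of that pattern follows; the `watch` channel stands in for `thread_mgr::shutdown_watcher()` and all names are illustrative:

```rust
use std::time::Duration;
use tokio::sync::watch;

enum LoopStep {
    DidWork,
    Shutdown,
}

// Stand-in for one storage sync loop step.
async fn do_step() -> LoopStep {
    tokio::time::sleep(Duration::from_millis(10)).await;
    LoopStep::DidWork
}

fn main() {
    let runtime = tokio::runtime::Builder::new_current_thread()
        .enable_all()
        .build()
        .expect("failed to build runtime");
    // `shutdown_rx` plays the role of the shutdown watcher.
    let (shutdown_tx, mut shutdown_rx) = watch::channel(false);
    let mut steps_done = 0;

    loop {
        let step = runtime.block_on(async {
            tokio::select! {
                step = do_step() => step,
                _ = shutdown_rx.changed() => LoopStep::Shutdown,
            }
        });
        match step {
            LoopStep::DidWork => {
                steps_done += 1;
                if steps_done == 3 {
                    // Request shutdown; the next `select!` observes it and exits the loop.
                    shutdown_tx.send(true).expect("watcher receiver is alive");
                }
            }
            LoopStep::Shutdown => break,
        }
    }
    assert_eq!(steps_done, 3);
}
```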
- set_timeline_states(conf, new_timeline_states); - debug!("Sync loop step completed"); - } - LoopStep::Shutdown => { - debug!("Shutdown requested, stopping"); - break; - } - } - } - - Ok(()) -} - -async fn loop_step< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - receiver: &mut UnboundedReceiver, - remote_assets: Arc<(S, RwLock)>, - max_concurrent_sync: NonZeroUsize, - max_sync_errors: NonZeroU32, -) -> HashMap> { - let max_concurrent_sync = max_concurrent_sync.get(); - let mut next_tasks = BTreeSet::new(); - - // request the first task in blocking fashion to do less meaningless work - if let Some(first_task) = sync_queue::next_task(receiver).await { - next_tasks.insert(first_task); - } else { - debug!("Shutdown requested, stopping"); - return HashMap::new(); - }; - next_tasks.extend( - sync_queue::next_task_batch(receiver, max_concurrent_sync - 1) - .await - .into_iter(), - ); - - let remaining_queue_length = sync_queue::len(); - debug!( - "Processing {} tasks in batch, more tasks left to process: {}", - next_tasks.len(), - remaining_queue_length - ); - REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); - - let mut task_batch = next_tasks - .into_iter() - .map(|task| async { - let sync_id = task.sync_id; - let attempt = task.retries; - let sync_name = task.kind.sync_name(); - - let extra_step = match tokio::spawn( - process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors) - .instrument(debug_span!("", sync_id = %sync_id, attempt, sync_name)), - ) - .await - { - Ok(extra_step) => extra_step, - Err(e) => { - error!( - "Failed to process storage sync task for tenant {}, timeline {}: {:?}", - sync_id.tenant_id, sync_id.timeline_id, e - ); - None - } - }; - (sync_id, extra_step) - }) - .collect::>(); - - let mut new_timeline_states: HashMap> = - HashMap::with_capacity(max_concurrent_sync); - while let Some((sync_id, state_update)) = task_batch.next().await { - debug!("Finished storage sync task for sync id {}", sync_id); - if let Some(state_update) = state_update { - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - new_timeline_states - .entry(tenant_id) - .or_default() - .insert(timeline_id, state_update); - } - } - - new_timeline_states -} - -async fn process_task< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, - task: SyncTask, - max_sync_errors: NonZeroU32, -) -> Option { - if task.retries > max_sync_errors.get() { - error!( - "Evicting task {:?} that failed {} times, exceeding the error threshold", - task.kind, task.retries - ); - return Some(TimelineSyncState::Evicted( - remote_assets - .as_ref() - .1 - .read() - .await - .timeline_entry(&task.sync_id) - .and_then(TimelineIndexEntry::disk_consistent_lsn), - )); - } - - if task.retries > 0 { - let seconds_to_wait = 2.0_f64.powf(task.retries as f64 - 1.0).min(30.0); - debug!( - "Waiting {} seconds before starting the task", - seconds_to_wait - ); - tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await; - } - - let sync_start = Instant::now(); - let sync_name = task.kind.sync_name(); - match task.kind { - SyncKind::Download(download_data) => { - let download_result = download_timeline( - conf, - remote_assets, - task.sync_id, - download_data, - task.retries + 1, - ) - .await; - - match download_result { - DownloadedTimeline::Abort => { - register_sync_status(sync_start, sync_name, None); - None 
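`process_task` above delays a retried task by an exponential backoff of `2^(retries - 1)` seconds, capped at 30, before running it again (first attempts are not delayed). The schedule that formula produces, as a quick check:

```rust
// Mirrors the delay computed in `process_task` for tasks with `retries > 0`.
fn backoff_seconds(retries: u32) -> f64 {
    2.0_f64.powf(retries as f64 - 1.0).min(30.0)
}

fn main() {
    assert_eq!(backoff_seconds(1), 1.0);
    assert_eq!(backoff_seconds(2), 2.0);
    assert_eq!(backoff_seconds(3), 4.0);
    assert_eq!(backoff_seconds(5), 16.0);
    assert_eq!(backoff_seconds(6), 30.0); // 2^5 = 32 seconds, capped at 30
}
```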
- } - DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - } => { - register_sync_status(sync_start, sync_name, Some(false)); - Some(TimelineSyncState::AwaitsDownload(disk_consistent_lsn)) - } - DownloadedTimeline::Successful { - disk_consistent_lsn, - } => { - register_sync_status(sync_start, sync_name, Some(true)); - Some(TimelineSyncState::Ready(disk_consistent_lsn)) - } - } - } - SyncKind::Upload(layer_upload) => { - let sync_status = upload_timeline_checkpoint( - conf, - remote_assets, - task.sync_id, - layer_upload, - task.retries + 1, - ) - .await; - register_sync_status(sync_start, sync_name, sync_status); - None - } - } -} - -fn schedule_first_sync_tasks( - index: &RemoteTimelineIndex, - local_timeline_files: HashMap)>, -) -> HashMap> { - let mut initial_timeline_statuses: HashMap> = - HashMap::new(); - - let mut new_sync_tasks = - VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len())); - - for (sync_id, (local_metadata, local_files)) in local_timeline_files { - let local_disk_consistent_lsn = local_metadata.disk_consistent_lsn(); - - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - match index.timeline_entry(&sync_id) { - Some(index_entry) => { - let timeline_status = compare_local_and_remote_timeline( - &mut new_sync_tasks, - sync_id, - local_metadata, - local_files, - index_entry, - ); - match timeline_status { - Some(timeline_status) => { - initial_timeline_statuses - .entry(tenant_id) - .or_default() - .insert(timeline_id, timeline_status); - } - None => error!( - "Failed to compare local and remote timeline for task {}", - sync_id - ), - } - } - None => { - new_sync_tasks.push_back(SyncTask::new( - sync_id, - 0, - SyncKind::Upload(NewCheckpoint { - layers: local_files, - metadata: local_metadata, - }), - )); - initial_timeline_statuses - .entry(tenant_id) - .or_default() - .insert( - timeline_id, - TimelineSyncState::Ready(local_disk_consistent_lsn), - ); - } - } - } - - let unprocessed_remote_ids = |remote_id: &ZTenantTimelineId| { - initial_timeline_statuses - .get(&remote_id.tenant_id) - .and_then(|timelines| timelines.get(&remote_id.timeline_id)) - .is_none() - }; - for unprocessed_remote_id in index - .all_sync_ids() - .filter(unprocessed_remote_ids) - .collect::>() - { - let ZTenantTimelineId { - tenant_id: cloud_only_tenant_id, - timeline_id: cloud_only_timeline_id, - } = unprocessed_remote_id; - match index - .timeline_entry(&unprocessed_remote_id) - .and_then(TimelineIndexEntry::disk_consistent_lsn) - { - Some(remote_disk_consistent_lsn) => { - initial_timeline_statuses - .entry(cloud_only_tenant_id) - .or_default() - .insert( - cloud_only_timeline_id, - TimelineSyncState::CloudOnly(remote_disk_consistent_lsn), - ); - } - None => error!( - "Failed to find disk consistent LSN for remote timeline {}", - unprocessed_remote_id - ), - } - } - - new_sync_tasks.into_iter().for_each(|task| { - sync_queue::push(task); - }); - initial_timeline_statuses -} - -fn compare_local_and_remote_timeline( - new_sync_tasks: &mut VecDeque, - sync_id: ZTenantTimelineId, - local_metadata: TimelineMetadata, - local_files: Vec, - remote_entry: &TimelineIndexEntry, -) -> Option { - let local_lsn = local_metadata.disk_consistent_lsn(); - let uploads = remote_entry.uploaded_checkpoints(); - - if !uploads.contains(&local_lsn) { - new_sync_tasks.push_back(SyncTask::new( - sync_id, - 0, - SyncKind::Upload(NewCheckpoint { - layers: local_files.clone(), - metadata: local_metadata, - }), - )); - } - - let uploads_count = uploads.len(); - 
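To make the decision in `compare_local_and_remote_timeline` concrete (the upload check above plus the `archives_to_skip` computation just below): with a local `disk_consistent_lsn` of 0x30 and remote checkpoints at 0x20 and 0x40, the local checkpoint is missing remotely, so an upload is scheduled, and one remote archive is newer than the local state, so a download is scheduled too and the timeline starts out as awaiting download. A minimal sketch of that classification, using plain `u64` LSNs and hypothetical names:

```rust
use std::collections::BTreeSet;

#[derive(Debug, PartialEq)]
enum FirstSyncState {
    Ready,
    AwaitsDownload,
}

// `local_lsn` is the local disk_consistent_lsn, `remote_checkpoints` the LSNs already
// uploaded; returns (schedule_upload, schedule_download, initial_state).
fn classify(local_lsn: u64, remote_checkpoints: &BTreeSet<u64>) -> (bool, bool, FirstSyncState) {
    let schedule_upload = !remote_checkpoints.contains(&local_lsn);
    let newer_remote_archives = remote_checkpoints.iter().any(|&lsn| lsn > local_lsn);
    let state = if newer_remote_archives {
        FirstSyncState::AwaitsDownload
    } else {
        FirstSyncState::Ready
    };
    (schedule_upload, newer_remote_archives, state)
}

fn main() {
    let remote: BTreeSet<u64> = [0x20, 0x40].into_iter().collect();
    assert_eq!(
        classify(0x30, &remote),
        (true, true, FirstSyncState::AwaitsDownload)
    );
    assert_eq!(classify(0x40, &remote), (false, false, FirstSyncState::Ready));
}
```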
let archives_to_skip: BTreeSet = uploads - .into_iter() - .filter(|upload_lsn| upload_lsn <= &local_lsn) - .map(ArchiveId) - .collect(); - Some(if archives_to_skip.len() != uploads_count { - new_sync_tasks.push_back(SyncTask::new( - sync_id, - 0, - SyncKind::Download(TimelineDownload { - files_to_skip: Arc::new(local_files.into_iter().collect()), - archives_to_skip, - }), - )); - TimelineSyncState::AwaitsDownload(remote_entry.disk_consistent_lsn()?) - } else { - TimelineSyncState::Ready(remote_entry.disk_consistent_lsn().unwrap_or(local_lsn)) - }) -} - -fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { - let secs_elapsed = sync_start.elapsed().as_secs_f64(); - debug!("Processed a sync task in {} seconds", secs_elapsed); - match sync_status { - Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), - Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), - None => return, - } - .observe(secs_elapsed) -} - -async fn update_index_description< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - (storage, index): &(S, RwLock), - timeline_dir: &Path, - id: ZTenantTimelineId, -) -> anyhow::Result { - let mut index_write = index.write().await; - let full_index = match index_write.timeline_entry(&id) { - None => bail!("Timeline not found for sync id {}", id), - Some(TimelineIndexEntry::Full(_)) => bail!("Index is already populated for sync id {}", id), - Some(TimelineIndexEntry::Description(description)) => { - let mut archive_header_downloads = FuturesUnordered::new(); - for (&archive_id, description) in description { - archive_header_downloads.push(async move { - let header = download_archive_header(storage, timeline_dir, description) - .await - .map_err(|e| (e, archive_id))?; - Ok((archive_id, description.header_size, header)) - }); - } - - let mut full_index = RemoteTimeline::empty(); - while let Some(header_data) = archive_header_downloads.next().await { - match header_data { - Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size), - Err((e, archive_id)) => bail!( - "Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}", - id.tenant_id, id.timeline_id, archive_id.0, - e - ), - } - } - full_index - } - }; - index_write.add_timeline_entry(id, TimelineIndexEntry::Full(full_index.clone())); - Ok(full_index) -} - -async fn download_archive_header< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - storage: &S, - timeline_dir: &Path, - description: &ArchiveDescription, -) -> anyhow::Result { - let mut header_buf = std::io::Cursor::new(Vec::new()); - let remote_path = storage.storage_path(&timeline_dir.join(&description.archive_name))?; - storage - .download_range( - &remote_path, - 0, - Some(description.header_size), - &mut header_buf, - ) - .await?; - let header_buf = header_buf.into_inner(); - let header = read_archive_header(&description.archive_name, &mut header_buf.as_slice()).await?; - Ok(header) -} - -async fn tenant_branch_files( - conf: &'static PageServerConf, - tenant_id: ZTenantId, -) -> anyhow::Result> { - let branches_dir = conf.branches_path(&tenant_id); - if !branches_dir.exists() { - return Ok(HashSet::new()); - } - - let mut branch_entries = fs::read_dir(&branches_dir) - .await - .context("Failed to list tenant branches dir contents")?; - - let mut branch_files = HashSet::new(); - while let Some(branch_entry) = branch_entries.next_entry().await? 
{ - if branch_entry.file_type().await?.is_file() { - branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?); - } - } - Ok(branch_files) -} - -#[cfg(test)] -mod test_utils { - use std::{ - collections::{BTreeMap, BTreeSet}, - fs, - }; - - use super::*; - use crate::{ - layered_repository::metadata::metadata_path, remote_storage::local_fs::LocalFs, - repository::repo_harness::RepoHarness, - }; - use zenith_utils::lsn::Lsn; - - #[track_caller] - pub async fn ensure_correct_timeline_upload( - harness: &RepoHarness, - remote_assets: Arc<(LocalFs, RwLock)>, - timeline_id: ZTimelineId, - new_upload: NewCheckpoint, - ) { - let sync_id = ZTenantTimelineId::new(harness.tenant_id, timeline_id); - upload_timeline_checkpoint( - harness.conf, - Arc::clone(&remote_assets), - sync_id, - new_upload.clone(), - 0, - ) - .await; - - let (storage, index) = remote_assets.as_ref(); - assert_index_descriptions( - index, - RemoteTimelineIndex::try_parse_descriptions_from_paths( - harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - ) - .await; - - let new_remote_timeline = expect_timeline(index, sync_id).await; - let new_remote_lsn = new_remote_timeline - .checkpoints() - .max() - .expect("Remote timeline should have an lsn after reupload"); - let upload_lsn = new_upload.metadata.disk_consistent_lsn(); - assert!( - new_remote_lsn >= upload_lsn, - "Remote timeline after upload should have the biggest Lsn out of all uploads" - ); - assert!( - new_remote_timeline.contains_checkpoint_at(upload_lsn), - "Should contain upload lsn among the remote ones" - ); - - let remote_files_after_upload = new_remote_timeline - .stored_files(&harness.conf.timeline_path(&timeline_id, &harness.tenant_id)); - for new_uploaded_layer in &new_upload.layers { - assert!( - remote_files_after_upload.contains(new_uploaded_layer), - "Remote files do not contain layer that should be uploaded: '{}'", - new_uploaded_layer.display() - ); - } - - assert_timeline_files_match(harness, timeline_id, new_remote_timeline); - } - - pub async fn expect_timeline( - index: &RwLock, - sync_id: ZTenantTimelineId, - ) -> RemoteTimeline { - if let Some(TimelineIndexEntry::Full(remote_timeline)) = - index.read().await.timeline_entry(&sync_id) - { - remote_timeline.clone() - } else { - panic!( - "Expect to have a full remote timeline in the index for sync id {}", - sync_id - ) - } - } - - #[track_caller] - pub async fn assert_index_descriptions( - index: &RwLock, - expected_index_with_descriptions: RemoteTimelineIndex, - ) { - let index_read = index.read().await; - let actual_sync_ids = index_read.all_sync_ids().collect::>(); - let expected_sync_ids = expected_index_with_descriptions - .all_sync_ids() - .collect::>(); - assert_eq!( - actual_sync_ids, expected_sync_ids, - "Index contains unexpected sync ids" - ); - - let mut actual_branches = BTreeMap::new(); - let mut expected_branches = BTreeMap::new(); - let mut actual_timeline_entries = BTreeMap::new(); - let mut expected_timeline_entries = BTreeMap::new(); - for sync_id in actual_sync_ids { - actual_branches.insert( - sync_id.tenant_id, - index_read - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); - expected_branches.insert( - sync_id.tenant_id, - expected_index_with_descriptions - .branch_files(sync_id.tenant_id) - .into_iter() - .flat_map(|branch_paths| branch_paths.iter()) - .cloned() - .collect::>(), - ); 
- - actual_timeline_entries.insert( - sync_id, - index_read.timeline_entry(&sync_id).unwrap().clone(), - ); - expected_timeline_entries.insert( - sync_id, - expected_index_with_descriptions - .timeline_entry(&sync_id) - .unwrap() - .clone(), - ); - } - drop(index_read); - - assert_eq!( - actual_branches, expected_branches, - "Index contains unexpected branches" - ); - - for (sync_id, actual_timeline_entry) in actual_timeline_entries { - let expected_timeline_description = expected_timeline_entries - .remove(&sync_id) - .unwrap_or_else(|| { - panic!( - "Failed to find an expected timeline with id {} in the index", - sync_id - ) - }); - let expected_timeline_description = match expected_timeline_description { - TimelineIndexEntry::Description(description) => description, - TimelineIndexEntry::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id), - }; - - match actual_timeline_entry { - TimelineIndexEntry::Description(actual_descriptions) => { - assert_eq!( - actual_descriptions, expected_timeline_description, - "Index contains unexpected descriptions entry for sync id {}", - sync_id - ) - } - TimelineIndexEntry::Full(actual_full_entry) => { - let expected_lsns = expected_timeline_description - .values() - .map(|description| description.disk_consistent_lsn) - .collect::>(); - assert_eq!( - actual_full_entry.checkpoints().collect::>(), - expected_lsns, - "Timeline {} should have the same checkpoints uploaded", - sync_id, - ) - } - } - } - } - - pub fn assert_timeline_files_match( - harness: &RepoHarness, - remote_timeline_id: ZTimelineId, - remote_timeline: RemoteTimeline, - ) { - let local_timeline_dir = harness.timeline_path(&remote_timeline_id); - let local_paths = fs::read_dir(&local_timeline_dir) - .unwrap() - .map(|dir| dir.unwrap().path()) - .collect::>(); - let mut reported_remote_files = remote_timeline.stored_files(&local_timeline_dir); - let local_metadata_path = - metadata_path(harness.conf, remote_timeline_id, harness.tenant_id); - let local_metadata = TimelineMetadata::from_bytes( - &fs::read(&local_metadata_path) - .expect("Failed to read metadata file when comparing remote and local image files"), - ) - .expect( - "Failed to parse metadata file contents when comparing remote and local image files", - ); - assert!( - remote_timeline.contains_checkpoint_at(local_metadata.disk_consistent_lsn()), - "Should contain local lsn among the remote ones after the upload" - ); - reported_remote_files.insert(local_metadata_path); - - assert_eq!( - local_paths, reported_remote_files, - "Remote image files and local image files are different, missing locally: {:?}, missing remotely: {:?}", - reported_remote_files.difference(&local_paths).collect::>(), - local_paths.difference(&reported_remote_files).collect::>(), - ); - - if let Some(remote_file) = reported_remote_files.iter().next() { - let actual_remote_paths = fs::read_dir( - remote_file - .parent() - .expect("Remote files are expected to have their timeline dir as parent"), - ) - .unwrap() - .map(|dir| dir.unwrap().path()) - .collect::>(); - - let unreported_remote_files = actual_remote_paths - .difference(&reported_remote_files) - .collect::>(); - assert!( - unreported_remote_files.is_empty(), - "Unexpected extra remote files that were not listed: {:?}", - unreported_remote_files - ) - } - } - - pub fn create_local_timeline( - harness: &RepoHarness, - timeline_id: ZTimelineId, - filenames: &[&str], - metadata: TimelineMetadata, - ) -> anyhow::Result { - let timeline_path = 
harness.timeline_path(&timeline_id); - fs::create_dir_all(&timeline_path)?; - - let mut layers = Vec::with_capacity(filenames.len()); - for &file in filenames { - let file_path = timeline_path.join(file); - fs::write(&file_path, dummy_contents(file).into_bytes())?; - layers.push(file_path); - } - - fs::write( - metadata_path(harness.conf, timeline_id, harness.tenant_id), - metadata.to_bytes()?, - )?; - - Ok(NewCheckpoint { layers, metadata }) - } - - fn dummy_contents(name: &str) -> String { - format!("contents for {}", name) - } - - pub fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { - TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0)) - } -} diff --git a/pageserver/src/remote_storage/storage_sync/compression.rs b/pageserver/src/remote_storage/storage_sync/compression.rs deleted file mode 100644 index ca245359bf..0000000000 --- a/pageserver/src/remote_storage/storage_sync/compression.rs +++ /dev/null @@ -1,613 +0,0 @@ -//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data, -//! without holding the entire data in memory. -//! For the latter, both compress and uncompress functions operate buffered streams (currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]), -//! not attempting to hold the entire archive in memory. -//! -//! The compression is done with zstd streaming algorithm via the `async-compression` crate. -//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that's both async and has an API to manage the part of an archive. -//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression. -//! -//! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code. -//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file. -//! When compressed, the metadata file is always required and stored as the last file in the archive stream. -//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first. -//! -//! Archive structure: -//! +----------------------------------------+ -//! | header | file_1, ..., file_k, metadata | -//! +----------------------------------------+ -//! -//! The archive consists of two separate zstd archives: -//! * header archive, that contains all files names and their sizes and relative paths in the timeline directory -//! Header is a Rust structure, serialized into bytes and compressed with zstd. -//! * files archive, that has metadata file as the last one, all compressed with zstd into a single binary blob -//! -//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file. -//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`. -//! This way, the header could be retrieved without reading an entire archive file. 
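A small round trip of the naming scheme described above: the archive file name is the 16-hex-digit `disk_consistent_lsn`, the `-.zst_` marker, and the compressed header size, so both values can be recovered from a plain file listing without opening the archive. The sketch below uses a bare `u64` in place of `Lsn` and mirrors the format/parse pair defined further down in this file:

```rust
const ARCHIVE_EXTENSION: &str = "-.zst_";

// Encode the checkpoint LSN and the compressed header size into the archive name.
fn archive_name(disk_consistent_lsn: u64, header_size: u64) -> String {
    format!("{:016X}{}{}", disk_consistent_lsn, ARCHIVE_EXTENSION, header_size)
}

// Recover both values from the name alone, as the remote index restoration does.
fn parse_archive_name(name: &str) -> Option<(u64, u64)> {
    let (lsn_str, header_size_str) = name.rsplit_once(ARCHIVE_EXTENSION)?;
    let disk_consistent_lsn = u64::from_str_radix(lsn_str, 16).ok()?;
    let header_size = header_size_str.parse().ok()?;
    Some((disk_consistent_lsn, header_size))
}

fn main() {
    let name = archive_name(0x016B_9150, 9732);
    assert_eq!(name, "00000000016B9150-.zst_9732");
    assert_eq!(parse_archive_name(&name), Some((0x016B_9150, 9732)));
}
```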
- -use std::{ - collections::BTreeSet, - future::Future, - io::Cursor, - path::{Path, PathBuf}, - sync::Arc, -}; - -use anyhow::{bail, ensure, Context}; -use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder}; -use serde::{Deserialize, Serialize}; -use tokio::{ - fs, - io::{self, AsyncReadExt, AsyncWriteExt}, -}; -use tracing::*; -use zenith_utils::{bin_ser::BeSer, lsn::Lsn}; - -use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME}; - -use super::index::RelativePath; - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ArchiveHeader { - /// All regular timeline files, excluding the metadata file. - pub files: Vec, - // Metadata file name is known to the system, as its location relative to the timeline dir, - // so no need to store anything but its size in bytes. - pub metadata_file_size: u64, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] -pub struct FileEntry { - /// Uncompressed file size, bytes. - pub size: u64, - /// A path, relative to the directory root, used when compressing the directory contents. - pub subpath: RelativePath, -} - -const ARCHIVE_EXTENSION: &str = "-.zst_"; -const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024; - -/// Streams an archive of files given into a stream target, defined by the closure. -/// -/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into. -/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents. -/// -/// Performs the compression in multiple steps: -/// * prepares an archive header, stripping the `source_dir` prefix from the `files` -/// * generates the name of the archive -/// * prepares archive producer future, knowing the header and the file list -/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming. -/// The writer end gets into the archive producer future, to put the header and a stream of compressed files. -/// * prepares archive consumer future, by executing the provided closure -/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere. -/// * runs and waits for both futures to complete -/// * on a successful completion of both futures, header, its size and the user-defined consumer future return data is returned -/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data, -/// needed for future processing. 
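Before the implementation below, here is a stripped-down version of the wiring the doc comment above describes: `tokio::io::duplex` provides the connected reader/writer pair, the writer half goes to the future that produces the archive bytes, the reader half to the consumer, and both are spawned and awaited together. No compression is involved in this sketch and the buffer contents are a stand-in for the real archive stream:

```rust
use tokio::io::{self, AsyncReadExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> io::Result<()> {
    // A bounded in-memory pipe: whatever the producer writes, the consumer reads.
    let (mut writer, mut reader) = io::duplex(64 * 1024);

    let producer = tokio::spawn(async move {
        writer.write_all(b"archive bytes go here").await?;
        writer.shutdown().await // signal EOF to the reader half
    });
    let consumer = tokio::spawn(async move {
        let mut received = Vec::new();
        reader.read_to_end(&mut received).await?;
        io::Result::Ok(received)
    });

    // Run both halves to completion, as the archiving code does with `tokio::join!`.
    let (produced, consumed) = tokio::join!(producer, consumer);
    produced.expect("producer task panicked")?;
    let received = consumed.expect("consumer task panicked")?;
    assert_eq!(received, b"archive bytes go here".to_vec());
    Ok(())
}
```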
-pub async fn archive_files_as_stream( - source_dir: &Path, - files: impl Iterator, - metadata: &TimelineMetadata, - create_archive_consumer: Cons, -) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)> -where - Cons: FnOnce(Box, String) -> Fut - + Send - + 'static, - Fut: Future> + Send + 'static, - ConsRet: Send + Sync + 'static, -{ - let metadata_bytes = metadata - .to_bytes() - .context("Failed to create metadata bytes")?; - let (archive_header, compressed_header_bytes) = - prepare_header(source_dir, files, &metadata_bytes) - .await - .context("Failed to prepare file for archivation")?; - - let header_size = compressed_header_bytes.len() as u64; - let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES); - let archive_filler = write_archive_contents( - source_dir.to_path_buf(), - archive_header.clone(), - metadata_bytes, - write, - ); - let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size); - let archive_stream = - Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read))); - - let (archive_creation_result, archive_upload_result) = tokio::join!( - tokio::spawn(archive_filler), - tokio::spawn(async move { - create_archive_consumer(Box::new(archive_stream), archive_name).await - }) - ); - archive_creation_result - .context("Failed to spawn archive creation future")? - .context("Failed to create an archive")?; - let upload_return_value = archive_upload_result - .context("Failed to spawn archive upload future")? - .context("Failed to upload the archive")?; - - Ok((archive_header, header_size, upload_return_value)) -} - -/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive, -/// that contains files and is located after the header. -/// S3 allows downloading partial file contents for a given file key (i.e. name), to accommodate this retrieval, -/// a closure is used. -/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the -/// consumer and the receiver ends are swapped, since the uncompression happens. -pub async fn uncompress_file_stream_with_index( - destination_dir: PathBuf, - files_to_skip: Arc>, - disk_consistent_lsn: Lsn, - header: ArchiveHeader, - header_size: u64, - create_archive_file_part: Prod, -) -> anyhow::Result -where - Prod: FnOnce(Box, String) -> Fut - + Send - + 'static, - Fut: Future> + Send + 'static, - ProdRet: Send + Sync + 'static, -{ - let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES); - let archive_name = archive_name(disk_consistent_lsn, header_size); - - let (archive_download_result, archive_uncompress_result) = tokio::join!( - tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }), - tokio::spawn(async move { - uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await - }) - ); - - let download_value = archive_download_result - .context("Failed to spawn archive download future")? - .context("Failed to download an archive")?; - archive_uncompress_result - .context("Failed to spawn archive uncompress future")? 
- .context("Failed to uncompress the archive")?; - - Ok(download_value) -} - -/// Reads archive header from the stream given: -/// * parses the file name to get the header size -/// * reads the exact amount of bytes -/// * uncompresses and deserializes those -pub async fn read_archive_header( - archive_name: &str, - from: &mut A, -) -> anyhow::Result { - let (_, header_size) = parse_archive_name(Path::new(archive_name))?; - - let mut compressed_header_bytes = vec![0; header_size as usize]; - from.read_exact(&mut compressed_header_bytes) - .await - .with_context(|| { - format!( - "Failed to read header header from the archive {}", - archive_name - ) - })?; - - let mut header_bytes = Vec::new(); - ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice())) - .read_to_end(&mut header_bytes) - .await - .context("Failed to decompress a header from the archive")?; - - Ok(ArchiveHeader::des(&header_bytes) - .context("Failed to deserialize a header from the archive")?) -} - -/// Reads the archive metadata out of the archive name: -/// * `disk_consistent_lsn` of the checkpoint that was archived -/// * size of the archive header -pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> { - let archive_name = archive_path - .file_name() - .with_context(|| format!("Archive '{}' has no file name", archive_path.display()))? - .to_string_lossy(); - let (lsn_str, header_size_str) = - archive_name - .rsplit_once(ARCHIVE_EXTENSION) - .with_context(|| { - format!( - "Archive '{}' has incorrect extension, expected to contain '{}'", - archive_path.display(), - ARCHIVE_EXTENSION - ) - })?; - let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| { - format!( - "Archive '{}' has an invalid disk consistent lsn in its extension", - archive_path.display(), - ) - })?; - let header_size = header_size_str.parse::().with_context(|| { - format!( - "Archive '{}' has an invalid a header offset number in its extension", - archive_path.display(), - ) - })?; - Ok((disk_consistent_lsn, header_size)) -} - -fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String { - let archive_name = format!( - "{:016X}{ARCHIVE_EXTENSION}{}", - u64::from(disk_consistent_lsn), - header_size, - ARCHIVE_EXTENSION = ARCHIVE_EXTENSION, - ); - archive_name -} - -pub async fn uncompress_with_header( - files_to_skip: &BTreeSet, - destination_dir: &Path, - header: ArchiveHeader, - archive_after_header: impl io::AsyncRead + Send + Sync + Unpin, -) -> anyhow::Result<()> { - debug!("Uncompressing archive into {}", destination_dir.display()); - let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header)); - - if !destination_dir.exists() { - fs::create_dir_all(&destination_dir) - .await - .with_context(|| { - format!( - "Failed to create target directory at {}", - destination_dir.display() - ) - })?; - } else if !destination_dir.is_dir() { - bail!( - "Destination path '{}' is not a valid directory", - destination_dir.display() - ); - } - debug!("Will extract {} files from the archive", header.files.len()); - for entry in header.files { - uncompress_entry( - &mut archive, - &entry.subpath.as_path(destination_dir), - entry.size, - files_to_skip, - ) - .await - .with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?; - } - uncompress_entry( - &mut archive, - &destination_dir.join(METADATA_FILE_NAME), - header.metadata_file_size, - files_to_skip, - ) - .await - .context("Failed to uncompress the metadata entry")?; - Ok(()) -} - -async fn uncompress_entry( - archive: 
&mut ZstdDecoder>, - destination_path: &Path, - entry_size: u64, - files_to_skip: &BTreeSet, -) -> anyhow::Result<()> { - if let Some(parent) = destination_path.parent() { - fs::create_dir_all(parent).await.with_context(|| { - format!( - "Failed to create parent directory for {}", - destination_path.display() - ) - })?; - }; - - if files_to_skip.contains(destination_path) { - debug!("Skipping {}", destination_path.display()); - copy_n_bytes(entry_size, archive, &mut io::sink()) - .await - .context("Failed to skip bytes in the archive")?; - return Ok(()); - } - - let mut destination = - io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| { - format!( - "Failed to open file {} for extraction", - destination_path.display() - ) - })?); - copy_n_bytes(entry_size, archive, &mut destination) - .await - .with_context(|| { - format!( - "Failed to write extracted archive contents into file {}", - destination_path.display() - ) - })?; - destination - .flush() - .await - .context("Failed to flush the streaming archive bytes")?; - Ok(()) -} - -async fn write_archive_contents( - source_dir: PathBuf, - header: ArchiveHeader, - metadata_bytes: Vec, - mut archive_input: io::DuplexStream, -) -> anyhow::Result<()> { - debug!("Starting writing files into archive"); - for file_entry in header.files { - let path = file_entry.subpath.as_path(&source_dir); - let mut source_file = - io::BufReader::new(fs::File::open(&path).await.with_context(|| { - format!( - "Failed to open file for archiving to path {}", - path.display() - ) - })?); - let bytes_written = io::copy(&mut source_file, &mut archive_input) - .await - .with_context(|| { - format!( - "Failed to open add a file into archive, file path {}", - path.display() - ) - })?; - ensure!( - file_entry.size == bytes_written, - "File {} was written to the archive incompletely", - path.display() - ); - trace!( - "Added file '{}' ({} bytes) into the archive", - path.display(), - bytes_written - ); - } - let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input) - .await - .context("Failed to add metadata into the archive")?; - ensure!( - header.metadata_file_size == metadata_bytes_written, - "Metadata file was written to the archive incompletely", - ); - - archive_input - .shutdown() - .await - .context("Failed to finalize the archive")?; - debug!("Successfully streamed all files into the archive"); - Ok(()) -} - -async fn prepare_header( - source_dir: &Path, - files: impl Iterator, - metadata_bytes: &[u8], -) -> anyhow::Result<(ArchiveHeader, Vec)> { - let mut archive_files = Vec::new(); - for file_path in files { - let file_metadata = fs::metadata(file_path).await.with_context(|| { - format!( - "Failed to read metadata during archive indexing for {}", - file_path.display() - ) - })?; - ensure!( - file_metadata.is_file(), - "Archive indexed path {} is not a file", - file_path.display() - ); - - if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) { - let entry = FileEntry { - subpath: RelativePath::new(source_dir, file_path).with_context(|| { - format!( - "File '{}' does not belong to pageserver workspace", - file_path.display() - ) - })?, - size: file_metadata.len(), - }; - archive_files.push(entry); - } - } - - let header = ArchiveHeader { - files: archive_files, - metadata_file_size: metadata_bytes.len() as u64, - }; - - debug!("Appending a header for {} files", header.files.len()); - let header_bytes = header.ser().context("Failed to serialize a header")?; - debug!("Header bytes 
len {}", header_bytes.len()); - let mut compressed_header_bytes = Vec::new(); - ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice())) - .read_to_end(&mut compressed_header_bytes) - .await - .context("Failed to compress header bytes")?; - debug!( - "Compressed header bytes len {}", - compressed_header_bytes.len() - ); - Ok((header, compressed_header_bytes)) -} - -async fn copy_n_bytes( - n: u64, - from: &mut (impl io::AsyncRead + Send + Sync + Unpin), - into: &mut (impl io::AsyncWrite + Send + Sync + Unpin), -) -> anyhow::Result<()> { - let bytes_written = io::copy(&mut from.take(n), into).await?; - ensure!( - bytes_written == n, - "Failed to read exactly {} bytes from the input, bytes written: {}", - n, - bytes_written, - ); - Ok(()) -} - -#[cfg(test)] -mod tests { - use tokio::{fs, io::AsyncSeekExt}; - - use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID}; - - use super::*; - - #[tokio::test] - async fn compress_and_uncompress() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("compress_and_uncompress")?; - let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID); - init_directory( - &timeline_dir, - vec![ - ("first", "first_contents"), - ("second", "second_contents"), - (METADATA_FILE_NAME, "wrong_metadata"), - ], - ) - .await?; - let timeline_files = list_file_paths_with_contents(&timeline_dir).await?; - assert_eq!( - timeline_files, - vec![ - ( - timeline_dir.join("first"), - FileContents::Text("first_contents".to_string()) - ), - ( - timeline_dir.join(METADATA_FILE_NAME), - FileContents::Text("wrong_metadata".to_string()) - ), - ( - timeline_dir.join("second"), - FileContents::Text("second_contents".to_string()) - ), - ], - "Initial timeline contents should contain two normal files and a wrong metadata file" - ); - - let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0)); - let paths_to_archive = timeline_files - .into_iter() - .map(|(path, _)| path) - .collect::>(); - - let tempdir = tempfile::tempdir()?; - let base_path = tempdir.path().to_path_buf(); - let (header, header_size, archive_target) = archive_files_as_stream( - &timeline_dir, - paths_to_archive.iter(), - &metadata, - move |mut archive_streamer, archive_name| async move { - let archive_target = base_path.join(&archive_name); - let mut archive_file = fs::File::create(&archive_target).await?; - io::copy(&mut archive_streamer, &mut archive_file).await?; - Ok(archive_target) - }, - ) - .await?; - - let mut file = fs::File::open(&archive_target).await?; - file.seek(io::SeekFrom::Start(header_size)).await?; - let target_dir = tempdir.path().join("extracted"); - uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?; - - let extracted_files = list_file_paths_with_contents(&target_dir).await?; - - assert_eq!( - extracted_files, - vec![ - ( - target_dir.join("first"), - FileContents::Text("first_contents".to_string()) - ), - ( - target_dir.join(METADATA_FILE_NAME), - FileContents::Binary(metadata.to_bytes()?) - ), - ( - target_dir.join("second"), - FileContents::Text("second_contents".to_string()) - ), - ], - "Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments" - ); - - Ok(()) - } - - async fn init_directory( - root: &Path, - files_with_contents: Vec<(&str, &str)>, - ) -> anyhow::Result<()> { - fs::create_dir_all(root).await?; - for (file_name, contents) in files_with_contents { - fs::File::create(root.join(file_name)) - .await? 
- .write_all(contents.as_bytes()) - .await?; - } - Ok(()) - } - - #[derive(PartialEq, Eq, PartialOrd, Ord)] - enum FileContents { - Text(String), - Binary(Vec), - } - - impl std::fmt::Debug for FileContents { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Text(text) => f.debug_tuple("Text").field(text).finish(), - Self::Binary(bytes) => f - .debug_tuple("Binary") - .field(&format!("{} bytes", bytes.len())) - .finish(), - } - } - } - - async fn list_file_paths_with_contents( - root: &Path, - ) -> anyhow::Result> { - let mut file_paths = Vec::new(); - - let mut dir_listings = vec![fs::read_dir(root).await?]; - while let Some(mut dir_listing) = dir_listings.pop() { - while let Some(entry) = dir_listing.next_entry().await? { - let entry_path = entry.path(); - if entry_path.is_file() { - let contents = match String::from_utf8(fs::read(&entry_path).await?) { - Ok(text) => FileContents::Text(text), - Err(e) => FileContents::Binary(e.into_bytes()), - }; - file_paths.push((entry_path, contents)); - } else if entry_path.is_dir() { - dir_listings.push(fs::read_dir(entry_path).await?); - } else { - info!( - "Skipping path '{}' as it's not a file or a directory", - entry_path.display() - ); - } - } - } - - file_paths.sort(); - Ok(file_paths) - } -} diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs deleted file mode 100644 index f268fc442a..0000000000 --- a/pageserver/src/remote_storage/storage_sync/download.rs +++ /dev/null @@ -1,428 +0,0 @@ -//! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory. -//! Currently, tenant branch files are also downloaded, but this does not appear final. - -use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; - -use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; -use tokio::{fs, sync::RwLock}; -use tracing::{debug, error, trace, warn}; -use zenith_utils::{lsn::Lsn, zid::ZTenantId}; - -use crate::{ - config::PageServerConf, - layered_repository::metadata::{metadata_path, TimelineMetadata}, - remote_storage::{ - storage_sync::{ - compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files, - update_index_description, SyncKind, SyncTask, - }, - RemoteStorage, ZTenantTimelineId, - }, -}; - -use super::{ - index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex}, - TimelineDownload, -}; - -/// Timeline download result, with extra data, needed for downloading. -pub(super) enum DownloadedTimeline { - /// Remote timeline data is either absent or corrupt, no download possible. - Abort, - /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. - /// Initial download failed due to some error, the download task is rescheduled for another retry. - FailedAndRescheduled { disk_consistent_lsn: Lsn }, - /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. - /// Initial download successful. - Successful { disk_consistent_lsn: Lsn }, -} - -/// Attempts to download and uncompress files from all remote archives for the timeline given. -/// Timeline files that already exist locally are skipped during the download, but the local metadata file is -/// updated in the end of every checkpoint archive extraction. -/// -/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded. 
-/// -/// On an error, bumps the retries count and reschedules the download, with updated archive skip list -/// (for any new successful archive downloads and extractions). -pub(super) async fn download_timeline< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, - sync_id: ZTenantTimelineId, - mut download: TimelineDownload, - retries: u32, -) -> DownloadedTimeline { - debug!("Downloading layers for sync id {}", sync_id); - - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let index_read = remote_assets.1.read().await; - let remote_timeline = match index_read.timeline_entry(&sync_id) { - None => { - error!("Cannot download: no timeline is present in the index for given ids"); - return DownloadedTimeline::Abort; - } - Some(index_entry) => match index_entry { - TimelineIndexEntry::Full(remote_timeline) => Cow::Borrowed(remote_timeline), - TimelineIndexEntry::Description(_) => { - let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn(); - drop(index_read); - debug!("Found timeline description for the given ids, downloading the full index"); - match update_index_description( - remote_assets.as_ref(), - &conf.timeline_path(&timeline_id, &tenant_id), - sync_id, - ) - .await - { - Ok(remote_timeline) => Cow::Owned(remote_timeline), - Err(e) => { - error!("Failed to download full timeline index: {:?}", e); - return match remote_disk_consistent_lsn { - Some(disk_consistent_lsn) => { - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - } - } - None => { - error!("Cannot download: no disk consistent Lsn is present for the index entry"); - DownloadedTimeline::Abort - } - }; - } - } - } - }, - }; - let disk_consistent_lsn = match remote_timeline.checkpoints().max() { - Some(lsn) => lsn, - None => { - debug!("Cannot download: no disk consistent Lsn is present for the remote timeline"); - return DownloadedTimeline::Abort; - } - }; - - if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to download missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - return DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - }; - } - - debug!("Downloading timeline archives"); - let archives_to_download = remote_timeline - .checkpoints() - .map(ArchiveId) - .filter(|remote_archive| !download.archives_to_skip.contains(remote_archive)) - .collect::>(); - - let archives_total = archives_to_download.len(); - debug!("Downloading {} archives of a timeline", archives_total); - trace!("Archives to download: {:?}", archives_to_download); - - for (archives_downloaded, archive_id) in archives_to_download.into_iter().enumerate() { - match try_download_archive( - conf, - sync_id, - Arc::clone(&remote_assets), - remote_timeline.as_ref(), - archive_id, - Arc::clone(&download.files_to_skip), - ) - .await - { - Err(e) => { - let archives_left = archives_total - archives_downloaded; - error!( - "Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}", - archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - 
return DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - }; - } - Ok(()) => { - debug!("Successfully downloaded archive {:?}", archive_id); - download.archives_to_skip.insert(archive_id); - } - } - } - - debug!("Finished downloading all timeline's archives"); - DownloadedTimeline::Successful { - disk_consistent_lsn, - } -} - -async fn try_download_archive< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - ZTenantTimelineId { - tenant_id, - timeline_id, - }: ZTenantTimelineId, - remote_assets: Arc<(S, RwLock)>, - remote_timeline: &RemoteTimeline, - archive_id: ArchiveId, - files_to_skip: Arc>, -) -> anyhow::Result<()> { - debug!("Downloading archive {:?}", archive_id); - let archive_to_download = remote_timeline - .archive_data(archive_id) - .with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?; - let (archive_header, header_size) = remote_timeline - .restore_header(archive_id) - .context("Failed to restore header when downloading an archive")?; - - match read_local_metadata(conf, timeline_id, tenant_id).await { - Ok(local_metadata) => ensure!( - // need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded - local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(), - "Cannot download archive with Lsn {} since it's earlier than local Lsn {}", - archive_to_download.disk_consistent_lsn(), - local_metadata.disk_consistent_lsn() - ), - Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e), - } - compression::uncompress_file_stream_with_index( - conf.timeline_path(&timeline_id, &tenant_id), - files_to_skip, - archive_to_download.disk_consistent_lsn(), - archive_header, - header_size, - move |mut archive_target, archive_name| async move { - let archive_local_path = conf - .timeline_path(&timeline_id, &tenant_id) - .join(&archive_name); - let remote_storage = &remote_assets.0; - remote_storage - .download_range( - &remote_storage.storage_path(&archive_local_path)?, - header_size, - None, - &mut archive_target, - ) - .await - }, - ) - .await?; - - Ok(()) -} - -async fn read_local_metadata( - conf: &'static PageServerConf, - timeline_id: zenith_utils::zid::ZTimelineId, - tenant_id: ZTenantId, -) -> anyhow::Result { - let local_metadata_path = metadata_path(conf, timeline_id, tenant_id); - let local_metadata_bytes = fs::read(&local_metadata_path) - .await - .context("Failed to read local metadata file bytes")?; - Ok(TimelineMetadata::from_bytes(&local_metadata_bytes) - .context("Failed to read local metadata files bytes")?) 
-} - -async fn download_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(conf, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let local_branches_dir = conf.branches_path(&tenant_id); - if !local_branches_dir.exists() { - fs::create_dir_all(&local_branches_dir) - .await - .with_context(|| { - format!( - "Failed to create local branches directory at path '{}'", - local_branches_dir.display() - ) - })?; - } - - if let Some(remote_branches) = index.read().await.branch_files(tenant_id) { - let mut remote_only_branches_downloads = remote_branches - .difference(&local_branches) - .map(|remote_only_branch| async move { - let branches_dir = conf.branches_path(&tenant_id); - let remote_branch_path = remote_only_branch.as_path(&branches_dir); - let storage_path = - storage.storage_path(&remote_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - remote_branch_path.display() - ) - })?; - let mut target_file = fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&remote_branch_path) - .await - .with_context(|| { - format!( - "Failed to create local branch file at '{}'", - remote_branch_path.display() - ) - })?; - storage - .download(&storage_path, &mut target_file) - .await - .with_context(|| { - format!( - "Failed to download branch file from the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(()) - }) - .collect::>(); - - let mut branch_downloads_failed = false; - while let Some(download_result) = remote_only_branches_downloads.next().await { - if let Err(e) = download_result { - branch_downloads_failed = true; - error!("Failed to download a branch file: {:?}", e); - } - } - ensure!( - !branch_downloads_failed, - "Failed to download all branch files" - ); - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::collections::BTreeSet; - - use tempfile::tempdir; - use tokio::fs; - use zenith_utils::lsn::Lsn; - - use crate::{ - remote_storage::{ - local_fs::LocalFs, - storage_sync::test_utils::{ - assert_index_descriptions, assert_timeline_files_match, create_local_timeline, - dummy_metadata, ensure_correct_timeline_upload, expect_timeline, - }, - }, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[tokio::test] - async fn test_download_timeline() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("test_download_timeline")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - )); - let remote_assets = Arc::new((storage, index)); - let storage = &remote_assets.0; - let index = &remote_assets.1; - - let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID); - let regular_timeline = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - dummy_metadata(Lsn(0x30)), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - regular_timeline, - ) - .await; - // upload multiple checkpoints for the same timeline - let regular_timeline = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["c", "d"], - dummy_metadata(Lsn(0x40)), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - regular_timeline, - ) - .await; - - fs::remove_dir_all(®ular_timeline_path).await?; - let remote_regular_timeline = expect_timeline(index, sync_id).await; - - download_timeline( - repo_harness.conf, - Arc::clone(&remote_assets), - sync_id, - TimelineDownload { - files_to_skip: Arc::new(BTreeSet::new()), - archives_to_skip: BTreeSet::new(), - }, - 0, - ) - .await; - assert_index_descriptions( - index, - RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ), - ) - .await; - assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline); - - Ok(()) - } -} diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs deleted file mode 100644 index 3d2680948d..0000000000 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ /dev/null @@ -1,427 +0,0 @@ -//! In-memory index to track the tenant files on the remote strorage, mitigating the storage format differences between the local and remote files. -//! Able to restore itself from the storage archive data and reconstruct archive indices on demand. -//! -//! The index is intended to be portable, so deliberately does not store any local paths inside. -//! This way in the future, the index could be restored fast from its serialized stored form. - -use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, - path::{Path, PathBuf}, -}; - -use anyhow::{bail, ensure, Context}; -use serde::{Deserialize, Serialize}; -use tracing::debug; -use zenith_utils::{ - lsn::Lsn, - zid::{ZTenantId, ZTimelineId}, -}; - -use crate::{ - config::PageServerConf, - layered_repository::TIMELINES_SEGMENT_NAME, - remote_storage::{ - storage_sync::compression::{parse_archive_name, FileEntry}, - ZTenantTimelineId, - }, -}; - -use super::compression::ArchiveHeader; - -/// A part of the filesystem path, that needs a root to become a path again. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -pub struct RelativePath(String); - -impl RelativePath { - /// Attempts to strip off the base from path, producing a relative path or an error. - pub fn new>(base: &Path, path: P) -> anyhow::Result { - let relative = path - .as_ref() - .strip_prefix(base) - .context("path is not relative to base")?; - Ok(RelativePath(relative.to_string_lossy().to_string())) - } - - /// Joins the relative path with the base path. - pub fn as_path(&self, base: &Path) -> PathBuf { - base.join(&self.0) - } -} - -/// An index to track tenant files that exist on the remote storage. 
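RelativePath above is only a strip-prefix/rejoin helper, which is what makes the index portable across machines. A small usage sketch with made-up paths:

use std::path::Path;

fn relative_path_round_trip() -> anyhow::Result<()> {
    let branches_dir = Path::new("/data/tenants/some-tenant/branches");
    let branch_file = branches_dir.join("main");

    // Strip the local base so only the portable suffix is kept in the index...
    let relative = RelativePath::new(branches_dir, &branch_file)?;
    // ...and join it back onto any base when a concrete path is needed again.
    assert_eq!(relative.as_path(branches_dir), branch_file);
    Ok(())
}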
-/// Currently, timeline archives and branch files are tracked. -#[derive(Debug, Clone)] -pub struct RemoteTimelineIndex { - branch_files: HashMap>, - timeline_files: HashMap, -} - -impl RemoteTimelineIndex { - /// Attempts to parse file paths (not checking the file contents) and find files - /// that can be tracked wiht the index. - /// On parse falures, logs the error and continues, so empty index can be created from not suitable paths. - pub fn try_parse_descriptions_from_paths>( - conf: &'static PageServerConf, - paths: impl Iterator, - ) -> Self { - let mut index = Self { - branch_files: HashMap::new(), - timeline_files: HashMap::new(), - }; - for path in paths { - if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) { - debug!( - "Failed to parse path '{}' as index entry: {:#}", - path.as_ref().display(), - e - ); - } - } - index - } - - pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> { - self.timeline_files.get(id) - } - - pub fn timeline_entry_mut( - &mut self, - id: &ZTenantTimelineId, - ) -> Option<&mut TimelineIndexEntry> { - self.timeline_files.get_mut(id) - } - - pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) { - self.timeline_files.insert(id, entry); - } - - pub fn all_sync_ids(&self) -> impl Iterator + '_ { - self.timeline_files.keys().copied() - } - - pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) { - self.branch_files - .entry(tenant_id) - .or_insert_with(HashSet::new) - .insert(path); - } - - pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet> { - self.branch_files.get(&tenant_id) - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum TimelineIndexEntry { - /// An archive found on the remote storage, but not yet downloaded, only a metadata from its storage path is available, without archive contents. - Description(BTreeMap), - /// Full archive metadata, including the file list, parsed from the archive header. - Full(RemoteTimeline), -} - -impl TimelineIndexEntry { - pub fn uploaded_checkpoints(&self) -> BTreeSet { - match self { - Self::Description(description) => { - description.keys().map(|archive_id| archive_id.0).collect() - } - Self::Full(remote_timeline) => remote_timeline - .checkpoint_archives - .keys() - .map(|archive_id| archive_id.0) - .collect(), - } - } - - /// Gets latest uploaded checkpoint's disk consisten Lsn for the corresponding timeline. - pub fn disk_consistent_lsn(&self) -> Option { - match self { - Self::Description(description) => { - description.keys().map(|archive_id| archive_id.0).max() - } - Self::Full(remote_timeline) => remote_timeline - .checkpoint_archives - .keys() - .map(|archive_id| archive_id.0) - .max(), - } - } -} - -/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing. -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -pub struct ArchiveId(pub(super) Lsn); - -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] -struct FileId(ArchiveId, ArchiveEntryNumber); - -type ArchiveEntryNumber = usize; - -/// All archives and files in them, representing a certain timeline. -/// Uses file and archive IDs to reference those without ownership issues. -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct RemoteTimeline { - timeline_files: BTreeMap, - checkpoint_archives: BTreeMap, -} - -/// Archive metadata, enough to restore a header with the timeline data. 
-#[derive(Debug, PartialEq, Eq, Clone)] -pub struct CheckpointArchive { - disk_consistent_lsn: Lsn, - metadata_file_size: u64, - files: BTreeSet, - archive_header_size: u64, -} - -impl CheckpointArchive { - pub fn disk_consistent_lsn(&self) -> Lsn { - self.disk_consistent_lsn - } -} - -impl RemoteTimeline { - pub fn empty() -> Self { - Self { - timeline_files: BTreeMap::new(), - checkpoint_archives: BTreeMap::new(), - } - } - - pub fn checkpoints(&self) -> impl Iterator + '_ { - self.checkpoint_archives - .values() - .map(CheckpointArchive::disk_consistent_lsn) - } - - /// Lists all relish files in the given remote timeline. Omits the metadata file. - pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet { - self.timeline_files - .values() - .map(|file_entry| file_entry.subpath.as_path(timeline_dir)) - .collect() - } - - pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool { - self.checkpoint_archives - .contains_key(&ArchiveId(disk_consistent_lsn)) - } - - pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> { - self.checkpoint_archives.get(&archive_id) - } - - /// Restores a header of a certain remote archive from the memory data. - /// Returns the header and its compressed size in the archive, both can be used to uncompress that archive. - pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> { - let archive = self - .checkpoint_archives - .get(&archive_id) - .with_context(|| format!("Archive {:?} not found", archive_id))?; - - let mut header_files = Vec::with_capacity(archive.files.len()); - for (expected_archive_position, archive_file) in archive.files.iter().enumerate() { - let &FileId(archive_id, archive_position) = archive_file; - ensure!( - expected_archive_position == archive_position, - "Archive header is corrupt, file # {} from archive {:?} header is missing", - expected_archive_position, - archive_id, - ); - - let timeline_file = self.timeline_files.get(archive_file).with_context(|| { - format!( - "File with id {:?} not found for archive {:?}", - archive_file, archive_id - ) - })?; - header_files.push(timeline_file.clone()); - } - - Ok(( - ArchiveHeader { - files: header_files, - metadata_file_size: archive.metadata_file_size, - }, - archive.archive_header_size, - )) - } - - /// Updates (creates, if necessary) the data about certain archive contents. - pub fn update_archive_contents( - &mut self, - disk_consistent_lsn: Lsn, - header: ArchiveHeader, - header_size: u64, - ) { - let archive_id = ArchiveId(disk_consistent_lsn); - let mut common_archive_files = BTreeSet::new(); - for (file_index, file_entry) in header.files.into_iter().enumerate() { - let file_id = FileId(archive_id, file_index); - self.timeline_files.insert(file_id, file_entry); - common_archive_files.insert(file_id); - } - - let metadata_file_size = header.metadata_file_size; - self.checkpoint_archives - .entry(archive_id) - .or_insert_with(|| CheckpointArchive { - metadata_file_size, - files: BTreeSet::new(), - archive_header_size: header_size, - disk_consistent_lsn, - }) - .files - .extend(common_archive_files.into_iter()); - } -} - -/// Metadata abput timeline checkpoint archive, parsed from its remote storage path. 
-#[derive(Debug, Clone, PartialEq, Eq)] -pub struct ArchiveDescription { - pub header_size: u64, - pub disk_consistent_lsn: Lsn, - pub archive_name: String, -} - -fn try_parse_index_entry( - index: &mut RemoteTimelineIndex, - conf: &'static PageServerConf, - path: &Path, -) -> anyhow::Result<()> { - let tenants_dir = conf.tenants_path(); - let tenant_id = path - .strip_prefix(&tenants_dir) - .with_context(|| { - format!( - "Path '{}' does not belong to tenants directory '{}'", - path.display(), - tenants_dir.display(), - ) - })? - .iter() - .next() - .with_context(|| format!("Found no tenant id in path '{}'", path.display()))? - .to_string_lossy() - .parse::() - .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?; - - let branches_path = conf.branches_path(&tenant_id); - let timelines_path = conf.timelines_path(&tenant_id); - match ( - RelativePath::new(&branches_path, &path), - path.strip_prefix(&timelines_path), - ) { - (Ok(_), Ok(_)) => bail!( - "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes", - path.display(), - branches_path.display(), - timelines_path.display() - ), - (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry), - (Err(_), Ok(timelines_subpath)) => { - let mut segments = timelines_subpath.iter(); - let timeline_id = segments - .next() - .with_context(|| { - format!( - "{} directory of tenant {} (path '{}') is not an index entry", - TIMELINES_SEGMENT_NAME, - tenant_id, - path.display() - ) - })? - .to_string_lossy() - .parse::() - .with_context(|| { - format!("Failed to parse timeline id from path '{}'", path.display()) - })?; - - let (disk_consistent_lsn, header_size) = - parse_archive_name(path).with_context(|| { - format!( - "Failed to parse archive name out in path '{}'", - path.display() - ) - })?; - - let archive_name = path - .file_name() - .with_context(|| format!("Archive '{}' has no file name", path.display()))? 
- .to_string_lossy() - .to_string(); - - let sync_id = ZTenantTimelineId { - tenant_id, - timeline_id, - }; - let timeline_index_entry = index - .timeline_files - .entry(sync_id) - .or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new())); - match timeline_index_entry { - TimelineIndexEntry::Description(descriptions) => { - descriptions.insert( - ArchiveId(disk_consistent_lsn), - ArchiveDescription { - header_size, - disk_consistent_lsn, - archive_name, - }, - ); - } - TimelineIndexEntry::Full(_) => { - bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id) - } - } - } - (Err(branches_error), Err(timelines_strip_error)) => { - bail!( - "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'", - path.display(), - branches_error, - timelines_strip_error, - ) - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn header_restoration_preserves_file_order() { - let header = ArchiveHeader { - files: vec![ - FileEntry { - size: 5, - subpath: RelativePath("one".to_string()), - }, - FileEntry { - size: 1, - subpath: RelativePath("two".to_string()), - }, - FileEntry { - size: 222, - subpath: RelativePath("zero".to_string()), - }, - ], - metadata_file_size: 5, - }; - - let lsn = Lsn(1); - let mut remote_timeline = RemoteTimeline::empty(); - remote_timeline.update_archive_contents(lsn, header.clone(), 15); - - let (restored_header, _) = remote_timeline - .restore_header(ArchiveId(lsn)) - .expect("Should be able to restore header from a valid remote timeline"); - - assert_eq!( - header, restored_header, - "Header restoration should preserve file order" - ); - } -} diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs deleted file mode 100644 index 0f57d714dd..0000000000 --- a/pageserver/src/remote_storage/storage_sync/upload.rs +++ /dev/null @@ -1,573 +0,0 @@ -//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. -//! Currently, tenant branch files are also uploaded, but this does not appear final. - -use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc}; - -use anyhow::{ensure, Context}; -use futures::{stream::FuturesUnordered, StreamExt}; -use tokio::{fs, sync::RwLock}; -use tracing::{debug, error, warn}; -use zenith_utils::zid::ZTenantId; - -use crate::{ - config::PageServerConf, - remote_storage::{ - storage_sync::{ - compression, - index::{RemoteTimeline, TimelineIndexEntry}, - sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask, - }, - RemoteStorage, ZTenantTimelineId, - }, -}; - -use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint}; - -/// Attempts to compress and upload given checkpoint files. -/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten. -/// -/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely. -/// -/// On an error, bumps the retries count and reschedules the entire task. -/// On success, populates index data with new downloads. 
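The return value of upload_timeline_checkpoint below appears to be Option<bool>, judging from its return sites: None means the checkpoint's Lsn was already uploaded and the task is dropped, Some(false) means the upload failed and was re-queued, Some(true) means success. A hypothetical caller sketch:

use tracing::{debug, warn};

// Hypothetical wrapper; the real sync loop lives elsewhere in this module.
fn report_upload_outcome(outcome: Option<bool>) {
    match outcome {
        None => warn!("checkpoint already present remotely, upload skipped"),
        Some(true) => debug!("checkpoint uploaded and recorded in the index"),
        Some(false) => debug!("checkpoint upload failed, task re-queued"),
    }
}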
-pub(super) async fn upload_timeline_checkpoint< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, - sync_id: ZTenantTimelineId, - new_checkpoint: NewCheckpoint, - retries: u32, -) -> Option { - debug!("Uploading checkpoint for sync id {}", sync_id); - if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to upload missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); - } - let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn(); - - let index = &remote_assets.1; - - let ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - - let index_read = index.read().await; - let remote_timeline = match index_read.timeline_entry(&sync_id) { - None => None, - Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)), - Some(TimelineIndexEntry::Description(_)) => { - debug!("Found timeline description for the given ids, downloading the full index"); - match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await { - Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)), - Err(e) => { - error!("Failed to download full timeline index: {:?}", e); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - return Some(false); - } - } - } - }; - - let already_contains_upload_lsn = remote_timeline - .as_ref() - .map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn)) - .unwrap_or(false); - if already_contains_upload_lsn { - warn!( - "Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.", - new_upload_lsn - ); - return None; - } - - let already_uploaded_files = remote_timeline - .map(|timeline| timeline.stored_files(&timeline_dir)) - .unwrap_or_default(); - drop(index_read); - - match try_upload_checkpoint( - config, - Arc::clone(&remote_assets), - sync_id, - &new_checkpoint, - already_uploaded_files, - ) - .await - { - Ok((archive_header, header_size)) => { - let mut index_write = index.write().await; - match index_write.timeline_entry_mut(&sync_id) { - Some(TimelineIndexEntry::Full(remote_timeline)) => { - remote_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - } - None | Some(TimelineIndexEntry::Description(_)) => { - let mut new_timeline = RemoteTimeline::empty(); - new_timeline.update_archive_contents( - new_checkpoint.metadata.disk_consistent_lsn(), - archive_header, - header_size, - ); - index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline)); - } - } - debug!("Checkpoint uploaded successfully"); - Some(true) - } - Err(e) => { - error!( - "Failed to upload checkpoint: {:?}, requeueing the upload", - e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Upload(new_checkpoint), - )); - Some(false) - } - } -} - -async fn try_upload_checkpoint< - P: Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - remote_assets: Arc<(S, RwLock)>, - sync_id: ZTenantTimelineId, - new_checkpoint: &NewCheckpoint, - files_to_skip: BTreeSet, -) -> anyhow::Result<(ArchiveHeader, u64)> { - let 
ZTenantTimelineId { - tenant_id, - timeline_id, - } = sync_id; - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - - let files_to_upload = new_checkpoint - .layers - .iter() - .filter(|&path_to_upload| { - if files_to_skip.contains(path_to_upload) { - error!( - "Skipping file upload '{}', since it was already uploaded", - path_to_upload.display() - ); - false - } else { - true - } - }) - .collect::>(); - ensure!(!files_to_upload.is_empty(), "No files to upload"); - - compression::archive_files_as_stream( - &timeline_dir, - files_to_upload.into_iter(), - &new_checkpoint.metadata, - move |archive_streamer, archive_name| async move { - let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); - let remote_storage = &remote_assets.0; - remote_storage - .upload( - archive_streamer, - &remote_storage.storage_path(&timeline_dir.join(&archive_name))?, - ) - .await - }, - ) - .await - .map(|(header, header_size, _)| (header, header_size)) -} - -async fn upload_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - config: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(config, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let index_read = index.read().await; - let remote_branches = index_read - .branch_files(tenant_id) - .cloned() - .unwrap_or_default(); - drop(index_read); - - let mut branch_uploads = local_branches - .difference(&remote_branches) - .map(|local_only_branch| async move { - let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id)); - let storage_path = storage.storage_path(&local_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - local_branch_path.display() - ) - })?; - let local_branch_file = fs::OpenOptions::new() - .read(true) - .open(&local_branch_path) - .await - .with_context(|| { - format!( - "Failed to open local branch file {} for reading", - local_branch_path.display() - ) - })?; - storage - .upload(local_branch_file, &storage_path) - .await - .with_context(|| { - format!( - "Failed to upload branch file to the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(local_only_branch) - }) - .collect::>(); - - let mut branch_uploads_failed = false; - while let Some(upload_result) = branch_uploads.next().await { - match upload_result { - Ok(local_only_branch) => index - .write() - .await - .add_branch_file(tenant_id, local_only_branch.clone()), - Err(e) => { - error!("Failed to upload branch file: {:?}", e); - branch_uploads_failed = true; - } - } - } - - ensure!(!branch_uploads_failed, "Failed to upload all branch files"); - - Ok(()) -} - -#[cfg(test)] -mod tests { - use tempfile::tempdir; - use zenith_utils::lsn::Lsn; - - use crate::{ - remote_storage::{ - local_fs::LocalFs, - storage_sync::{ - index::ArchiveId, - test_utils::{ - assert_index_descriptions, create_local_timeline, dummy_metadata, - ensure_correct_timeline_upload, expect_timeline, - }, - }, - }, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[tokio::test] - async fn reupload_timeline() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("reupload_timeline")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let 
index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - )); - let remote_assets = Arc::new((storage, index)); - let index = &remote_assets.1; - - let first_upload_metadata = dummy_metadata(Lsn(0x10)); - let first_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - first_upload_metadata.clone(), - )?; - let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID); - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - first_checkpoint, - ) - .await; - - let uploaded_timeline = expect_timeline(index, sync_id).await; - let uploaded_archives = uploaded_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); - assert_eq!( - uploaded_archives.len(), - 1, - "Only one archive is expected after a first upload" - ); - let first_uploaded_archive = uploaded_archives.first().copied().unwrap(); - assert_eq!( - uploaded_timeline.checkpoints().last(), - Some(first_upload_metadata.disk_consistent_lsn()), - "Metadata that was uploaded, should have its Lsn stored" - ); - assert_eq!( - uploaded_timeline - .archive_data(uploaded_archives.first().copied().unwrap()) - .unwrap() - .disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - uploaded_timeline.stored_files(&local_timeline_path), - vec![local_timeline_path.join("a"), local_timeline_path.join("b")] - .into_iter() - .collect(), - "Should have all files from the first checkpoint" - ); - - let second_upload_metadata = dummy_metadata(Lsn(0x40)); - let second_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["b", "c"], - second_upload_metadata.clone(), - )?; - assert!( - first_upload_metadata.disk_consistent_lsn() - < second_upload_metadata.disk_consistent_lsn() - ); - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - second_checkpoint, - ) - .await; - - let updated_timeline = expect_timeline(index, sync_id).await; - let mut updated_archives = updated_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); - assert_eq!( - updated_archives.len(), - 2, - "Two archives are expected after a successful update of the upload" - ); - updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive); - assert_eq!( - updated_archives.len(), - 1, - "Only one new archive is expected among the uploaded" - ); - let second_uploaded_archive = updated_archives.last().copied().unwrap(); - assert_eq!( - updated_timeline.checkpoints().max(), - Some(second_upload_metadata.disk_consistent_lsn()), - "Metadata that was uploaded, should have its Lsn stored" - ); - assert_eq!( - updated_timeline - .archive_data(second_uploaded_archive) - .unwrap() - .disk_consistent_lsn(), - second_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - updated_timeline.stored_files(&local_timeline_path), - vec![ - local_timeline_path.join("a"), - local_timeline_path.join("b"), - local_timeline_path.join("c"), - ] - .into_iter() - .collect(), - "Should have all files from both checkpoints without duplicates" - ); - - let third_upload_metadata = dummy_metadata(Lsn(0x20)); - let third_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["d"], - third_upload_metadata.clone(), - )?; - assert_ne!( - 
third_upload_metadata.disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn() - ); - assert!( - third_upload_metadata.disk_consistent_lsn() - < second_upload_metadata.disk_consistent_lsn() - ); - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - third_checkpoint, - ) - .await; - - let updated_timeline = expect_timeline(index, sync_id).await; - let mut updated_archives = updated_timeline - .checkpoints() - .map(ArchiveId) - .collect::>(); - assert_eq!( - updated_archives.len(), - 3, - "Three archives are expected after two successful updates of the upload" - ); - updated_archives.retain(|archive_id| { - archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive - }); - assert_eq!( - updated_archives.len(), - 1, - "Only one new archive is expected among the uploaded" - ); - let third_uploaded_archive = updated_archives.last().copied().unwrap(); - assert!( - updated_timeline.checkpoints().max().unwrap() - > third_upload_metadata.disk_consistent_lsn(), - "Should not influence the last lsn by uploading an older checkpoint" - ); - assert_eq!( - updated_timeline - .archive_data(third_uploaded_archive) - .unwrap() - .disk_consistent_lsn(), - third_upload_metadata.disk_consistent_lsn(), - "Uploaded archive should have corresponding Lsn" - ); - assert_eq!( - updated_timeline.stored_files(&local_timeline_path), - vec![ - local_timeline_path.join("a"), - local_timeline_path.join("b"), - local_timeline_path.join("c"), - local_timeline_path.join("d"), - ] - .into_iter() - .collect(), - "Should have all files from three checkpoints without duplicates" - ); - - Ok(()) - } - - #[tokio::test] - async fn reupload_timeline_rejected() -> anyhow::Result<()> { - let repo_harness = RepoHarness::create("reupload_timeline_rejected")?; - let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?; - let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - storage - .list() - .await? 
- .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - )); - let remote_assets = Arc::new((storage, index)); - let storage = &remote_assets.0; - let index = &remote_assets.1; - - let first_upload_metadata = dummy_metadata(Lsn(0x10)); - let first_checkpoint = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["a", "b"], - first_upload_metadata.clone(), - )?; - ensure_correct_timeline_upload( - &repo_harness, - Arc::clone(&remote_assets), - TIMELINE_ID, - first_checkpoint, - ) - .await; - let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths( - repo_harness.conf, - remote_assets - .0 - .list() - .await - .unwrap() - .into_iter() - .map(|storage_path| storage.local_path(&storage_path).unwrap()), - ); - - let normal_upload_metadata = dummy_metadata(Lsn(0x20)); - assert_ne!( - normal_upload_metadata.disk_consistent_lsn(), - first_upload_metadata.disk_consistent_lsn() - ); - - let checkpoint_with_no_files = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &[], - normal_upload_metadata.clone(), - )?; - upload_timeline_checkpoint( - repo_harness.conf, - Arc::clone(&remote_assets), - sync_id, - checkpoint_with_no_files, - 0, - ) - .await; - assert_index_descriptions(index, after_first_uploads.clone()).await; - - let checkpoint_with_uploaded_lsn = create_local_timeline( - &repo_harness, - TIMELINE_ID, - &["something", "new"], - first_upload_metadata.clone(), - )?; - upload_timeline_checkpoint( - repo_harness.conf, - Arc::clone(&remote_assets), - sync_id, - checkpoint_with_uploaded_lsn, - 0, - ) - .await; - assert_index_descriptions(index, after_first_uploads.clone()).await; - - Ok(()) - } -} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6142953a58..0c2fedd7d5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,1133 +1,224 @@ -use crate::relish::*; -use crate::walrecord::MultiXactMember; -use crate::CheckpointConfig; -use anyhow::Result; +use crate::walrecord::NeonWalRecord; +use anyhow::{bail, Result}; +use byteorder::{ByteOrder, BE}; use bytes::Bytes; -use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; -use std::ops::{AddAssign, Deref}; -use std::sync::{Arc, RwLockReadGuard}; +use std::fmt; +use std::ops::{AddAssign, Range}; use std::time::Duration; -use zenith_utils::lsn::{Lsn, RecordLsn}; -use zenith_utils::zid::ZTimelineId; - -/// Block number within a relish. This matches PostgreSQL's BlockNumber type. -pub type BlockNumber = u32; +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] +/// Key used in the Repository kv-store. /// -/// A repository corresponds to one .zenith directory. One repository holds multiple -/// timelines, forked off from the same initial call to 'initdb'. -pub trait Repository: Send + Sync { - fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; - - /// Updates timeline based on the new sync state, received from the remote storage synchronization. - /// See [`crate::remote_storage`] for more details about the synchronization. - fn set_timeline_state( - &self, - timeline_id: ZTimelineId, - new_state: TimelineSyncState, - ) -> Result<()>; - - /// Gets current synchronization state of the timeline. - /// See [`crate::remote_storage`] for more details about the synchronization. - fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option; - - /// Get Timeline handle for given zenith timeline ID. 
- fn get_timeline(&self, timelineid: ZTimelineId) -> Result; - - /// Create a new, empty timeline. The caller is responsible for loading data into it - /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. - fn create_empty_timeline( - &self, - timelineid: ZTimelineId, - initdb_lsn: Lsn, - ) -> Result>; - - /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; - - /// perform one garbage collection iteration, removing old data files from disk. - /// this function is periodically called by gc thread. - /// also it can be explicitly requested through page server api 'do_gc' command. - /// - /// 'timelineid' specifies the timeline to GC, or None for all. - /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before CG - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? - fn gc_iteration( - &self, - timelineid: Option, - horizon: u64, - checkpoint_before_gc: bool, - ) -> Result; - - /// perform one checkpoint iteration, flushing in-memory data on disk. - /// this function is periodically called by checkponter thread. - fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>; +/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs +/// for what we actually store in these fields. +pub struct Key { + pub field1: u8, + pub field2: u32, + pub field3: u32, + pub field4: u32, + pub field5: u8, + pub field6: u32, } -/// A timeline, that belongs to the current repository. -pub enum RepositoryTimeline { - /// Timeline, with its files present locally in pageserver's working directory. - /// Loaded into pageserver's memory and ready to be used. - Local(Arc), - /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. - Remote { - id: ZTimelineId, - /// metadata contents of the latest successfully uploaded checkpoint - disk_consistent_lsn: Lsn, - }, -} +pub const KEY_SIZE: usize = 18; -impl RepositoryTimeline { - pub fn local_timeline(&self) -> Option> { - if let Self::Local(local_timeline) = self { - Some(Arc::clone(local_timeline)) - } else { - None +impl Key { + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
+ /// As long as Neon does not support tablespace (because of lack of access to local file system), + /// we can assume that only some predefined namespace OIDs are used which can fit in u16 + pub fn to_i128(&self) -> i128 { + assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0xf) as i128) << 120) + | (((self.field2 & 0xFFFF) as i128) << 104) + | ((self.field3 as i128) << 72) + | ((self.field4 as i128) << 40) + | ((self.field5 as i128) << 32) + | self.field6 as i128 + } + + pub fn next(&self) -> Key { + self.add(1) + } + + pub fn add(&self, x: u32) -> Key { + let mut key = *self; + + let r = key.field6.overflowing_add(x); + key.field6 = r.0; + if r.1 { + let r = key.field5.overflowing_add(1); + key.field5 = r.0; + if r.1 { + let r = key.field4.overflowing_add(1); + key.field4 = r.0; + if r.1 { + let r = key.field3.overflowing_add(1); + key.field3 = r.0; + if r.1 { + let r = key.field2.overflowing_add(1); + key.field2 = r.0; + if r.1 { + let r = key.field1.overflowing_add(1); + key.field1 = r.0; + assert!(!r.1); + } + } + } + } } + key + } + + pub fn from_slice(b: &[u8]) -> Self { + Key { + field1: b[0], + field2: u32::from_be_bytes(b[1..5].try_into().unwrap()), + field3: u32::from_be_bytes(b[5..9].try_into().unwrap()), + field4: u32::from_be_bytes(b[9..13].try_into().unwrap()), + field5: b[13], + field6: u32::from_be_bytes(b[14..18].try_into().unwrap()), + } + } + + pub fn write_to_byte_slice(&self, buf: &mut [u8]) { + buf[0] = self.field1; + BE::write_u32(&mut buf[1..5], self.field2); + BE::write_u32(&mut buf[5..9], self.field3); + BE::write_u32(&mut buf[9..13], self.field4); + buf[13] = self.field5; + BE::write_u32(&mut buf[14..18], self.field6); } } -/// A state of the timeline synchronization with the remote storage. -/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] -pub enum TimelineSyncState { - /// No further downloads from the remote storage are needed. - /// The timeline state is up-to-date or ahead of the remote storage one, - /// ready to be used in any pageserver operation. - Ready(Lsn), - /// Timeline is scheduled for downloading, but its current local state is not up to date with the remote storage. - /// The timeline is not ready to be used in any pageserver operations, otherwise it might diverge its local state from the remote version, - /// making it impossible to sync it further. - AwaitsDownload(Lsn), - /// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded. - /// Cannot be used in any pageserver operations due to complete absence locally. - CloudOnly(Lsn), - /// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization. - /// Such timelines cannot have their state synchronized further and may not have the data about remote timeline's disk_consistent_lsn, since eviction may happen - /// due to errors before the remote timeline contents is known. 
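A quick sketch of the new Key type's fixed 18-byte big-endian layout and the carry behavior of add(); the field values here are arbitrary, and KEY_SIZE is the constant defined above:

fn key_round_trip_and_carry() {
    let key = Key {
        field1: 0x01,
        field2: 0x0000_0002,
        field3: 0x0000_0003,
        field4: 0x0000_0004,
        field5: 0x05,
        field6: u32::MAX, // chosen to force a carry in add() below
    };

    // write_to_byte_slice()/from_slice() round-trip through the 18-byte form.
    let mut buf = [0u8; KEY_SIZE];
    key.write_to_byte_slice(&mut buf);
    assert_eq!(Key::from_slice(&buf), key);

    // add() treats the key as one wide integer: overflowing field6 carries into field5.
    let next = key.add(1);
    assert_eq!(next.field6, 0);
    assert_eq!(next.field5, 0x06);
}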
- Evicted(Option), +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } } -impl TimelineSyncState { - pub fn remote_disk_consistent_lsn(&self) -> Option { - Some(match self { - TimelineSyncState::Evicted(None) => return None, - TimelineSyncState::Ready(lsn) => lsn, - TimelineSyncState::AwaitsDownload(lsn) => lsn, - TimelineSyncState::CloudOnly(lsn) => lsn, - TimelineSyncState::Evicted(Some(lsn)) => lsn, +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + +impl fmt::Display for Key { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}", + self.field1, self.field2, self.field3, self.field4, self.field5, self.field6 + ) + } +} + +impl Key { + pub const MIN: Key = Key { + field1: u8::MIN, + field2: u32::MIN, + field3: u32::MIN, + field4: u32::MIN, + field5: u8::MIN, + field6: u32::MIN, + }; + pub const MAX: Key = Key { + field1: u8::MAX, + field2: u32::MAX, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + }; + + pub fn from_hex(s: &str) -> Result { + if s.len() != 36 { + bail!("parse error"); + } + Ok(Key { + field1: u8::from_str_radix(&s[0..2], 16)?, + field2: u32::from_str_radix(&s[2..10], 16)?, + field3: u32::from_str_radix(&s[10..18], 16)?, + field4: u32::from_str_radix(&s[18..26], 16)?, + field5: u8::from_str_radix(&s[26..28], 16)?, + field6: u32::from_str_radix(&s[28..36], 16)?, }) - .copied() + } +} + +/// A 'value' stored for a one Key. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum Value { + /// An Image value contains a full copy of the value + Image(Bytes), + /// A WalRecord value contains a WAL record that needs to be + /// replayed get the full value. Replaying the WAL record + /// might need a previous version of the value (if will_init() + /// returns false), or it may be replayed stand-alone (true). + WalRecord(NeonWalRecord), +} + +impl Value { + pub fn is_image(&self) -> bool { + matches!(self, Value::Image(_)) + } + + pub fn will_init(&self) -> bool { + match self { + Value::Image(_) => true, + Value::WalRecord(rec) => rec.will_init(), + } } } /// /// Result of performing GC /// -#[derive(Default)] +#[derive(Default, Serialize)] pub struct GcResult { - pub ondisk_relfiles_total: u64, - pub ondisk_relfiles_needed_by_cutoff: u64, - pub ondisk_relfiles_needed_by_branches: u64, - pub ondisk_relfiles_not_updated: u64, - pub ondisk_relfiles_needed_as_tombstone: u64, - pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. - pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped - - pub ondisk_nonrelfiles_total: u64, - pub ondisk_nonrelfiles_needed_by_cutoff: u64, - pub ondisk_nonrelfiles_needed_by_branches: u64, - pub ondisk_nonrelfiles_not_updated: u64, - pub ondisk_nonrelfiles_needed_as_tombstone: u64, - pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
- pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped + pub layers_total: u64, + pub layers_needed_by_cutoff: u64, + pub layers_needed_by_pitr: u64, + pub layers_needed_by_branches: u64, + pub layers_not_updated: u64, + pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + #[serde(serialize_with = "serialize_duration_as_millis")] pub elapsed: Duration, } +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { - self.ondisk_relfiles_total += other.ondisk_relfiles_total; - self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff; - self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches; - self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated; - self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone; - self.ondisk_relfiles_removed += other.ondisk_relfiles_removed; - self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped; - - self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total; - self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff; - self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches; - self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated; - self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone; - self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed; - self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped; + self.layers_total += other.layers_total; + self.layers_needed_by_pitr += other.layers_needed_by_pitr; + self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; + self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_not_updated += other.layers_not_updated; + self.layers_removed += other.layers_removed; self.elapsed += other.elapsed; } } - -pub trait Timeline: Send + Sync { - //------------------------------------------------------------------------------ - // Public GET functions - //------------------------------------------------------------------------------ - - /// - /// Wait until WAL has been received and processed up to this LSN. - /// - /// You should call this before any of the other get_* or list_* functions. Calling - /// those functions with an LSN that has been processed yet is an error. - /// - fn wait_lsn(&self, lsn: Lsn) -> Result<()>; - - /// Lock and get timeline's GC cuttof - fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard; - - /// Look up given page version. - fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result; - - /// Get size of a relish - fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; - - /// Does relation exist? - fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result; - - /// Get a list of all existing relations - /// Pass RelTag to get relation objects or None to get nonrels. - fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result>; - - /// Get a list of all existing relations in given tablespace and database. 
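The reworked GcResult above is meant to be folded across GC iterations with its AddAssign impl; a minimal accumulation sketch (counter values are invented):

use std::time::Duration;

fn accumulate_gc_results() {
    let mut total = GcResult::default();

    // One iteration's counters...
    let mut iteration = GcResult::default();
    iteration.layers_total = 10;
    iteration.layers_removed = 3;
    iteration.elapsed = Duration::from_millis(250);

    // ...folded into the running total with `+=`.
    total += iteration;
    assert_eq!(total.layers_total, 10);
    assert_eq!(total.layers_removed, 3);
}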
- fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result>; - - /// Get a list of all existing non-relational objects - fn list_nonrels(&self, lsn: Lsn) -> Result>; - - /// Get the ancestor's timeline id - fn get_ancestor_timeline_id(&self) -> Option; - - /// Get the LSN where this branch was created - fn get_ancestor_lsn(&self) -> Lsn; - - //------------------------------------------------------------------------------ - // Public PUT functions, to update the repository with new page versions. - // - // These are called by the WAL receiver to digest WAL records. - //------------------------------------------------------------------------------ - - /// Atomically get both last and prev. - fn get_last_record_rlsn(&self) -> RecordLsn; - - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - fn get_last_record_lsn(&self) -> Lsn; - - fn get_prev_record_lsn(&self) -> Lsn; - - fn get_disk_consistent_lsn(&self) -> Lsn; - - /// Mutate the timeline with a [`TimelineWriter`]. - fn writer<'a>(&'a self) -> Box; - - /// - /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. - fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; - - /// - /// Check that it is valid to request operations with that lsn. - fn check_lsn_is_in_scope( - &self, - lsn: Lsn, - latest_gc_cutoff_lsn: &RwLockReadGuard, - ) -> Result<()>; - - /// Retrieve current logical size of the timeline - /// - /// NOTE: counted incrementally, includes ancestors, - /// doesnt support TwoPhase relishes yet - fn get_current_logical_size(&self) -> usize; - - /// Does the same as get_current_logical_size but counted on demand. - /// Used in tests to ensure that incremental and non incremental variants match. - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result; - - /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline. - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline; -} - -/// Various functions to mutate the timeline. -// TODO Currently, Deref is used to allow easy access to read methods from this trait. -// This is probably considered a bad practice in Rust and should be fixed eventually, -// but will cause large code changes. -pub trait TimelineWriter: Deref { - /// Put a new page version that can be constructed from a WAL record - /// - /// This will implicitly extend the relation, if the page is beyond the - /// current end-of-file. - fn put_wal_record( - &self, - lsn: Lsn, - tag: RelishTag, - blknum: BlockNumber, - rec: ZenithWalRecord, - ) -> Result<()>; - - /// Like put_wal_record, but with ready-made image of the page. - fn put_page_image( - &self, - tag: RelishTag, - blknum: BlockNumber, - lsn: Lsn, - img: Bytes, - ) -> Result<()>; - - /// Truncate relation - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: BlockNumber) -> Result<()>; - - /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records - fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; - - /// Track end of the latest digested WAL record. - /// - /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers. - /// Previous last record LSN is stored alongside the latest and can be read. - fn advance_last_record_lsn(&self, lsn: Lsn); -} - -/// Each update to a page is represented by a ZenithWalRecord. 
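The will_init() method on the deleted ZenithWalRecord below has a direct counterpart on the new Value type added earlier in this file: an Image can always rebuild the page by itself, while a WalRecord can only if the record itself initializes the page. A tiny sketch with a hypothetical helper:

// Hypothetical helper: decide whether WAL redo needs an older page image first.
fn needs_previous_image(value: &Value) -> bool {
    // Value::Image(_) always returns true from will_init();
    // Value::WalRecord(rec) defers to rec.will_init().
    !value.will_init()
}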
It can be a wrapper -/// around a PostgreSQL WAL record, or a custom zenith-specific "record". -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum ZenithWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { xids: Vec }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, -} - -impl ZenithWalRecord { - /// Does replaying this WAL record initialize the page from scratch, or does - /// it need to be applied over the previous image of the page? - pub fn will_init(&self) -> bool { - match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, - - // None of the special zenith record types currently initialize the page - _ => false, - } - } -} - -#[cfg(test)] -pub mod repo_harness { - use bytes::BytesMut; - use std::{fs, path::PathBuf}; - - use crate::{ - config::PageServerConf, - layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME}, - walredo::{WalRedoError, WalRedoManager}, - }; - - use super::*; - use hex_literal::hex; - use zenith_utils::zid::ZTenantId; - - pub const TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const NEW_TIMELINE_ID: ZTimelineId = - ZTimelineId::from_array(hex!("AA223344556677881122334455667788")); - - /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { - let mut buf = BytesMut::new(); - buf.extend_from_slice(s.as_bytes()); - buf.resize(8192, 0); - - buf.freeze() - } - - pub struct RepoHarness { - pub conf: &'static PageServerConf, - pub tenant_id: ZTenantId, - } - - impl RepoHarness { - pub fn create(test_name: &'static str) -> Result { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; - fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?; - - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. 
- let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_id = ZTenantId::generate(); - fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.branches_path(&tenant_id))?; - - Ok(Self { conf, tenant_id }) - } - - pub fn load(&self) -> Box { - let walredo_mgr = Arc::new(TestRedoManager); - - Box::new(LayeredRepository::new( - self.conf, - walredo_mgr, - self.tenant_id, - false, - )) - } - - pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { - self.conf.timeline_path(timeline_id, &self.tenant_id) - } - } - - // Mock WAL redo manager that doesn't do much - struct TestRedoManager; - - impl WalRedoManager for TestRedoManager { - fn request_redo( - &self, - rel: RelishTag, - blknum: BlockNumber, - lsn: Lsn, - base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - let s = format!( - "redo for {} blk {} to get to {}, with {} and {} records", - rel, - blknum, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{}", s); - Ok(TEST_IMG(&s)) - } - } -} - -/// -/// Tests that should work the same with any Repository/Timeline implementation. -/// -#[allow(clippy::bool_assert_comparison)] -#[cfg(test)] -mod tests { - use super::repo_harness::*; - use super::*; - use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; - use std::fs; - - /// Arbitrary relation tag, for testing. - const TESTREL_A_REL_TAG: RelTag = RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }; - const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG); - const TESTREL_B: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1001, - forknum: 0, - }); - - fn assert_current_logical_size(timeline: &Arc, lsn: Lsn) { - let incremental = timeline.get_current_logical_size(); - let non_incremental = timeline - .get_current_logical_size_non_incremental(lsn) - .unwrap(); - assert_eq!(incremental, non_incremental); - } - - static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - - #[test] - fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - // get_timeline() with non-existent timeline id should fail - //repo.get_timeline("11223344556677881122334455667788"); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?; - writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?; - - writer.advance_last_record_lsn(Lsn(0x50)); - - assert_current_logical_size(&tline, Lsn(0x50)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - - // Check page contents at each LSN - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, - TEST_IMG("foo blk 0 at 2") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, - TEST_IMG("foo blk 0 at 3") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, - TEST_IMG("foo blk 1 at 4") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, - TEST_IMG("foo blk 1 at 4") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate last block - writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?; - writer.advance_last_record_lsn(Lsn(0x60)); - assert_current_logical_size(&tline, Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 2); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, - TEST_IMG("foo blk 1 at 4") - ); - - // should still see the truncated block with older LSN - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate to zero length - writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?; - writer.advance_last_record_lsn(Lsn(0x68)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0); - - // Extend from 0 to 2 blocks, leaving a gap - writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?; - writer.advance_last_record_lsn(Lsn(0x70)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, - TEST_IMG("foo blk 1") - ); - - // Extend a lot more, leaving a big gap that spans across segments - // FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500 - /* - tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?; - tline.advance_last_record_lsn(Lsn(0x80)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501); - for blk in 2..1500 { - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, - ZERO_PAGE); - } - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, - TEST_IMG("foo blk 1500")); - */ - - Ok(()) - } - - // Test what happens if we dropped a relation - // and then created it again within the same layer. 
- #[test] - fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.advance_last_record_lsn(Lsn(0x20)); - - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - - // Drop relish - writer.drop_relish(TESTREL_A, Lsn(0x30))?; - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none()); - - // Extend it again - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - writer.advance_last_record_lsn(Lsn(0x40)); - - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1); - - Ok(()) - } - - // Test what happens if we truncated a relation - // so that one of its segments was dropped - // and then extended it again within the same layer. - #[test] - fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - //from storage_layer.rs - const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; - let relsize = RELISH_SEG_SIZE * 2; - - // Create relation with relsize blocks - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - - writer.advance_last_record_lsn(Lsn(0x20)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), - relsize - ); - - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?, - TEST_IMG(&data) - ); - } - - // Truncate relation so that second segment was dropped - // - only leave one page - writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?; - writer.advance_last_record_lsn(Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1); - - for blkno in 0..1 { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, - TEST_IMG(&data) - ); - } - - // should still see all blocks with older LSN - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), - relsize - ); - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, - TEST_IMG(&data) - ); - } - - // Extend relation again. 
- // Add enough blocks to create second segment - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - writer.advance_last_record_lsn(Lsn(0x80)); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), - relsize - ); - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, - TEST_IMG(&data) - ); - } - - Ok(()) - } - - /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's - /// split into multiple 1 GB segments in Postgres. - #[test] - fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { - lsn += 0x10; - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?; - } - writer.advance_last_record_lsn(Lsn(lsn)); - - assert_current_logical_size(&tline, Lsn(lsn)); - - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE + 1 - ); - - // Truncate one block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate another block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - 1 - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate to 1500, and then truncate all the way down to 0, one block at a time - // This tests the behavior at segment boundaries - let mut size: i32 = 3000; - while size >= 0 { - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), size as BlockNumber)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - size as BlockNumber - ); - - size -= 1; - } - assert_current_logical_size(&tline, Lsn(lsn)); - - Ok(()) - } - - /// - /// Test list_rels() function, with branches and dropped relations - /// - #[test] - fn test_list_rels_drop() -> Result<()> { - let repo = RepoHarness::create("test_list_rels_drop")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - const TESTDB: u32 = 111; - - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; - - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that list_rels() lists it after LSN 2, but no before it - assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); - 
- // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - let new_writer = newtline.writer(); - - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - - // Drop it on the branch - new_writer.drop_relish(TESTREL_A, Lsn(0x40))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); - - drop(new_writer); - - // Check that it's no longer listed on the branch after the point where it was dropped - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? - .contains(&TESTREL_A)); - - // Run checkpoint and garbage collection and check that it's still not visible - newtline.checkpoint(CheckpointConfig::Forced)?; - repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; - - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? - .contains(&TESTREL_A)); - - Ok(()) - } - - /// - /// Test branch creation - /// - #[test] - fn test_branch() -> Result<()> { - let repo = RepoHarness::create("test_branch")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; - - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - - // Create another relation - writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?; - - writer.advance_last_record_lsn(Lsn(0x40)); - assert_current_logical_size(&tline, Lsn(0x40)); - - // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - let new_writer = newtline.writer(); - - new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); - - // Check page contents on both branches - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 4") - ); - - assert_eq!( - newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("bar blk 0 at 4") - ); - - assert_eq!( - newtline.get_page_at_lsn(TESTREL_B, 0, Lsn(0x40))?, - TEST_IMG("foobar blk 0 at 2") - ); - - assert_eq!(newtline.get_relish_size(TESTREL_B, Lsn(0x40))?.unwrap(), 1); - - assert_current_logical_size(&tline, Lsn(0x40)); - - Ok(()) - } - - fn make_some_layers(tline: &Arc, start_lsn: Lsn) -> Result<()> { - let mut lsn = start_lsn; - { - let writer = tline.writer(); - // Create a relation on the timeline - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - writer.advance_last_record_lsn(lsn); - } - tline.checkpoint(CheckpointConfig::Forced)?; - { - let writer = tline.writer(); - lsn += 0x10; - 
writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - writer.advance_last_record_lsn(lsn); - } - tline.checkpoint(CheckpointConfig::Forced) - } - - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); - - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; - - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - - // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) - } - } - - Ok(()) - } - - #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; - // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC horizon")); - } - } - - Ok(()) - } - - #[test] - fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")? 
- .load(); - - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; - - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { - Ok(_) => panic!("request for page should have failed"), - Err(err) => assert!(err.to_string().contains("not found at")), - } - Ok(()) - } - - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); - - Ok(()) - } - - #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?; - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - - make_some_layers(&newtline, Lsn(0x60))?; - - // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - - // check that the layer in parent before the branching point is still there - let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id); - - let expected_image_layer_path = tline_dir.join(format!( - "rel_{}_{}_{}_{}_{}_{:016X}_{:016X}", - TESTREL_A_REL_TAG.spcnode, - TESTREL_A_REL_TAG.dbnode, - TESTREL_A_REL_TAG.relnode, - TESTREL_A_REL_TAG.forknum, - 0, // seg is 0 - 0x20, - 0x30, - )); - assert!(fs::metadata(&expected_image_layer_path).is_ok()); - - Ok(()) - } - - #[test] - fn test_read_beyond_eof() -> Result<()> { - let harness = RepoHarness::create("test_read_beyond_eof")?; - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_A, - 0, - Lsn(0x60), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x50))), - )?; - writer.advance_last_record_lsn(Lsn(0x60)); - } - - // Test read before rel creation. Should error out. - assert!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); - - // Read block beyond end of relation at different points in time. - // These reads should fall into different delta, image, and in-memory layers. 
- assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); - - // Test on an in-memory layer with no preceding layer - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_B, - 0, - Lsn(0x70), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), - )?; - writer.advance_last_record_lsn(Lsn(0x70)); - } - assert_eq!(tline.get_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); - - Ok(()) - } -} diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs new file mode 100644 index 0000000000..037fe76d7f --- /dev/null +++ b/pageserver/src/storage_sync.rs @@ -0,0 +1,1907 @@ +//! There are a few components the storage machinery consists of: +//! +//! * [`RemoteStorage`] that is used to interact with an arbitrary external storage +//! +//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync. +//! Synchronization internals are split into submodules +//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files +//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively +//! +//! * public API via to interact with the external world: +//! * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization +//! * [`schedule_layer_upload`], [`schedule_layer_download`], and[`schedule_layer_delete`] to enqueue a new task +//! to be processed by the async loop +//! +//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform: +//! +//! +------------------------+ +--------->-------+ +//! | | - - - (init async loop) - - - -> | | +//! | | | | +//! | | -------------------------------> | async | +//! | pageserver | (enqueue timeline sync task) | upload/download | +//! | | | loop | +//! | | <------------------------------- | | +//! | | (apply new timeline sync states) | | +//! +------------------------+ +---------<-------+ +//! | +//! | +//! CRUD layer file operations | +//! (upload/download/delete/list, etc.) | +//! V +//! +------------------------+ +//! | | +//! | [`RemoteStorage`] impl | +//! | | +//! | pageserver assumes it | +//! | owns exclusive write | +//! | access to this storage | +//! +------------------------+ +//! +//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so. +//! The loop inits the storage connection and checks the remote files stored. +//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server). +//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can +//! 
query their downloads later if they are accessed. +//! +//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata. +//! If the storage sync loop was successfully started before, pageserver schedules the layer files and the updated metadata file for upload every time a layer is flushed to disk. +//! Uploads are disabled if no remote storage configuration is provided (no sync loop is started this way either). +//! See [`crate::tenant`] for the upload calls and the adjacent logic. +//! +//! The synchronization logic communicates updated timeline sync states back to pageserver, submitted via the [`crate::tenant_mgr::attach_local_tenants`] function. +//! The tenant manager applies the corresponding timeline updates to pageserver's in-memory state. +//! Such submissions happen in two cases: +//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future +//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory +//! +//! When the pageserver terminates, the sync loop finishes the current sync task (if any) and exits. +//! +//! The storage logic considers an `image` to be a set of local files (layers), fully representing a certain timeline at a given moment (identified by the `disk_consistent_lsn` from the corresponding `metadata` file). +//! A timeline can change its state by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed +//! by the storage upload, if enabled. +//! Yet a timeline cannot alter or remove already existing files: only a GC process is capable of removing unused files. +//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable": +//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state +//! * no files are deleted from either local or remote storage; only the missing ones get downloaded/uploaded, and the local metadata file is overwritten +//! when a newer image is downloaded +//! +//! Pageserver maintains a remote file structure similar to the local one: all layer files are uploaded with the same names under the same directory structure. +//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. +//! This file gets read to populate the cache, if the remote timeline data is missing from it, and gets updated after every successful download. +//! This way, we optimize S3 storage access by not running the `S3 list` command, which could be expensive and slow: knowing both [`TenantId`] and [`TimelineId`], +//! we can always reconstruct the path to the timeline, use it to get the same path on the remote storage, and retrieve its stored contents, if needed, same as for any layer files. +//! +//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. +//! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and is loaded on demand only, +//! when a new timeline is scheduled for download. +//! +//! NOTES: +//! * pageserver assumes it has exclusive write access to the remote storage. Multiple pageservers can, if supported, be separated in the same storage +//! (i.e. using different directories in the local filesystem external storage), but this is entirely up to the storage implementation and not covered by the trait API. +//! +//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast. +//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time. +//! +//! The synchronization logic for the [`RemoteStorage`] and pageserver in-memory state ensures correct synchronization +//! between local tenant files and their counterparts in the remote storage. +//! +//! The synchronization does not aim to be immediate, but it is eventually consistent. +//! Synchronization is done by emptying the queue on a separate thread asynchronously, +//! attempting to fully store pageserver's local data on the remote storage in a custom format, beneficial for storing. +//! +//! A queue is implemented in the [`sync_queue`] module as a VecDeque to hold the tasks, with a condition variable for blocking when the queue is empty. +//! +//! The queue gets emptied by a single thread running a loop that polls the tasks in batches of deduplicated tasks. +//! A task from the batch corresponds to a single timeline, with its files to sync merged together: given that only one sync loop step is active at a time, +//! timeline uploads and downloads can happen concurrently, in no particular order, due to the incremental nature of the timeline layers. +//! Deletion happens only after a successful upload, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. +//! Upload and download update the remote data (in-memory index and S3 JSON index part file) only after every layer is successfully synchronized, while the deletion task +//! does the opposite: it requires the remote data to be updated first successfully, so that the blob files become invisible to pageserver. +//! +//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed by downloading and merging the index data for all timelines +//! present locally. +//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has exclusive +//! write access to the remote portion of timelines that are attached to the pageserver. +//! The index state is used to issue initial sync tasks, if needed: +//! * all timelines with local state behind the remote get download tasks scheduled. +//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable +//! before up-to-date layers and the metadata file are downloaded locally. +//! * all newer local state gets scheduled for upload; such timelines are "local" and fully operational +//! * remote timelines not present locally are unknown to pageserver, but can be downloaded on a separate request +//! +//! Then, the index is shared across pageserver under the [`RemoteIndex`] guard to ensure proper synchronization. +//! The remote index gets updated after every remote storage change (after an upload), same as the index part files remotely. +//! +//! A remote timeline contains a set of layer files, created during checkpoint(s), and the serialized [`IndexPart`] file with timeline metadata and all remote layer paths inside. +//! 
Those paths are used instead of the `S3 list` command to avoid its slowness and expensiveness for a large number of files. +//! If the index part does not contain some file path but it's present remotely, such a file is invisible to pageserver and ignored. +//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand; refer to [`index`] for more details. +//! +//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user. +//! New sync tasks are accepted via the [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] functions, +//! regardless of whether the corresponding loop was started. +//! It's up to the caller to avoid synchronizations if the loop is disabled: otherwise, the sync tasks will be ignored. +//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather +//! reschedule the same task, with possibly fewer files to sync: +//! * download tasks currently never replace an existing local file, with the metadata file as an exception +//! (but this is subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch) +//! * download tasks carry the information about skipped archives, so resubmissions do not download successfully processed layers again +//! * downloads do not contain any actual files to download, so that "external" pageserver code is able to schedule the timeline download +//! without accessing any extra information about its files. +//! +//! Uploads and downloads sync layer files in arbitrary order, but only after all layer files are synced are the local metadata (for download) and the remote index part (for upload) updated, +//! to avoid having a corrupt state without the relevant layer files. +//! Refer to [`upload`] and [`download`] for more details. +//! +//! Synchronization never removes any local files from the pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes). +//! NOTE: No real contents or checksum check happens right now; this is a subject for later improvement. +//! +//! After the whole timeline is downloaded, the [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update pageserver memory state for the timeline processed. 
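For orientation, here is a minimal illustrative sketch (not part of this change) of how pageserver code could enqueue sync work after flushing a checkpoint, using the scheduling functions documented above. The generic parameters of the collections (HashMap<PathBuf, LayerFileMetadata>, Option<TimelineMetadata>) are inferred from this module, and the function `after_checkpoint_sketch` and its arguments are hypothetical; the real upload call sites live in `crate::tenant`.

use std::collections::HashMap;
use std::path::PathBuf;

use crate::storage_sync::index::LayerFileMetadata;
use crate::storage_sync::{schedule_layer_download, schedule_layer_upload};
use crate::tenant::metadata::TimelineMetadata;
use utils::id::{TenantId, TimelineId};

// Hypothetical caller: enqueue sync work after a checkpoint flush.
fn after_checkpoint_sketch(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    new_layer: PathBuf,
    new_layer_metadata: LayerFileMetadata,
    metadata: TimelineMetadata,
) {
    // Register the freshly flushed layer (and the updated metadata) for upload.
    // If the sync loop was never started, the scheduling function only logs a warning.
    let mut layers_to_upload = HashMap::new();
    layers_to_upload.insert(new_layer, new_layer_metadata);
    schedule_layer_upload(tenant_id, timeline_id, layers_to_upload, Some(metadata));

    // A download task deliberately carries no file list: the sync loop consults the
    // remote index itself and skips layers that are already present locally.
    schedule_layer_download(tenant_id, timeline_id);
}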
+ +mod delete; +mod download; +pub mod index; +mod upload; + +use std::{ + collections::{hash_map, HashMap, HashSet, VecDeque}, + fmt::Debug, + num::{NonZeroU32, NonZeroUsize}, + ops::ControlFlow, + path::{Path, PathBuf}, + sync::{Condvar, Mutex}, +}; + +use anyhow::{anyhow, bail, Context}; +use futures::stream::{FuturesUnordered, StreamExt}; +use once_cell::sync::OnceCell; +use remote_storage::GenericRemoteStorage; +use tokio::{ + fs, + time::{Duration, Instant}, +}; +use tracing::*; + +use self::{ + delete::delete_timeline_layers, + download::{download_timeline_layers, DownloadedTimeline}, + index::{IndexPart, RemoteTimeline, RemoteTimelineIndex}, + upload::{upload_index_part, upload_timeline_layers, UploadedTimeline}, +}; +use crate::{ + config::PageServerConf, + exponential_backoff, + storage_sync::index::{LayerFileMetadata, RemoteIndex}, + task_mgr, + task_mgr::TaskKind, + task_mgr::BACKGROUND_RUNTIME, + tenant::metadata::TimelineMetadata, + tenant_mgr::{attach_local_tenants, TenantAttachData}, +}; +use crate::{ + metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD}, + TenantTimelineValues, +}; + +use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM}; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; + +use self::download::download_index_parts; +pub use self::download::gather_tenant_timelines_index_parts; + +static SYNC_QUEUE: OnceCell = OnceCell::new(); + +/// A timeline status to share with pageserver's sync counterpart, +/// after comparing local and remote timeline state. +#[derive(Clone, PartialEq, Eq)] +pub enum LocalTimelineInitStatus { + /// The timeline has every remote layer present locally. + /// There could be some layers requiring uploading, + /// but this does not block the timeline from any user interaction. + LocallyComplete(TimelineMetadata), + /// A timeline has some files remotely, that are not present locally and need downloading. + /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only, + /// so the data needs to be downloaded first before the timeline can be used. + NeedsSync, +} + +impl std::fmt::Debug for LocalTimelineInitStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LocallyComplete(_) => write!(f, "LocallyComplete"), + Self::NeedsSync => write!(f, "NeedsSync"), + } + } +} + +/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization. +/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still, +/// to simplify the received code. +pub struct SyncStartupData { + pub remote_index: RemoteIndex, + pub local_timeline_init_statuses: TenantTimelineValues, +} + +/// Global queue of sync tasks. +/// +/// 'queue' is protected by a mutex, and 'condvar' is used to wait for tasks to arrive. 
+struct SyncQueue { + max_timelines_per_batch: NonZeroUsize, + + queue: Mutex>, + condvar: Condvar, +} + +impl SyncQueue { + fn new(max_timelines_per_batch: NonZeroUsize) -> Self { + Self { + max_timelines_per_batch, + queue: Mutex::new(VecDeque::new()), + condvar: Condvar::new(), + } + } + + /// Queue a new task + fn push(&self, sync_id: TenantTimelineId, new_task: SyncTask) { + let mut q = self.queue.lock().unwrap(); + + q.push_back((sync_id, new_task)); + if q.len() <= 1 { + self.condvar.notify_one(); + } + } + + /// Fetches a task batch, getting every existing entry from the queue, grouping by timelines and merging the tasks for every timeline. + /// A timeline has to care to not to delete certain layers from the remote storage before the corresponding uploads happen. + /// Other than that, due to "immutable" nature of the layers, the order of their deletion/uploading/downloading does not matter. + /// Hence, we merge the layers together into single task per timeline and run those concurrently (with the deletion happening only after successful uploading). + fn next_task_batch(&self) -> (HashMap, usize) { + // Wait for the first task in blocking fashion + let mut q = self.queue.lock().unwrap(); + while q.is_empty() { + q = self + .condvar + .wait_timeout(q, Duration::from_millis(1000)) + .unwrap() + .0; + + if task_mgr::is_shutdown_requested() { + return (HashMap::new(), q.len()); + } + } + let (first_sync_id, first_task) = q.pop_front().unwrap(); + + let mut timelines_left_to_batch = self.max_timelines_per_batch.get() - 1; + let tasks_to_process = q.len(); + + let mut batches = HashMap::with_capacity(tasks_to_process); + batches.insert(first_sync_id, SyncTaskBatch::new(first_task)); + + let mut tasks_to_reenqueue = Vec::with_capacity(tasks_to_process); + + // Greedily grab as many other tasks that we can. + // Yet do not put all timelines in the batch, but only the first ones that fit the timeline limit. + // Re-enqueue the tasks that don't fit in this batch. + while let Some((sync_id, new_task)) = q.pop_front() { + match batches.entry(sync_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().add(new_task), + hash_map::Entry::Vacant(v) => { + timelines_left_to_batch = timelines_left_to_batch.saturating_sub(1); + if timelines_left_to_batch == 0 { + tasks_to_reenqueue.push((sync_id, new_task)); + } else { + v.insert(SyncTaskBatch::new(new_task)); + } + } + } + } + + debug!( + "Batched {} timelines, reenqueuing {}", + batches.len(), + tasks_to_reenqueue.len() + ); + for (id, task) in tasks_to_reenqueue { + q.push_back((id, task)); + } + + (batches, q.len()) + } + + #[cfg(test)] + fn len(&self) -> usize { + self.queue.lock().unwrap().len() + } +} + +/// A task to run in the async download/upload loop. +/// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled. +#[derive(Debug, Clone, PartialEq, Eq)] +enum SyncTask { + /// A checkpoint outcome with possible local file updates that need actualization in the remote storage. + /// Not necessary more fresh than the one already uploaded. + Download(SyncData), + /// A certain amount of image files to download. + Upload(SyncData), + /// Delete remote files. + Delete(SyncData), +} + +/// Stores the data to synd and its retries, to evict the tasks failing to frequently. 
+#[derive(Debug, Clone, PartialEq, Eq)] +struct SyncData { + retries: u32, + data: T, +} + +impl SyncData { + fn new(retries: u32, data: T) -> Self { + Self { retries, data } + } +} + +impl SyncTask { + fn download(download_task: LayersDownload) -> Self { + Self::Download(SyncData::new(0, download_task)) + } + + fn upload(upload_task: LayersUpload) -> Self { + Self::Upload(SyncData::new(0, upload_task)) + } + + fn delete(delete_task: LayersDeletion) -> Self { + Self::Delete(SyncData::new(0, delete_task)) + } +} + +#[derive(Debug, Default, PartialEq, Eq)] +struct SyncTaskBatch { + upload: Option>, + download: Option>, + delete: Option>, +} + +impl SyncTaskBatch { + fn new(task: SyncTask) -> Self { + let mut new_self = Self::default(); + new_self.add(task); + new_self + } + + fn add(&mut self, task: SyncTask) { + match task { + SyncTask::Download(new_download) => match &mut self.download { + Some(batch_download) => { + batch_download.retries = batch_download.retries.min(new_download.retries); + batch_download + .data + .layers_to_skip + .extend(new_download.data.layers_to_skip.into_iter()); + } + None => self.download = Some(new_download), + }, + SyncTask::Upload(new_upload) => match &mut self.upload { + Some(batch_upload) => { + batch_upload.retries = batch_upload.retries.min(new_upload.retries); + + let batch_data = &mut batch_upload.data; + let new_data = new_upload.data; + batch_data + .layers_to_upload + .extend(new_data.layers_to_upload.into_iter()); + batch_data + .uploaded_layers + .extend(new_data.uploaded_layers.into_iter()); + if batch_data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + <= new_data + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()) + { + batch_data.metadata = new_data.metadata; + } + } + None => self.upload = Some(new_upload), + }, + SyncTask::Delete(new_delete) => match &mut self.delete { + Some(batch_delete) => { + batch_delete.retries = batch_delete.retries.min(new_delete.retries); + // Need to reregister deletions, but it's ok to register already deleted files once again, they will be skipped. + batch_delete.data.deletion_registered = batch_delete + .data + .deletion_registered + .min(new_delete.data.deletion_registered); + + // Do not download and upload the layers getting removed in the same batch + if let Some(batch_download) = &mut self.download { + batch_download + .data + .layers_to_skip + .extend(new_delete.data.layers_to_delete.iter().cloned()); + batch_download + .data + .layers_to_skip + .extend(new_delete.data.deleted_layers.iter().cloned()); + } + if let Some(batch_upload) = &mut self.upload { + let not_deleted = |layer: &PathBuf, _: &mut LayerFileMetadata| { + !new_delete.data.layers_to_delete.contains(layer) + && !new_delete.data.deleted_layers.contains(layer) + }; + batch_upload.data.layers_to_upload.retain(not_deleted); + batch_upload.data.uploaded_layers.retain(not_deleted); + } + + batch_delete + .data + .layers_to_delete + .extend(new_delete.data.layers_to_delete.into_iter()); + batch_delete + .data + .deleted_layers + .extend(new_delete.data.deleted_layers.into_iter()); + } + None => self.delete = Some(new_delete), + }, + } + } +} + +/// Local timeline files for upload, appeared after the new checkpoint. +/// Current checkpoint design assumes new files are added only, no deletions or amendment happens. +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersUpload { + /// Layer file path in the pageserver workdir, that were added for the corresponding checkpoint. 
+ layers_to_upload: HashMap, + /// Already uploaded layers. Used to store the data about the uploads between task retries + /// and to record the data into the remote index after the task got completed or evicted. + uploaded_layers: HashMap, + metadata: Option, +} + +/// A timeline download task. +/// Does not contain the file list to download, to allow other +/// parts of the pageserer code to schedule the task +/// without using the remote index or any other ways to list the remote timeline files. +/// Skips the files that are already downloaded. +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersDownload { + layers_to_skip: HashSet, + + /// Paths which have been downloaded, and had their metadata verified or generated. + /// + /// Metadata generation happens when upgrading from past version of `IndexPart`. + gathered_metadata: HashMap, +} + +impl LayersDownload { + fn from_skipped_layers(layers_to_skip: HashSet) -> Self { + LayersDownload { + layers_to_skip, + gathered_metadata: HashMap::default(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct LayersDeletion { + layers_to_delete: HashSet, + deleted_layers: HashSet, + /// Pageserver uses [`IndexPart`] as a source of truth for listing the files per timeline. + /// This object gets serialized and placed into the remote storage. + /// So if we manage to update pageserver's [`RemoteIndex`] and update the index part on the remote storage, + /// the corresponding files on S3 won't exist for pageserver albeit being physically present on that remote storage still. + /// Then all that's left is to remove the files from the remote storage, without concerns about consistency. + deletion_registered: bool, +} + +/// Adds the new checkpoint files as an upload sync task to the queue. +/// On task failure, it gets retried again from the start a number of times. +/// +/// Ensure that the loop is started otherwise the task is never processed. +pub fn schedule_layer_upload( + tenant_id: TenantId, + timeline_id: TimelineId, + layers_to_upload: HashMap, + metadata: Option, +) { + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send an upload task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + TenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::upload(LayersUpload { + layers_to_upload, + uploaded_layers: HashMap::new(), + metadata, + }), + ); + debug!("Upload task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Adds the new files to delete as a deletion task to the queue. +/// On task failure, it gets retried again from the start a number of times. +/// +/// Ensure that the loop is started otherwise the task is never processed. +pub fn schedule_layer_delete( + tenant_id: TenantId, + timeline_id: TimelineId, + layers_to_delete: HashSet, +) { + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send deletion task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + TenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::delete(LayersDeletion { + layers_to_delete, + deleted_layers: HashSet::new(), + deletion_registered: false, + }), + ); + debug!("Deletion task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Requests the download of the entire timeline for a given tenant. +/// No existing local files are currently overwritten, except the metadata file (if its disk_consistent_lsn is less than the downloaded one). 
+/// The metadata file is always updated last, to avoid inconsistencies. +/// +/// On any failure, the task gets retried, omitting already downloaded layers. +/// +/// Ensure that the loop is started otherwise the task is never processed. +pub fn schedule_layer_download(tenant_id: TenantId, timeline_id: TimelineId) { + debug!("Scheduling layer download for tenant {tenant_id}, timeline {timeline_id}"); + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => { + warn!("Could not send download task for tenant {tenant_id}, timeline {timeline_id}"); + return; + } + }; + sync_queue.push( + TenantTimelineId { + tenant_id, + timeline_id, + }, + SyncTask::download(LayersDownload::from_skipped_layers(HashSet::new())), + ); + debug!("Download task for tenant {tenant_id}, timeline {timeline_id} sent") +} + +/// Local existing timeline files +/// +/// Values of this type serve different meanings in different contexts. On startup, collected +/// timelines come with the full collected information and when signalling readyness to attach +/// after completed download. After the download the file information is no longer carried, because +/// it is already merged into [`RemoteTimeline`]. +#[derive(Debug)] +pub struct TimelineLocalFiles(TimelineMetadata, HashMap); + +impl TimelineLocalFiles { + pub fn metadata(&self) -> &TimelineMetadata { + &self.0 + } + + /// Called during startup, for all of the local files with full metadata. + pub(crate) fn collected( + metadata: TimelineMetadata, + timeline_files: HashMap, + ) -> TimelineLocalFiles { + TimelineLocalFiles(metadata, timeline_files) + } + + /// Called near the end of tenant initialization, to signal readyness to attach tenants. + pub(crate) fn ready(metadata: TimelineMetadata) -> Self { + TimelineLocalFiles(metadata, HashMap::new()) + } +} + +/// Launch a thread to perform remote storage sync tasks. +/// See module docs for loop step description. +pub fn spawn_storage_sync_task( + conf: &'static PageServerConf, + local_timeline_files: HashMap>, + storage: GenericRemoteStorage, + max_concurrent_timelines_sync: NonZeroUsize, + max_sync_errors: NonZeroU32, +) -> anyhow::Result { + let sync_queue = SyncQueue::new(max_concurrent_timelines_sync); + SYNC_QUEUE + .set(sync_queue) + .map_err(|_queue| anyhow!("Could not initialize sync queue"))?; + let sync_queue = match SYNC_QUEUE.get() { + Some(queue) => queue, + None => bail!("Could not get sync queue during the sync loop step, aborting"), + }; + + // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time: + // * we need to list every timeline for tenant on S3, that might be a costly operation + // * we need to download every timeline for the tenant, to activate it in memory + // + // When on-demand download gets merged, we're able to do this fast by storing timeline metadata only. 
+ let mut empty_tenants = TenantTimelineValues::::new(); + let mut keys_for_index_part_downloads = HashSet::new(); + let mut timelines_to_sync = HashMap::new(); + + for (tenant_id, timeline_data) in local_timeline_files { + if timeline_data.is_empty() { + info!("got empty tenant {}", tenant_id); + let _ = empty_tenants.0.entry(tenant_id).or_default(); + } else { + for (timeline_id, timeline_data) in timeline_data { + let id = TenantTimelineId::new(tenant_id, timeline_id); + keys_for_index_part_downloads.insert(id); + timelines_to_sync.insert(id, timeline_data); + } + } + } + + let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts( + conf, + &storage, + keys_for_index_part_downloads, + )); + + let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?; + + let mut local_timeline_init_statuses = schedule_first_sync_tasks( + &mut BACKGROUND_RUNTIME.block_on(remote_index.write()), + sync_queue, + timelines_to_sync, + ); + local_timeline_init_statuses + .0 + .extend(empty_tenants.0.into_iter()); + + let remote_index_clone = remote_index.clone(); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::StorageSync, + None, + None, + "Remote storage sync task", + false, + async move { + storage_sync_loop( + conf, + (storage, remote_index_clone, sync_queue), + max_sync_errors, + ) + .instrument(info_span!("storage_sync_loop")) + .await; + Ok(()) + }, + ); + Ok(SyncStartupData { + remote_index, + local_timeline_init_statuses, + }) +} + +async fn storage_sync_loop( + conf: &'static PageServerConf, + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), + max_sync_errors: NonZeroU32, +) { + info!("Starting remote storage sync loop"); + loop { + let loop_storage = storage.clone(); + + let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch(); + + if task_mgr::is_shutdown_requested() { + info!("Shutdown requested, stopping"); + break; + } + + REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); + if remaining_queue_length > 0 || !batched_tasks.is_empty() { + debug!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + } else { + debug!("No tasks to process"); + continue; + } + + // Concurrently perform all the tasks in the batch + let loop_step = tokio::select! { + step = process_batches( + conf, + max_sync_errors, + loop_storage, + &index, + batched_tasks, + sync_queue, + ) + .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step) + , + _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()), + }; + + match loop_step { + ControlFlow::Continue(updated_tenants) => { + if updated_tenants.is_empty() { + debug!("Sync loop step completed, no new tenant states"); + } else { + info!( + "Sync loop step completed, {} new tenant state update(s)", + updated_tenants.len() + ); + let mut timelines_to_attach = HashMap::new(); + let index_accessor = index.read().await; + for tenant_id in updated_tenants { + let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { + Some(tenant_entry) => tenant_entry, + None => { + error!( + "cannot find tenant in remote index for timeline sync update" + ); + continue; + } + }; + + if tenant_entry.has_in_progress_downloads() { + info!("Tenant {tenant_id} has pending timeline downloads, skipping tenant registration"); + continue; + } else { + info!( + "Tenant {tenant_id} download completed. 
Picking to register in tenant" + ); + // Here we assume that if tenant has no in-progress downloads that + // means that it is the last completed timeline download that triggered + // sync status update. So we look at the index for available timelines + // and register them all at once in a tenant for download + // to be submitted in a single operation to tenant + // so it can apply them at once to internal timeline map. + timelines_to_attach.insert( + tenant_id, + TenantAttachData::Ready( + tenant_entry + .iter() + .map(|(&id, entry)| { + (id, TimelineLocalFiles::ready(entry.metadata.clone())) + }) + .collect(), + ), + ); + } + } + drop(index_accessor); + // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. + attach_local_tenants(conf, &index, timelines_to_attach); + } + } + ControlFlow::Break(()) => { + info!("Shutdown requested, stopping"); + break; + } + } + } +} + +#[derive(Debug)] +enum DownloadStatus { + Downloaded, + Nothing, +} + +#[derive(Debug)] +enum UploadStatus { + Uploaded, + Failed(anyhow::Error), + Nothing, +} + +async fn process_batches( + conf: &'static PageServerConf, + max_sync_errors: NonZeroU32, + storage: GenericRemoteStorage, + index: &RemoteIndex, + batched_tasks: HashMap, + sync_queue: &SyncQueue, +) -> HashSet { + let mut sync_results = batched_tasks + .into_iter() + .map(|(sync_id, batch)| { + let storage = storage.clone(); + let index = index.clone(); + async move { + let state_update = process_sync_task_batch( + conf, + (storage, index, sync_queue), + max_sync_errors, + sync_id, + batch, + ) + .instrument(info_span!("process_sync_task_batch", sync_id = %sync_id)) + .await; + (sync_id, state_update) + } + }) + .collect::>(); + + let mut downloaded_timelines = HashSet::new(); + + while let Some((sync_id, download_marker)) = sync_results.next().await { + debug!( + "Finished storage sync task for sync id {sync_id} download marker {:?}", + download_marker + ); + if matches!(download_marker, DownloadStatus::Downloaded) { + downloaded_timelines.insert(sync_id.tenant_id); + } + } + + downloaded_timelines +} + +async fn process_sync_task_batch( + conf: &'static PageServerConf, + (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue), + max_sync_errors: NonZeroU32, + sync_id: TenantTimelineId, + batch: SyncTaskBatch, +) -> DownloadStatus { + let sync_start = Instant::now(); + let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() }; + + let upload_data = batch.upload.clone(); + let download_data = batch.download.clone(); + // Run both upload and download tasks concurrently (not in parallel): + // download and upload tasks do not conflict and spoil the pageserver state even if they are executed in parallel. + // Under "spoiling" here means potentially inconsistent layer set that misses some of the layers, declared present + // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. + // When operating in a system without tasks failing over the error threshold, + // current batching and task processing systems aim to update the layer set and metadata files (remote and local), + // without "losing" such layer files. 
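+    //
+    // A hedged illustration of the "concurrently (not in parallel)" point above:
+    // `tokio::join!` polls both futures on the *current* task, so the upload and
+    // download branches interleave at their `.await` points instead of running on
+    // two worker threads, e.g. (hypothetical helper names, illustration only):
+    //
+    //     let (upload_result, download_result) = tokio::join!(
+    //         async { do_upload().await },
+    //         async { do_download().await },
+    //     );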
+ let (upload_status, download_status) = tokio::join!( + async { + if let Some(upload_data) = upload_data { + let upload_retries = upload_data.retries; + match validate_task_retries(upload_retries, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(()) => { + upload_timeline_data( + conf, + (&storage, &index, sync_queue), + current_remote_timeline.as_ref(), + sync_id, + upload_data, + sync_start, + ) + .await + } + ControlFlow::Break(()) => match update_remote_data( + conf, + &storage, + &index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: upload_data.data, + upload_failed: true, + }, + ) + .await + { + Ok(()) => UploadStatus::Failed(anyhow::anyhow!( + "Aborted after retries validation, current retries: {upload_retries}, max retries allowed: {max_sync_errors}" + )), + Err(e) => { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + UploadStatus::Failed(e) + } + }, + } + } else { + UploadStatus::Nothing + } + } + .instrument(info_span!("upload_timeline_data")), + async { + if let Some(download_data) = download_data { + match validate_task_retries(download_data.retries, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(()) => { + return download_timeline_data( + conf, + (&storage, &index, sync_queue), + current_remote_timeline.as_ref(), + sync_id, + download_data, + sync_start, + ) + .await; + } + ControlFlow::Break(()) => { + index + .write() + .await + .set_awaits_download(&sync_id, false) + .ok(); + } + } + } + DownloadStatus::Nothing + } + .instrument(info_span!("download_timeline_data")), + ); + + if let Some(delete_data) = batch.delete { + match upload_status { + UploadStatus::Uploaded | UploadStatus::Nothing => { + match validate_task_retries(delete_data.retries, max_sync_errors) + .instrument(info_span!("retries_validation")) + .await + { + ControlFlow::Continue(()) => { + delete_timeline_data( + conf, + (&storage, &index, sync_queue), + sync_id, + delete_data, + sync_start, + ) + .instrument(info_span!("delete_timeline_data")) + .await; + } + ControlFlow::Break(()) => { + if let Err(e) = update_remote_data( + conf, + &storage, + &index, + sync_id, + RemoteDataUpdate::Delete(&delete_data.data.deleted_layers), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + } + } + } + } + UploadStatus::Failed(e) => { + warn!("Skipping delete task due to failed upload tasks, reenqueuing. Upload data: {:?}, delete data: {delete_data:?}. 
Upload failure: {e:#}", batch.upload); + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } + } + } + + download_status +} + +async fn download_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: TenantTimelineId, + new_download_data: SyncData, + sync_start: Instant, +) -> DownloadStatus { + static TASK_NAME: &str = "download"; + + match download_timeline_layers( + conf, + storage, + sync_queue, + current_remote_timeline, + sync_id, + new_download_data, + ) + .await + { + DownloadedTimeline::Abort => { + register_sync_status(sync_id, sync_start, TASK_NAME, None); + if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { + error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); + } + } + DownloadedTimeline::FailedAndRescheduled => { + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); + } + DownloadedTimeline::Successful(mut download_data) => { + match update_local_metadata(conf, sync_id, current_remote_timeline).await { + Ok(()) => { + let mut g = index.write().await; + + match g.set_awaits_download(&sync_id, false) { + Ok(()) => { + let timeline = g + .timeline_entry_mut(&sync_id) + .expect("set_awaits_download verified existence"); + + timeline.merge_metadata_from_downloaded( + &download_data.data.gathered_metadata, + ); + + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); + return DownloadStatus::Downloaded; + } + Err(e) => { + error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); + } + }; + } + Err(e) => { + error!("Failed to update local timeline metadata: {e:?}"); + download_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Download(download_data)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); + } + } + } + } + + DownloadStatus::Nothing +} + +async fn update_local_metadata( + conf: &'static PageServerConf, + sync_id: TenantTimelineId, + remote_timeline: Option<&RemoteTimeline>, +) -> anyhow::Result<()> { + let remote_metadata = match remote_timeline { + Some(timeline) => &timeline.metadata, + None => { + debug!("No remote timeline to update local metadata from, skipping the update"); + return Ok(()); + } + }; + let remote_lsn = remote_metadata.disk_consistent_lsn(); + + let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id); + let local_lsn = if local_metadata_path.exists() { + let local_metadata = read_metadata_file(&local_metadata_path) + .await + .with_context(|| { + format!( + "Failed to load local metadata from path '{}'", + local_metadata_path.display() + ) + })?; + + Some(local_metadata.disk_consistent_lsn()) + } else { + None + }; + + if local_lsn < Some(remote_lsn) { + info!("Updating local timeline metadata from remote timeline: local disk_consistent_lsn={local_lsn:?}, remote disk_consistent_lsn={remote_lsn}"); + // clone because spawn_blocking requires static lifetime + let cloned_metadata = remote_metadata.to_owned(); + let TenantTimelineId { + tenant_id, + timeline_id, + } = sync_id; + tokio::task::spawn_blocking(move || { + crate::tenant::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true) + }) + .await + .with_context(|| { + format!( + "failed to join save_metadata task for {}", + local_metadata_path.display() + ) + })? 
+ .with_context(|| { + format!( + "Failed to write remote metadata bytes locally to path '{}'", + local_metadata_path.display() + ) + })?; + } else { + info!("Local metadata at path '{}' has later disk consistent Lsn ({local_lsn:?}) than the remote one ({remote_lsn}), skipping the update", local_metadata_path.display()); + } + + Ok(()) +} + +async fn delete_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), + sync_id: TenantTimelineId, + mut new_delete_data: SyncData, + sync_start: Instant, +) { + static TASK_NAME: &str = "delete"; + + let timeline_delete = &mut new_delete_data.data; + + if !timeline_delete.deletion_registered { + if let Err(e) = update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Delete(&timeline_delete.layers_to_delete), + ) + .await + { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + new_delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); + return; + } + } + timeline_delete.deletion_registered = true; + + let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; + register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status)); +} + +async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result { + TimelineMetadata::from_bytes( + &fs::read(metadata_path) + .await + .context("Failed to read local metadata bytes from fs")?, + ) + .context("Failed to parse metadata bytes") +} + +async fn upload_timeline_data( + conf: &'static PageServerConf, + (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue), + current_remote_timeline: Option<&RemoteTimeline>, + sync_id: TenantTimelineId, + new_upload_data: SyncData, + sync_start: Instant, +) -> UploadStatus { + static TASK_NAME: &str = "upload"; + let mut uploaded_data = match upload_timeline_layers( + storage, + sync_queue, + current_remote_timeline, + sync_id, + new_upload_data, + ) + .await + { + UploadedTimeline::FailedAndRescheduled(e) => { + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); + return UploadStatus::Failed(e); + } + UploadedTimeline::Successful(upload_data) => upload_data, + }; + + match update_remote_data( + conf, + storage, + index, + sync_id, + RemoteDataUpdate::Upload { + uploaded_data: uploaded_data.data.clone(), + upload_failed: false, + }, + ) + .await + { + Ok(()) => { + register_sync_status(sync_id, sync_start, TASK_NAME, Some(true)); + UploadStatus::Uploaded + } + Err(e) => { + error!("Failed to update remote timeline {sync_id}: {e:?}"); + uploaded_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); + register_sync_status(sync_id, sync_start, TASK_NAME, Some(false)); + UploadStatus::Failed(e) + } + } +} + +enum RemoteDataUpdate<'a> { + Upload { + uploaded_data: LayersUpload, + upload_failed: bool, + }, + Delete(&'a HashSet), +} + +async fn update_remote_data( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + index: &RemoteIndex, + sync_id: TenantTimelineId, + update: RemoteDataUpdate<'_>, +) -> anyhow::Result<()> { + let updated_remote_timeline = { + let mut index_accessor = index.write().await; + + match index_accessor.timeline_entry_mut(&sync_id) { + Some(existing_entry) => { + match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } => { + if let Some(new_metadata) = uploaded_data.metadata.as_ref() { + if 
existing_entry.metadata.disk_consistent_lsn() + < new_metadata.disk_consistent_lsn() + { + existing_entry.metadata = new_metadata.clone(); + } + } + if upload_failed { + existing_entry.add_upload_failures( + uploaded_data + .layers_to_upload + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); + } else { + existing_entry.add_timeline_layers( + uploaded_data + .uploaded_layers + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); + } + } + RemoteDataUpdate::Delete(layers_to_remove) => { + existing_entry.remove_layers(layers_to_remove) + } + } + existing_entry.clone() + } + None => match update { + RemoteDataUpdate::Upload { + uploaded_data, + upload_failed, + } => { + let new_metadata = match uploaded_data.metadata.as_ref() { + Some(new_metadata) => new_metadata, + None => bail!("For timeline {sync_id} upload, there's no upload metadata and no remote index entry, cannot create a new one"), + }; + let mut new_remote_timeline = RemoteTimeline::new(new_metadata.clone()); + if upload_failed { + new_remote_timeline.add_upload_failures( + uploaded_data + .layers_to_upload + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); + } else { + new_remote_timeline.add_timeline_layers( + uploaded_data + .uploaded_layers + .iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())), + ); + } + + index_accessor.add_timeline_entry(sync_id, new_remote_timeline.clone()); + new_remote_timeline + } + RemoteDataUpdate::Delete(_) => { + warn!("No remote index entry for timeline {sync_id}, skipping deletion"); + return Ok(()); + } + }, + } + }; + + let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); + let new_index_part = + IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) + .context("Failed to create an index part from the updated remote timeline")?; + + debug!("Uploading remote index for the timeline"); + REMOTE_INDEX_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); + + upload_index_part(conf, storage, sync_id, new_index_part) + .await + .context("Failed to upload new index part") +} + +async fn validate_task_retries( + current_attempt: u32, + max_sync_errors: NonZeroU32, +) -> ControlFlow<(), ()> { + let max_sync_errors = max_sync_errors.get(); + if current_attempt >= max_sync_errors { + return ControlFlow::Break(()); + } + + exponential_backoff(current_attempt, 1.0, 30.0).await; + ControlFlow::Continue(()) +} + +fn schedule_first_sync_tasks( + index: &mut RemoteTimelineIndex, + sync_queue: &SyncQueue, + local_timeline_files: HashMap, +) -> TenantTimelineValues { + let mut local_timeline_init_statuses = TenantTimelineValues::new(); + + let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len()); + + for (sync_id, local_timeline) in local_timeline_files { + let TimelineLocalFiles(local_metadata, local_files) = local_timeline; + match index.timeline_entry_mut(&sync_id) { + Some(remote_timeline) => { + let (timeline_status, awaits_download) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata, + local_files, + remote_timeline, + ); + match local_timeline_init_statuses + .0 + .entry(sync_id.tenant_id) + .or_default() + .entry(sync_id.timeline_id) + { + hash_map::Entry::Occupied(mut o) => { + { + // defensive check + warn!( + "Overwriting timeline init sync status. 
Status {timeline_status:?}, timeline {}", + sync_id.timeline_id + ); + } + o.insert(timeline_status); + } + hash_map::Entry::Vacant(v) => { + v.insert(timeline_status); + } + } + + remote_timeline.awaits_download = awaits_download; + } + None => { + // TODO (rodionov) does this mean that we've crashed during tenant creation? + // is it safe to upload this checkpoint? could it be half broken? + warn!( + "marking {} as locally complete, while it doesnt exist in remote index", + sync_id + ); + new_sync_tasks.push_back(( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload: local_files, + uploaded_layers: HashMap::new(), + metadata: Some(local_metadata.clone()), + }), + )); + local_timeline_init_statuses + .0 + .entry(sync_id.tenant_id) + .or_default() + .insert( + sync_id.timeline_id, + LocalTimelineInitStatus::LocallyComplete(local_metadata), + ); + } + } + } + + new_sync_tasks.into_iter().for_each(|(sync_id, task)| { + sync_queue.push(sync_id, task); + }); + local_timeline_init_statuses +} + +/// bool in return value stands for awaits_download +fn compare_local_and_remote_timeline( + new_sync_tasks: &mut VecDeque<(TenantTimelineId, SyncTask)>, + sync_id: TenantTimelineId, + local_metadata: TimelineMetadata, + local_files: HashMap, + remote_entry: &RemoteTimeline, +) -> (LocalTimelineInitStatus, bool) { + let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered(); + + let needed_to_download_files = remote_entry + .stored_files() + .iter() + .filter_map(|(layer_file, remote_metadata)| { + if let Some(local_metadata) = local_files.get(layer_file) { + match (remote_metadata.file_size(), local_metadata.file_size()) { + (Some(x), Some(y)) if x == y => { None }, + (None, Some(_)) => { + // upgrading from an earlier IndexPart without metadata + None + }, + _ => { + // having to deal with other than (Some(x), Some(y)) where x != y here is a + // bummer, but see #2582 and #2610 for attempts and discussion. + warn!("Redownloading locally existing {layer_file:?} due to size mismatch, size on index: {:?}, on disk: {:?}", remote_metadata.file_size(), local_metadata.file_size()); + Some(layer_file) + }, + } + } else { + // doesn't exist locally + Some(layer_file) + } + }) + .collect::>(); + + let (initial_timeline_status, awaits_download) = if !needed_to_download_files.is_empty() { + new_sync_tasks.push_back(( + sync_id, + SyncTask::download(LayersDownload::from_skipped_layers( + local_files + .keys() + .filter(|path| !needed_to_download_files.contains(path)) + .cloned() + .collect(), + )), + )); + info!("NeedsSync"); + (LocalTimelineInitStatus::NeedsSync, true) + // we do not need to manipulate with remote consistent lsn here + // because it will be updated when sync will be completed + } else { + info!("LocallyComplete"); + ( + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()), + false, + ) + }; + + let layers_to_upload = local_files + .iter() + .filter_map(|(local_file, metadata)| { + if !remote_entry.stored_files().contains_key(local_file) { + Some((local_file.to_owned(), metadata.to_owned())) + } else { + None + } + }) + .collect::>(); + + if !layers_to_upload.is_empty() { + new_sync_tasks.push_back(( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload, + uploaded_layers: HashMap::new(), + metadata: Some(local_metadata), + }), + )); + // Note that status here doesn't change. 
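+        //
+        // Summary of the decision above (descriptive only):
+        //   - remote layer missing locally, or local/remote sizes disagree
+        //       -> NeedsSync, a download task is enqueued, awaits_download = true
+        //   - local layer missing remotely
+        //       -> an upload task is enqueued, the init status stays as computed
+        //   - layer present on both sides with a matching (or unknown remote) size
+        //       -> nothing is scheduled for that layer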
+ } + + (initial_timeline_status, awaits_download) +} + +fn register_sync_status( + sync_id: TenantTimelineId, + sync_start: Instant, + sync_name: &str, + sync_status: Option, +) { + let secs_elapsed = sync_start.elapsed().as_secs_f64(); + debug!("Processed a sync task in {secs_elapsed:.2} seconds"); + + let tenant_id = sync_id.tenant_id.to_string(); + let timeline_id = sync_id.timeline_id.to_string(); + + let sync_status = match sync_status { + Some(true) => "success", + Some(false) => "failure", + None => "abort", + }; + + IMAGE_SYNC_TIME_HISTOGRAM + .with_label_values(&[sync_name, sync_status]) + .observe(secs_elapsed); + IMAGE_SYNC_TIME + .with_label_values(&[&tenant_id, &timeline_id]) + .add(secs_elapsed); + IMAGE_SYNC_COUNT + .with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status]) + .inc(); +} + +#[cfg(test)] +mod test_utils { + use utils::lsn::Lsn; + + use crate::tenant::harness::TenantHarness; + + use super::*; + + pub(super) async fn create_local_timeline( + harness: &TenantHarness<'_>, + timeline_id: TimelineId, + filenames: &[&str], + metadata: TimelineMetadata, + ) -> anyhow::Result { + let timeline_path = harness.timeline_path(&timeline_id); + fs::create_dir_all(&timeline_path).await?; + + let mut layers_to_upload = HashMap::with_capacity(filenames.len()); + for &file in filenames { + let file_path = timeline_path.join(file); + fs::write(&file_path, dummy_contents(file).into_bytes()).await?; + let metadata = LayerFileMetadata::new(file_path.metadata()?.len()); + layers_to_upload.insert(file_path, metadata); + } + + fs::write( + harness.conf.metadata_path(timeline_id, harness.tenant_id), + metadata.to_bytes()?, + ) + .await?; + + Ok(LayersUpload { + layers_to_upload, + uploaded_layers: HashMap::new(), + metadata: Some(metadata), + }) + } + + pub(super) fn dummy_contents(name: &str) -> String { + format!("contents for {name}") + } + + pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata { + TimelineMetadata::new( + disk_consistent_lsn, + None, + None, + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do + // but it should be consistent with the one in the tests + crate::DEFAULT_PG_VERSION, + ) + } +} + +#[cfg(test)] +mod tests { + use super::test_utils::dummy_metadata; + use crate::tenant::harness::TIMELINE_ID; + use hex_literal::hex; + use utils::lsn::Lsn; + + use super::*; + + const TEST_SYNC_ID: TenantTimelineId = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("11223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + + #[tokio::test] + async fn separate_task_ids_batch() { + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + let sync_id_3 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("33223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + assert!(sync_id_2 != sync_id_3); + assert!(sync_id_3 != TEST_SYNC_ID); + + let download_task = + SyncTask::download(LayersDownload::from_skipped_layers(HashSet::from([ + PathBuf::from("sk"), + ]))); + let upload_task = SyncTask::upload(LayersUpload { + layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]), + uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]), + metadata: Some(dummy_metadata(Lsn(2))), + }); + let delete_task = 
SyncTask::delete(LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }); + + sync_queue.push(TEST_SYNC_ID, download_task.clone()); + sync_queue.push(sync_id_2, upload_task.clone()); + sync_queue.push(sync_id_3, delete_task.clone()); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let (mut batch, _) = sync_queue.next_task_batch(); + assert_eq!( + batch.len(), + submitted_tasks_count, + "Batch should consist of all tasks submitted" + ); + + assert_eq!( + Some(SyncTaskBatch::new(download_task)), + batch.remove(&TEST_SYNC_ID) + ); + assert_eq!( + Some(SyncTaskBatch::new(upload_task)), + batch.remove(&sync_id_2) + ); + assert_eq!( + Some(SyncTaskBatch::new(delete_task)), + batch.remove(&sync_id_3) + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_separate_tasks_batch() { + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + assert_eq!(sync_queue.len(), 0); + + let download = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk")])); + let upload = LayersUpload { + layers_to_upload: HashMap::from([(PathBuf::from("up"), LayerFileMetadata::new(123))]), + uploaded_layers: HashMap::from([(PathBuf::from("upl"), LayerFileMetadata::new(123))]), + metadata: Some(dummy_metadata(Lsn(2))), + }; + let delete = LayersDeletion { + layers_to_delete: HashSet::from([PathBuf::from("de")]), + deleted_layers: HashSet::from([PathBuf::from("del")]), + deletion_registered: false, + }; + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::upload(upload.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::delete(delete.clone())); + + let submitted_tasks_count = sync_queue.len(); + assert_eq!(submitted_tasks_count, 3); + let (mut batch, _) = sync_queue.next_task_batch(); + assert_eq!( + batch.len(), + 1, + "Queue should have one batch merged from 3 sync tasks of the same user" + ); + + assert_eq!( + Some(SyncTaskBatch { + upload: Some(SyncData { + retries: 0, + data: upload + }), + download: Some(SyncData { + retries: 0, + data: download + }), + delete: Some(SyncData { + retries: 0, + data: delete + }), + }), + batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks unchanged" + ); + + assert!(batch.is_empty(), "Should check all batch tasks"); + assert_eq!(sync_queue.len(), 0); + } + + #[tokio::test] + async fn same_task_id_same_tasks_batch() { + let sync_queue = SyncQueue::new(NonZeroUsize::new(1).unwrap()); + let download_1 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk1")])); + let download_2 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk2")])); + let download_3 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk3")])); + let download_4 = LayersDownload::from_skipped_layers(HashSet::from([PathBuf::from("sk4")])); + + let sync_id_2 = TenantTimelineId { + tenant_id: TenantId::from_array(hex!("22223344556677881122334455667788")), + timeline_id: TIMELINE_ID, + }; + assert!(sync_id_2 != TEST_SYNC_ID); + + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_1.clone())); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_2.clone())); + sync_queue.push(sync_id_2, SyncTask::download(download_3)); + sync_queue.push(TEST_SYNC_ID, SyncTask::download(download_4.clone())); + 
assert_eq!(sync_queue.len(), 4); + + let (mut smallest_batch, _) = sync_queue.next_task_batch(); + assert_eq!( + smallest_batch.len(), + 1, + "Queue should have one batch merged from the all sync tasks, but not the other user's task" + ); + assert_eq!( + Some(SyncTaskBatch { + download: Some(SyncData { + retries: 0, + data: LayersDownload::from_skipped_layers( + { + let mut set = HashSet::new(); + set.extend(download_1.layers_to_skip.into_iter()); + set.extend(download_2.layers_to_skip.into_iter()); + set.extend(download_4.layers_to_skip.into_iter()); + set + }, + ) + }), + upload: None, + delete: None, + }), + smallest_batch.remove(&TEST_SYNC_ID), + "Should have one batch containing all tasks merged for the tenant first appeared in the batch" + ); + + assert!(smallest_batch.is_empty(), "Should check all batch tasks"); + assert_eq!( + sync_queue.len(), + 1, + "Should have one task left out of the batch" + ); + } + + mod local_and_remote_comparisons { + use super::*; + + #[test] + fn ready() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = dummy_metadata(0x02.into()); + let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files, + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata) + ); + assert!(!sync_needed); + + assert!(new_sync_tasks.is_empty(), "{:?}", new_sync_tasks); + } + + #[test] + fn needs_download() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = dummy_metadata(0x02.into()); + let local_files = HashMap::default(); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata, + local_files.clone(), + &remote_entry, + ); + + assert_eq!(status, LocalTimelineInitStatus::NeedsSync); + assert!(sync_needed); + + let new_sync_tasks = new_sync_tasks.into_iter().collect::>(); + + assert_eq!( + &new_sync_tasks, + &[( + sync_id, + SyncTask::download(LayersDownload::from_skipped_layers( + local_files.keys().cloned().collect() + )) + )] + ); + } + + #[test] + fn redownload_is_not_needed_on_upgrade() { + // originally the implementation missed the `(None, Some(_))` case in the match, and + // proceeded to always redownload if the remote metadata was not available. + + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + + let local_metadata = dummy_metadata(0x02.into()); + + // type system would in general allow that LayerFileMetadata would be created with + // file_size: None, however `LayerFileMetadata::default` is only allowed from tests, + // and so everywhere within the system valid LayerFileMetadata is being created, it is + // created through `::new`. 
+ let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + + // RemoteTimeline is constructed out of an older version IndexPart, which didn't carry + // any metadata. + remote_entry + .add_timeline_layers([(PathBuf::from("first_file"), LayerFileMetadata::default())]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files, + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata) + ); + assert!(!sync_needed); + } + + #[test] + fn needs_upload() { + let mut new_sync_tasks = VecDeque::default(); + let sync_id = TenantTimelineId::generate(); + let local_metadata = dummy_metadata(0x02.into()); + let local_files = + HashMap::from([(PathBuf::from("first_file"), LayerFileMetadata::new(123))]); + let mut remote_entry = RemoteTimeline::new(local_metadata.clone()); + remote_entry.add_timeline_layers([]); + + let (status, sync_needed) = compare_local_and_remote_timeline( + &mut new_sync_tasks, + sync_id, + local_metadata.clone(), + local_files.clone(), + &remote_entry, + ); + + assert_eq!( + status, + LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()) + ); + assert!(!sync_needed); + + let new_sync_tasks = new_sync_tasks.into_iter().collect::>(); + + assert_eq!( + &new_sync_tasks, + &[( + sync_id, + SyncTask::upload(LayersUpload { + layers_to_upload: local_files, + uploaded_layers: HashMap::default(), + metadata: Some(local_metadata), + }) + )] + ); + } + } +} diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs new file mode 100644 index 0000000000..39846f0da3 --- /dev/null +++ b/pageserver/src/storage_sync/delete.rs @@ -0,0 +1,235 @@ +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. + +use std::path::Path; + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use tracing::{debug, error, info}; + +use crate::storage_sync::{SyncQueue, SyncTask}; +use remote_storage::GenericRemoteStorage; +use utils::id::TenantTimelineId; + +use super::{LayersDeletion, SyncData}; + +/// Attempts to remove the timleline layers from the remote storage. +/// If the task had not adjusted the metadata before, the deletion will fail. 
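+///
+/// A short usage sketch, mirroring `delete_timeline_data` in the parent module
+/// (not a doctest; the returned flag is fed to `register_sync_status` there):
+///
+/// ```ignore
+/// let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
+/// register_sync_status(sync_id, sync_start, "delete", Some(sync_status));
+/// ```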
+pub(super) async fn delete_timeline_layers( + storage: &GenericRemoteStorage, + sync_queue: &SyncQueue, + sync_id: TenantTimelineId, + mut delete_data: SyncData, +) -> bool { + if !delete_data.data.deletion_registered { + error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + return false; + } + + if delete_data.data.layers_to_delete.is_empty() { + info!("No layers to delete, skipping"); + return true; + } + + let layers_to_delete = delete_data + .data + .layers_to_delete + .drain() + .collect::>(); + debug!("Layers to delete: {layers_to_delete:?}"); + info!("Deleting {} timeline layers", layers_to_delete.len()); + + let mut delete_tasks = layers_to_delete + .into_iter() + .map(|local_layer_path| async { + match remove_storage_object(storage, &local_layer_path).await { + Ok(()) => Ok(local_layer_path), + Err(e) => Err((e, local_layer_path)), + } + }) + .collect::>(); + + let mut errored = false; + while let Some(deletion_result) = delete_tasks.next().await { + match deletion_result { + Ok(local_layer_path) => { + debug!( + "Successfully deleted layer {} for timeline {sync_id}", + local_layer_path.display() + ); + delete_data.data.deleted_layers.insert(local_layer_path); + } + Err((e, local_layer_path)) => { + errored = true; + error!( + "Failed to delete layer {} for timeline {sync_id}: {e:?}", + local_layer_path.display() + ); + delete_data.data.layers_to_delete.insert(local_layer_path); + } + } + } + + if errored { + debug!("Reenqueuing failed delete task for timeline {sync_id}"); + delete_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Delete(delete_data)); + } else { + info!("Successfully deleted all layers"); + } + errored +} + +async fn remove_storage_object( + storage: &GenericRemoteStorage, + local_layer_path: &Path, +) -> anyhow::Result<()> { + let storage_path = storage + .remote_object_id(local_layer_path) + .with_context(|| { + format!( + "Failed to get the layer storage path for local path '{}'", + local_layer_path.display() + ) + })?; + + storage.delete(&storage_path).await.with_context(|| { + format!( + "Failed to delete remote layer from storage at '{:?}'", + storage_path + ) + }) +} + +#[cfg(test)] +mod tests { + use std::{collections::HashSet, num::NonZeroUsize}; + + use itertools::Itertools; + use tempfile::tempdir; + use tokio::fs; + use utils::lsn::Lsn; + + use crate::{ + storage_sync::test_utils::{create_local_timeline, dummy_metadata}, + tenant::harness::{TenantHarness, TIMELINE_ID}, + }; + use remote_storage::{LocalFs, RemoteStorage}; + + use super::*; + + #[tokio::test] + async fn delete_timeline_negative() -> anyhow::Result<()> { + let harness = TenantHarness::create("delete_timeline_negative")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: 1, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::new(), + deletion_registered: false, + }, + }, + ) + .await; + + assert!( + !deleted, + "Should not start the deletion for task with delete metadata unregistered" + ); + + Ok(()) + } + + #[tokio::test] + async fn delete_timeline() -> anyhow::Result<()> { + let harness = 
TenantHarness::create("delete_timeline")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "c", "d"]; + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?); + + let local_storage = storage.as_local().unwrap(); + + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + for (local_path, _metadata) in timeline_upload.layers_to_upload { + let remote_path = + local_storage.resolve_in_storage(&local_storage.remote_object_id(&local_path)?)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + assert_eq!( + local_storage + .list() + .await? + .into_iter() + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + layer_files + .iter() + .map(|layer_str| layer_str.to_string()) + .sorted() + .collect::>(), + "Expect to have all layer files remotely before deletion" + ); + + let deleted = delete_timeline_layers( + &storage, + &sync_queue, + sync_id, + SyncData { + retries: current_retries, + data: LayersDeletion { + deleted_layers: HashSet::new(), + layers_to_delete: HashSet::from([ + local_timeline_path.join("a"), + local_timeline_path.join("c"), + local_timeline_path.join("something_different"), + ]), + deletion_registered: true, + }, + }, + ) + .await; + assert!(deleted, "Should be able to delete timeline files"); + + assert_eq!( + local_storage + .list() + .await? + .into_iter() + .map(|remote_path| local_storage.local_path(&remote_path).unwrap()) + .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) + .sorted() + .collect::>(), + vec!["b".to_string(), "d".to_string()], + "Expect to have only non-deleted files remotely" + ); + + Ok(()) + } +} diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs new file mode 100644 index 0000000000..6f9b2e2071 --- /dev/null +++ b/pageserver/src/storage_sync/download.rs @@ -0,0 +1,695 @@ +//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory. + +use std::{ + collections::{HashMap, HashSet}, + fmt::Debug, + mem, + path::Path, +}; + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::{DownloadError, GenericRemoteStorage}; +use tokio::{ + fs, + io::{self, AsyncWriteExt}, +}; +use tracing::{debug, error, info, warn}; + +use crate::{ + config::PageServerConf, + storage_sync::{index::LayerFileMetadata, SyncTask}, + TEMP_FILE_SUFFIX, +}; +use utils::{ + crashsafe::path_with_suffix_extension, + id::{TenantId, TenantTimelineId, TimelineId}, +}; + +use super::{ + index::{IndexPart, RemoteTimeline}, + LayersDownload, SyncData, SyncQueue, +}; + +// We collect timelines remotely available for each tenant +// in case we failed to gather all index parts (due to an error) +// Poisoned variant is returned. +// When data is received succesfully without errors Present variant is used. 
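+//
+// A sketch of how callers consume this type (mirrors
+// `gather_tenant_timelines_index_parts` below):
+//
+//     match tenant_index_parts {
+//         TenantIndexParts::Present(parts) => { /* every index part was fetched */ }
+//         TenantIndexParts::Poisoned { missing, .. } => {
+//             /* at least one timeline's index part failed to download */
+//         }
+//     }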
+pub enum TenantIndexParts { + Poisoned { + present: HashMap, + missing: HashSet, + }, + Present(HashMap), +} + +impl TenantIndexParts { + fn add_poisoned(&mut self, timeline_id: TimelineId) { + match self { + TenantIndexParts::Poisoned { missing, .. } => { + missing.insert(timeline_id); + } + TenantIndexParts::Present(present) => { + *self = TenantIndexParts::Poisoned { + present: mem::take(present), + missing: HashSet::from([timeline_id]), + } + } + } + } +} + +impl Default for TenantIndexParts { + fn default() -> Self { + TenantIndexParts::Present(HashMap::default()) + } +} + +pub async fn download_index_parts( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + keys: HashSet, +) -> HashMap { + let mut index_parts: HashMap = HashMap::new(); + + let mut part_downloads = keys + .into_iter() + .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) + .collect::>(); + + while let Some((id, part_upload_result)) = part_downloads.next().await { + match part_upload_result { + Ok(index_part) => { + debug!("Successfully fetched index part for {id}"); + match index_parts.entry(id.tenant_id).or_default() { + TenantIndexParts::Poisoned { present, .. } => { + present.insert(id.timeline_id, index_part); + } + TenantIndexParts::Present(parts) => { + parts.insert(id.timeline_id, index_part); + } + } + } + Err(download_error) => { + match download_error { + DownloadError::NotFound => { + // thats ok because it means that we didnt upload something we have locally for example + } + e => { + let tenant_parts = index_parts.entry(id.tenant_id).or_default(); + tenant_parts.add_poisoned(id.timeline_id); + error!( + "Failed to fetch index part for {id}: {e} poisoning tenant index parts" + ); + } + } + } + } + } + + index_parts +} + +/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests. +/// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines). +/// And then will attempt to download all index files that belong to these timelines. +pub async fn gather_tenant_timelines_index_parts( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + tenant_id: TenantId, +) -> anyhow::Result> { + let tenant_path = conf.timelines_path(&tenant_id); + let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id) + .await + .with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?; + + match download_index_parts(conf, storage, timeline_sync_ids) + .await + .remove(&tenant_id) + .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? + { + TenantIndexParts::Poisoned { missing, .. } => { + anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}") + } + TenantIndexParts::Present(parts) => Ok(parts), + } +} + +/// Retrieves index data from the remote storage for a given timeline. 
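+///
+/// Errors come back as [`DownloadError`], so callers can treat a missing
+/// `index_part.json` differently from other failures; a sketch mirroring
+/// `download_index_parts` above (not a doctest):
+///
+/// ```ignore
+/// match download_index_part(conf, storage, sync_id).await {
+///     Ok(index_part) => { /* merge into the remote index */ }
+///     Err(DownloadError::NotFound) => { /* nothing was uploaded yet, not an error */ }
+///     Err(other) => { /* poison this tenant's index parts */ }
+/// }
+/// ```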
+async fn download_index_part( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + sync_id: TenantTimelineId, +) -> Result { + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); + let mut index_part_download = storage + .download_storage_object(None, &index_part_path) + .await?; + + let mut index_part_bytes = Vec::new(); + io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!( + "Failed to download an index part into file '{}'", + index_part_path.display() + ) + }) + .map_err(DownloadError::Other)?; + + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) + .with_context(|| { + format!( + "Failed to deserialize index part file into file '{}'", + index_part_path.display() + ) + }) + .map_err(DownloadError::Other)?; + + let missing_files = index_part.missing_files(); + if !missing_files.is_empty() { + warn!("Found missing layers in index part for timeline {sync_id}: {missing_files:?}"); + } + + Ok(index_part) +} + +/// Timeline download result, with extra data, needed for downloading. +#[derive(Debug)] +pub(super) enum DownloadedTimeline { + /// Remote timeline data is either absent or corrupt, no download possible. + Abort, + /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. + /// Initial download failed due to some error, the download task is rescheduled for another retry. + FailedAndRescheduled, + /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known. + /// Initial download successful. + Successful(SyncData), +} + +/// Attempts to download all given timeline's layers. +/// Timeline files that already exist locally are skipped during the download, but the local metadata file is +/// updated in the end, if the remote one contains a newer disk_consistent_lsn. +/// +/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task. 
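+///
+/// Caller-side handling, as done by `download_timeline_data` in the parent module
+/// (sketch, not a doctest):
+///
+/// ```ignore
+/// match download_timeline_layers(conf, storage, sync_queue, remote_timeline, sync_id, data).await {
+///     DownloadedTimeline::Abort => { /* clear awaits_download, register an aborted sync */ }
+///     DownloadedTimeline::FailedAndRescheduled => { /* the task was re-enqueued with retries + 1 */ }
+///     DownloadedTimeline::Successful(data) => { /* update local metadata, merge layer metadata */ }
+/// }
+/// ```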
+pub(super) async fn download_timeline_layers<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + sync_queue: &'a SyncQueue, + remote_timeline: Option<&'a RemoteTimeline>, + sync_id: TenantTimelineId, + mut download_data: SyncData, +) -> DownloadedTimeline { + let remote_timeline = match remote_timeline { + Some(remote_timeline) => { + if !remote_timeline.awaits_download { + error!("Timeline with sync id {sync_id} is not awaiting download"); + return DownloadedTimeline::Abort; + } + remote_timeline + } + None => { + error!("Timeline with sync id {sync_id} is not present in the remote index"); + return DownloadedTimeline::Abort; + } + }; + + let download = &mut download_data.data; + + let layers_to_download = remote_timeline + .stored_files() + .iter() + .filter_map(|(layer_path, metadata)| { + if !download.layers_to_skip.contains(layer_path) { + Some((layer_path.to_owned(), metadata.to_owned())) + } else { + None + } + }) + .collect::>(); + + debug!("Layers to download: {layers_to_download:?}"); + info!("Downloading {} timeline layers", layers_to_download.len()); + + if layers_to_download.is_empty() { + info!("No layers to download after filtering, skipping"); + return DownloadedTimeline::Successful(download_data); + } + + let mut download_tasks = layers_to_download + .into_iter() + .map(|(layer_destination_path, metadata)| async move { + + match layer_destination_path.metadata() { + Ok(m) if m.is_file() => { + // the file exists from earlier round when we failed after renaming it as + // layer_destination_path + let verified = if let Some(expected) = metadata.file_size() { + m.len() == expected + } else { + // behaviour before recording metadata was to accept any existing + true + }; + + if verified { + debug!( + "Layer already exists locally, skipping download: {}", + layer_destination_path.display() + ); + return Ok((layer_destination_path, LayerFileMetadata::new(m.len()))) + } else { + // no need to remove it, it will be overwritten by fs::rename + // after successful download + warn!("Downloaded layer exists already but layer file metadata mismatches: {}, metadata {:?}", layer_destination_path.display(), metadata); + } + } + Ok(m) => { + return Err(anyhow::anyhow!("Downloaded layer destination exists but is not a file: {m:?}, target needs to be removed/archived manually: {layer_destination_path:?}")); + } + Err(_) => { + // behave as the file didn't exist + } + } + + // Perform a rename inspired by durable_rename from file_utils.c. + // The sequence: + // write(tmp) + // fsync(tmp) + // rename(tmp, new) + // fsync(new) + // fsync(parent) + // For more context about durable_rename check this email from postgres mailing list: + // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com + // If pageserver crashes the temp file will be deleted on startup and re-downloaded. + let temp_file_path = + path_with_suffix_extension(&layer_destination_path, TEMP_FILE_SUFFIX); + + // TODO: this doesn't use the cached fd for some reason? 
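+            //
+            // How the sequence above maps onto the code below (summary only):
+            //   write(tmp)       -> `io::copy` into the `.temp` file (held by `destination_file`)
+            //   fsync(tmp)       -> `destination_file.sync_all()`
+            //   rename(tmp, new) -> `fs::rename(&temp_file_path, &layer_destination_path)`
+            //   fsync(new)       -> `fsync_path(&layer_destination_path)`
+            //   fsync(parent)    -> `fsync_path(&timeline_dir)`, once after all layer downloads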
+ let mut destination_file = + fs::File::create(&temp_file_path).await.with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + temp_file_path.display() + ) + })?; + + let mut layer_download = storage.download_storage_object(None, &layer_destination_path) + .await + .with_context(|| { + format!( + "Failed to initiate the download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + + let bytes_amount = io::copy(&mut layer_download.download_stream, &mut destination_file) + .await + .with_context(|| { + format!( + "Failed to download the layer for {sync_id} into file '{}'", + temp_file_path.display() + ) + })?; + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because + // we assume that `destination_file` file is fully written. I.e there is no pending .write(...).await operations. + // But for additional safety let's check/wait for any pending operations. + destination_file.flush().await.with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + })?; + + match metadata.file_size() { + Some(expected) if expected != bytes_amount => { + anyhow::bail!( + "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", + temp_file_path.display() + ); + }, + Some(_) | None => { + // matches, or upgrading from an earlier IndexPart version + } + } + + // not using sync_data because it can lose file size update + destination_file.sync_all().await.with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + })?; + drop(destination_file); + + fail::fail_point!("remote-storage-download-pre-rename", |_| { + anyhow::bail!("remote-storage-download-pre-rename failpoint triggered") + }); + + fs::rename(&temp_file_path, &layer_destination_path).await?; + + fsync_path(&layer_destination_path).await.with_context(|| { + format!( + "Cannot fsync layer destination path {}", + layer_destination_path.display(), + ) + })?; + + Ok::<_, anyhow::Error>((layer_destination_path, LayerFileMetadata::new(bytes_amount))) + }) + .collect::>(); + + let mut errors_happened = false; + // keep files we've downloaded to remove them from layers_to_skip if directory fsync fails + let mut undo = HashSet::new(); + while let Some(download_result) = download_tasks.next().await { + match download_result { + Ok((downloaded_path, metadata)) => { + undo.insert(downloaded_path.clone()); + download.layers_to_skip.insert(downloaded_path.clone()); + // what if the key existed already? 
ignore, because then we would had + // downloaded a partial file, and had to retry + download.gathered_metadata.insert(downloaded_path, metadata); + } + Err(e) => { + errors_happened = true; + error!("Failed to download a layer for timeline {sync_id}: {e:?}"); + } + } + } + + // fsync timeline directory which is a parent directory for downloaded files + let TenantTimelineId { + tenant_id, + timeline_id, + } = &sync_id; + let timeline_dir = conf.timeline_path(timeline_id, tenant_id); + if let Err(e) = fsync_path(&timeline_dir).await { + error!( + "Cannot fsync parent directory {} error {}", + timeline_dir.display(), + e + ); + for item in undo { + download.layers_to_skip.remove(&item); + // intentionally don't clear the gathered_metadata because it exists for fsync_path + // failure on parent directory + } + errors_happened = true; + } + + if errors_happened { + debug!("Reenqueuing failed download task for timeline {sync_id}"); + download_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Download(download_data)); + DownloadedTimeline::FailedAndRescheduled + } else { + info!("Successfully downloaded all layers"); + DownloadedTimeline::Successful(download_data) + } +} + +async fn get_timeline_sync_ids( + storage: &GenericRemoteStorage, + tenant_path: &Path, + tenant_id: TenantId, +) -> anyhow::Result> { + let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| { + format!( + "Failed to get tenant storage path for local path '{}'", + tenant_path.display() + ) + })?; + + let timelines = storage + .list_prefixes(Some(&tenant_storage_path)) + .await + .with_context(|| { + format!( + "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" + ) + })?; + + if timelines.is_empty() { + anyhow::bail!("no timelines found on the remote storage") + } + + let mut sync_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: TimelineId = object_name.parse().with_context(|| { + format!("failed to parse object name into timeline id '{object_name}'") + })?; + + sync_ids.insert(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + + Ok(sync_ids) +} + +async fn fsync_path(path: impl AsRef) -> Result<(), io::Error> { + fs::File::open(path).await?.sync_all().await +} + +#[cfg(test)] +mod tests { + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + path::PathBuf, + }; + + use remote_storage::{LocalFs, RemoteStorage}; + use tempfile::tempdir; + use utils::lsn::Lsn; + + use crate::{ + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, + tenant::harness::{TenantHarness, TIMELINE_ID}, + }; + + use super::*; + + #[tokio::test] + async fn download_timeline() -> anyhow::Result<()> { + let harness = TenantHarness::create("download_timeline")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"]; + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let timeline_upload = + 
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + + for local_path in timeline_upload.layers_to_upload.keys() { + let remote_path = + local_storage.resolve_in_storage(&storage.remote_object_id(local_path)?)?; + let remote_parent_dir = remote_path.parent().unwrap(); + if !remote_parent_dir.exists() { + fs::create_dir_all(&remote_parent_dir).await?; + } + fs::copy(&local_path, &remote_path).await?; + } + let mut read_dir = fs::read_dir(&local_timeline_path).await?; + while let Some(dir_entry) = read_dir.next_entry().await? { + if dir_entry.file_name().to_str() == Some("layer_to_keep_locally") { + continue; + } else { + fs::remove_file(dir_entry.path()).await?; + } + } + + let mut remote_timeline = RemoteTimeline::new(metadata.clone()); + remote_timeline.awaits_download = true; + remote_timeline.add_timeline_layers(layer_files.iter().map(|layer| { + let layer_path = local_timeline_path.join(layer); + + // this could had also been LayerFileMetadata::default(), but since in this test we + // don't do the merge operation done by storage_sync::download_timeline_data, it would + // not be merged back to timeline. + let metadata_from_upload = timeline_upload + .layers_to_upload + .get(&layer_path) + .expect("layer must exist in previously uploaded paths") + .to_owned(); + (layer_path, metadata_from_upload) + })); + + let download_data = match download_timeline_layers( + harness.conf, + &storage, + &sync_queue, + Some(&remote_timeline), + sync_id, + SyncData::new( + current_retries, + LayersDownload::from_skipped_layers(HashSet::from([ + local_timeline_path.join("layer_to_skip") + ])), + ), + ) + .await + { + DownloadedTimeline::Successful(data) => data, + wrong_result => { + panic!("Expected a successful download for timeline, but got: {wrong_result:?}") + } + }; + + assert_eq!( + current_retries, download_data.retries, + "On successful download, retries are not expected to change" + ); + assert_eq!( + download_data + .data + .layers_to_skip + .into_iter() + .collect::>(), + layer_files + .iter() + .map(|layer| local_timeline_path.join(layer)) + .collect(), + "On successful download, layers to skip should contain all downloaded files and present layers that were skipped" + ); + + let mut downloaded_files = BTreeSet::new(); + let mut read_dir = fs::read_dir(&local_timeline_path).await?; + while let Some(dir_entry) = read_dir.next_entry().await? 
{ + downloaded_files.insert(dir_entry.path()); + } + + assert_eq!( + downloaded_files, + layer_files + .iter() + .filter(|layer| layer != &&"layer_to_skip") + .map(|layer| local_timeline_path.join(layer)) + .collect(), + "On successful download, all layers that were not skipped, should be downloaded" + ); + + Ok(()) + } + + #[tokio::test] + async fn download_timeline_negatives() -> anyhow::Result<()> { + let harness = TenantHarness::create("download_timeline_negatives")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + + let empty_remote_timeline_download = download_timeline_layers( + harness.conf, + &storage, + &sync_queue, + None, + sync_id, + SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())), + ) + .await; + assert!( + matches!(empty_remote_timeline_download, DownloadedTimeline::Abort), + "Should not allow downloading for empty remote timeline" + ); + + let not_expecting_download_remote_timeline = RemoteTimeline::new(dummy_metadata(Lsn(5))); + assert!( + !not_expecting_download_remote_timeline.awaits_download, + "Should not expect download for the timeline" + ); + let already_downloading_remote_timeline_download = download_timeline_layers( + harness.conf, + &storage, + &sync_queue, + Some(¬_expecting_download_remote_timeline), + sync_id, + SyncData::new(0, LayersDownload::from_skipped_layers(HashSet::new())), + ) + .await; + assert!( + matches!( + already_downloading_remote_timeline_download, + DownloadedTimeline::Abort, + ), + "Should not allow downloading for remote timeline that does not expect it" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_download_index_part() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_download_index_part")?; + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + + let index_part = IndexPart::new( + HashSet::from([ + RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?, + RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?, + ]), + HashSet::from([RelativePath::new( + &local_timeline_path, + local_timeline_path.join("three"), + )?]), + metadata.disk_consistent_lsn(), + metadata.to_bytes()?, + ); + + let local_index_part_path = harness + .conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); + let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?; + let index_part_local_path = PathBuf::from(index_part_remote_id.to_string()); + fs::create_dir_all(index_part_local_path.parent().unwrap()).await?; + fs::write(&index_part_local_path, serde_json::to_vec(&index_part)?).await?; + + let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?; + + assert_eq!( + downloaded_index_part, index_part, + "Downloaded index part should be the same as the one in storage" + ); + + Ok(()) + } +} diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs new file mode 100644 index 0000000000..0779d95e8e --- /dev/null +++ 
b/pageserver/src/storage_sync/index.rs @@ -0,0 +1,712 @@ +//! In-memory index to track the tenant files on the remote storage. +//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about +//! remote timeline layers and its metadata. + +use std::ops::{Deref, DerefMut}; +use std::{ + collections::{HashMap, HashSet}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::{anyhow, Context, Ok}; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use tokio::sync::RwLock; +use tracing::log::warn; + +use crate::{config::PageServerConf, tenant::metadata::TimelineMetadata}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use super::download::TenantIndexParts; + +/// A part of the filesystem path, that needs a root to become a path again. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct RelativePath(String); + +impl RelativePath { + /// Attempts to strip off the base from path, producing a relative path or an error. + pub fn new>(base: &Path, path: P) -> anyhow::Result { + let path = path.as_ref(); + let relative = path.strip_prefix(base).with_context(|| { + format!( + "path '{}' is not relative to base '{}'", + path.display(), + base.display() + ) + })?; + Ok(RelativePath(relative.to_string_lossy().to_string())) + } + + /// Joins the relative path with the base path. + fn as_path(&self, base: &Path) -> PathBuf { + base.join(&self.0) + } +} + +#[derive(Debug, Clone, Default)] +pub struct TenantEntry(HashMap); + +impl TenantEntry { + pub fn has_in_progress_downloads(&self) -> bool { + self.values() + .any(|remote_timeline| remote_timeline.awaits_download) + } +} + +impl Deref for TenantEntry { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TenantEntry { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { + Self(inner) + } +} + +/// An index to track tenant files that exist on the remote storage. +#[derive(Debug, Clone, Default)] +pub struct RemoteTimelineIndex { + entries: HashMap, +} + +/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. +#[derive(Default)] +pub struct RemoteIndex(Arc>); + +impl RemoteIndex { + pub fn from_parts( + conf: &'static PageServerConf, + index_parts: HashMap, + ) -> anyhow::Result { + let mut entries: HashMap = HashMap::new(); + + for (tenant_id, index_parts) in index_parts { + match index_parts { + // TODO: should we schedule a retry so it can be recovered? 
otherwise we can revive it only through detach/attach or pageserver restart + TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"), + TenantIndexParts::Present(timelines) => { + for (timeline_id, index_part) in timelines { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = + RemoteTimeline::from_index_part(&timeline_path, index_part) + .context("Failed to restore remote timeline data from index part")?; + + entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, remote_timeline); + } + }, + } + } + + Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries })))) + } + + pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> { + self.0.read().await + } + + pub async fn write(&self) -> tokio::sync::RwLockWriteGuard<'_, RemoteTimelineIndex> { + self.0.write().await + } +} + +impl Clone for RemoteIndex { + fn clone(&self) -> Self { + Self(Arc::clone(&self.0)) + } +} + +impl RemoteTimelineIndex { + pub fn timeline_entry( + &self, + TenantTimelineId { + tenant_id, + timeline_id, + }: &TenantTimelineId, + ) -> Option<&RemoteTimeline> { + self.entries.get(tenant_id)?.get(timeline_id) + } + + pub fn timeline_entry_mut( + &mut self, + TenantTimelineId { + tenant_id, + timeline_id, + }: &TenantTimelineId, + ) -> Option<&mut RemoteTimeline> { + self.entries.get_mut(tenant_id)?.get_mut(timeline_id) + } + + pub fn add_timeline_entry( + &mut self, + TenantTimelineId { + tenant_id, + timeline_id, + }: TenantTimelineId, + entry: RemoteTimeline, + ) { + self.entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, entry); + } + + pub fn remove_timeline_entry( + &mut self, + TenantTimelineId { + tenant_id, + timeline_id, + }: TenantTimelineId, + ) -> Option { + self.entries + .entry(tenant_id) + .or_default() + .remove(&timeline_id) + } + + pub fn tenant_entry(&self, tenant_id: &TenantId) -> Option<&TenantEntry> { + self.entries.get(tenant_id) + } + + pub fn tenant_entry_mut(&mut self, tenant_id: &TenantId) -> Option<&mut TenantEntry> { + self.entries.get_mut(tenant_id) + } + + pub fn add_tenant_entry(&mut self, tenant_id: TenantId) -> &mut TenantEntry { + self.entries.entry(tenant_id).or_default() + } + + pub fn remove_tenant_entry(&mut self, tenant_id: &TenantId) -> Option { + self.entries.remove(tenant_id) + } + + pub fn set_awaits_download( + &mut self, + id: &TenantTimelineId, + awaits_download: bool, + ) -> anyhow::Result<()> { + self.timeline_entry_mut(id) + .ok_or_else(|| anyhow!("unknown timeline sync {id}"))? + .awaits_download = awaits_download; + Ok(()) + } +} + +/// Restored index part data about the timeline, stored in the remote index. 
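+///
+/// A rough usage sketch, not a doctest: the path and the file size below are
+/// made up for illustration and are not part of this change.
+///
+/// ```ignore
+/// // Start tracking a timeline that so far only exists remotely.
+/// let mut remote = RemoteTimeline::new(metadata);
+/// remote.awaits_download = true;
+///
+/// // Register a layer file together with the metadata known about it.
+/// let layer = timeline_path.join("layer_1");
+/// remote.add_timeline_layers([(layer.clone(), LayerFileMetadata::new(1024))]);
+/// assert!(remote.stored_files().contains_key(&layer));
+/// ```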
+#[derive(Debug, Clone)] +pub struct RemoteTimeline { + timeline_layers: HashMap, + missing_layers: HashMap, + + pub metadata: TimelineMetadata, + pub awaits_download: bool, +} + +impl RemoteTimeline { + pub fn new(metadata: TimelineMetadata) -> Self { + Self { + timeline_layers: HashMap::default(), + missing_layers: HashMap::default(), + metadata, + awaits_download: false, + } + } + + pub fn add_timeline_layers( + &mut self, + new_layers: impl IntoIterator, + ) { + self.timeline_layers.extend(new_layers); + } + + pub fn add_upload_failures( + &mut self, + upload_failures: impl IntoIterator, + ) { + self.missing_layers.extend(upload_failures); + } + + pub fn remove_layers(&mut self, layers_to_remove: &HashSet) { + self.timeline_layers + .retain(|layer, _| !layers_to_remove.contains(layer)); + self.missing_layers + .retain(|layer, _| !layers_to_remove.contains(layer)); + } + + /// Lists all layer files in the given remote timeline. Omits the metadata file. + pub fn stored_files(&self) -> &HashMap { + &self.timeline_layers + } + + /// Combines metadata gathered or verified during downloading needed layer files to metadata on + /// the [`RemoteIndex`], so it can be uploaded later. + pub fn merge_metadata_from_downloaded( + &mut self, + downloaded: &HashMap, + ) { + downloaded.iter().for_each(|(path, metadata)| { + if let Some(upgraded) = self.timeline_layers.get_mut(path) { + upgraded.merge(metadata); + } + }); + } + + pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result { + let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?; + let default_metadata = &IndexLayerMetadata::default(); + + let find_metadata = |key: &RelativePath| -> LayerFileMetadata { + index_part + .layer_metadata + .get(key) + .unwrap_or(default_metadata) + .into() + }; + + Ok(Self { + timeline_layers: index_part + .timeline_layers + .iter() + .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path))) + .collect(), + missing_layers: index_part + .missing_layers + .iter() + .map(|layer_path| (layer_path.as_path(timeline_path), find_metadata(layer_path))) + .collect(), + metadata, + awaits_download: false, + }) + } +} + +/// Metadata gathered for each of the layer files. +/// +/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which +/// might have less or more metadata depending if upgrading or rolling back an upgrade. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[cfg_attr(test, derive(Default))] +pub struct LayerFileMetadata { + file_size: Option, +} + +impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { + fn from(other: &IndexLayerMetadata) -> Self { + LayerFileMetadata { + file_size: other.file_size, + } + } +} + +impl LayerFileMetadata { + pub fn new(file_size: u64) -> Self { + LayerFileMetadata { + file_size: Some(file_size), + } + } + + pub fn file_size(&self) -> Option { + self.file_size + } + + /// Metadata has holes due to version upgrades. This method is called to upgrade self with the + /// other value. + /// + /// This is called on the possibly outdated version. + pub fn merge(&mut self, other: &Self) { + self.file_size = other.file_size.or(self.file_size); + } +} + +/// Part of the remote index, corresponding to a certain timeline. +/// Contains the data about all files in the timeline, present remotely and its metadata. +/// +/// This type needs to be backwards and forwards compatible. When changing the fields, +/// remember to add a test case for the changed version. 
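+///
+/// A minimal sketch of how this type flows through the sync code, assuming a
+/// `remote_timeline: RemoteTimeline` and its local `timeline_path` are at hand:
+///
+/// ```ignore
+/// // Upload path: snapshot the in-memory state into a serializable part.
+/// let part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline)?;
+/// let bytes = serde_json::to_vec(&part)?;
+///
+/// // Download path: parse the stored JSON and rebuild the in-memory state.
+/// let restored: IndexPart = serde_json::from_slice(&bytes)?;
+/// let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, restored)?;
+/// ```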
+#[serde_as] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +pub struct IndexPart { + /// Debugging aid describing the version of this type. + #[serde(default)] + version: usize, + + /// Each of the layers present on remote storage. + /// + /// Additional metadata can might exist in `layer_metadata`. + timeline_layers: HashSet, + + /// Currently is not really used in pageserver, + /// present to manually keep track of the layer files that pageserver might never retrieve. + /// + /// Such "holes" might appear if any upload task was evicted on an error threshold: + /// the this layer will only be rescheduled for upload on pageserver restart. + missing_layers: HashSet, + + /// Per layer file metadata, which can be present for a present or missing layer file. + /// + /// Older versions of `IndexPart` will not have this property or have only a part of metadata + /// that latest version stores. + #[serde(default)] + layer_metadata: HashMap, + + #[serde_as(as = "DisplayFromStr")] + disk_consistent_lsn: Lsn, + metadata_bytes: Vec, +} + +impl IndexPart { + /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be + /// used to understand later versions. + /// + /// Version is currently informative only. + const LATEST_VERSION: usize = 1; + pub const FILE_NAME: &'static str = "index_part.json"; + + #[cfg(test)] + pub fn new( + timeline_layers: HashSet, + missing_layers: HashSet, + disk_consistent_lsn: Lsn, + metadata_bytes: Vec, + ) -> Self { + Self { + version: Self::LATEST_VERSION, + timeline_layers, + missing_layers, + layer_metadata: HashMap::default(), + disk_consistent_lsn, + metadata_bytes, + } + } + + pub fn missing_files(&self) -> &HashSet { + &self.missing_layers + } + + pub fn from_remote_timeline( + timeline_path: &Path, + remote_timeline: RemoteTimeline, + ) -> anyhow::Result { + let metadata_bytes = remote_timeline.metadata.to_bytes()?; + + let mut layer_metadata = HashMap::new(); + + let mut missing_layers = HashSet::new(); + + separate_paths_and_metadata( + timeline_path, + &remote_timeline.missing_layers, + &mut missing_layers, + &mut layer_metadata, + ) + .context("Failed to convert missing layers' paths to relative ones")?; + + let mut timeline_layers = HashSet::new(); + + separate_paths_and_metadata( + timeline_path, + &remote_timeline.timeline_layers, + &mut timeline_layers, + &mut layer_metadata, + ) + .context("Failed to convert timeline layers' paths to relative ones")?; + + Ok(Self { + version: Self::LATEST_VERSION, + timeline_layers, + missing_layers, + layer_metadata, + disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(), + metadata_bytes, + }) + } +} + +/// Serialized form of [`LayerFileMetadata`]. 
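+///
+/// A small sketch of the conversion and merge behaviour this type exists for;
+/// the file size is an arbitrary example value:
+///
+/// ```ignore
+/// // An index written by an older pageserver version may carry no file size.
+/// let mut restored = LayerFileMetadata::from(&IndexLayerMetadata::default());
+/// // Merging in freshly gathered metadata fills the hole.
+/// restored.merge(&LayerFileMetadata::new(1024));
+/// assert_eq!(restored.file_size(), Some(1024));
+/// ```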
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub struct IndexLayerMetadata { + file_size: Option, +} + +impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { + fn from(other: &'_ LayerFileMetadata) -> Self { + IndexLayerMetadata { + file_size: other.file_size, + } + } +} + +fn separate_paths_and_metadata( + timeline_path: &Path, + input: &HashMap, + output: &mut HashSet, + layer_metadata: &mut HashMap, +) -> anyhow::Result<()> { + for (path, metadata) in input { + let rel_path = RelativePath::new(timeline_path, path)?; + let metadata = IndexLayerMetadata::from(metadata); + + layer_metadata.insert(rel_path.clone(), metadata); + output.insert(rel_path); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeSet; + + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::DEFAULT_PG_VERSION; + + #[test] + fn index_part_conversion() { + let harness = TenantHarness::create("index_part_conversion").unwrap(); + let timeline_path = harness.timeline_path(&TIMELINE_ID); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); + let remote_timeline = RemoteTimeline { + timeline_layers: HashMap::from([ + (timeline_path.join("layer_1"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), + ]), + missing_layers: HashMap::from([ + (timeline_path.join("missing_1"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), + ]), + metadata: metadata.clone(), + awaits_download: false, + }; + + let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone()) + .expect("Correct remote timeline should be convertible to index part"); + + assert_eq!( + index_part.timeline_layers.iter().collect::>(), + BTreeSet::from([ + &RelativePath("layer_1".to_string()), + &RelativePath("layer_2".to_string()) + ]), + "Index part should have all remote timeline layers after the conversion" + ); + assert_eq!( + index_part.missing_layers.iter().collect::>(), + BTreeSet::from([ + &RelativePath("missing_1".to_string()), + &RelativePath("missing_2".to_string()) + ]), + "Index part should have all missing remote timeline layers after the conversion" + ); + assert_eq!( + index_part.disk_consistent_lsn, + metadata.disk_consistent_lsn(), + "Index part should have disk consistent lsn from the timeline" + ); + assert_eq!( + index_part.metadata_bytes, + metadata + .to_bytes() + .expect("Failed to serialize correct metadata into bytes"), + "Index part should have all missing remote timeline layers after the conversion" + ); + + let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) + .expect("Correct index part should be convertible to remote timeline"); + + let original_metadata = &remote_timeline.metadata; + let restored_metadata = &restored_timeline.metadata; + // we have to compare the metadata this way, since its header is different after creation and restoration, + // but that is now consireded ok. 
+ assert_eq!( + original_metadata.disk_consistent_lsn(), + restored_metadata.disk_consistent_lsn(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + assert_eq!( + original_metadata.prev_record_lsn(), + restored_metadata.prev_record_lsn(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + assert_eq!( + original_metadata.ancestor_timeline(), + restored_metadata.ancestor_timeline(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + assert_eq!( + original_metadata.ancestor_lsn(), + restored_metadata.ancestor_lsn(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + assert_eq!( + original_metadata.latest_gc_cutoff_lsn(), + restored_metadata.latest_gc_cutoff_lsn(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + assert_eq!( + original_metadata.initdb_lsn(), + restored_metadata.initdb_lsn(), + "remote timeline -> index part -> remote timeline conversion should not alter metadata" + ); + + assert_eq!( + remote_timeline.awaits_download, restored_timeline.awaits_download, + "remote timeline -> index part -> remote timeline conversion should not loose download flag" + ); + + assert_eq!( + remote_timeline + .timeline_layers + .into_iter() + .collect::>(), + restored_timeline + .timeline_layers + .into_iter() + .collect::>(), + "remote timeline -> index part -> remote timeline conversion should not loose layer data" + ); + assert_eq!( + remote_timeline + .missing_layers + .into_iter() + .collect::>(), + restored_timeline + .missing_layers + .into_iter() + .collect::>(), + "remote timeline -> index part -> remote timeline conversion should not loose missing file data" + ); + } + + #[test] + fn index_part_conversion_negatives() { + let harness = TenantHarness::create("index_part_conversion_negatives").unwrap(); + let timeline_path = harness.timeline_path(&TIMELINE_ID); + let metadata = TimelineMetadata::new( + Lsn(5).align(), + Some(Lsn(4)), + None, + Lsn(3), + Lsn(2), + Lsn(1), + DEFAULT_PG_VERSION, + ); + + let conversion_result = IndexPart::from_remote_timeline( + &timeline_path, + RemoteTimeline { + timeline_layers: HashMap::from([ + (PathBuf::from("bad_path"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), + ]), + missing_layers: HashMap::from([ + (timeline_path.join("missing_1"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), + ]), + metadata: metadata.clone(), + awaits_download: false, + }, + ); + assert!(conversion_result.is_err(), "Should not be able to convert metadata with layer paths that are not in the timeline directory"); + + let conversion_result = IndexPart::from_remote_timeline( + &timeline_path, + RemoteTimeline { + timeline_layers: HashMap::from([ + (timeline_path.join("layer_1"), LayerFileMetadata::new(1)), + (timeline_path.join("layer_2"), LayerFileMetadata::new(2)), + ]), + missing_layers: HashMap::from([ + (PathBuf::from("bad_path"), LayerFileMetadata::new(3)), + (timeline_path.join("missing_2"), LayerFileMetadata::new(4)), + ]), + metadata, + awaits_download: false, + }, + ); + assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory"); + } + + #[test] + fn v0_indexpart_is_parsed() { + let example = r#"{ + 
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["not_a_real_layer_but_adding_coverage"], + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + version: 0, + timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + layer_metadata: HashMap::default(), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v1_indexpart_is_parsed() { + let example = r#"{ + "version":1, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["not_a_real_layer_but_adding_coverage"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 1, + timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), + missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + layer_metadata: HashMap::from([ + (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + file_size: Some(25600000), + }), + (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. + file_size: Some(9007199254741001), + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } +} diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs new file mode 100644 index 0000000000..f91105052b --- /dev/null +++ b/pageserver/src/storage_sync/upload.rs @@ -0,0 +1,479 @@ +//! 
Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints. + +use std::{fmt::Debug, path::PathBuf}; + +use anyhow::Context; +use futures::stream::{FuturesUnordered, StreamExt}; +use remote_storage::GenericRemoteStorage; +use tokio::fs; +use tracing::{debug, error, info, warn}; + +use utils::id::TenantTimelineId; + +use super::{ + index::{IndexPart, RemoteTimeline}, + LayersUpload, SyncData, SyncQueue, +}; +use crate::metrics::NO_LAYERS_UPLOAD; +use crate::{config::PageServerConf, storage_sync::SyncTask}; + +/// Serializes and uploads the given index part data to the remote storage. +pub(super) async fn upload_index_part( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + sync_id: TenantTimelineId, + index_part: IndexPart, +) -> anyhow::Result<()> { + let index_part_bytes = serde_json::to_vec(&index_part) + .context("Failed to serialize index part file into bytes")?; + let index_part_size = index_part_bytes.len(); + let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes)); + + let index_part_path = conf + .metadata_path(sync_id.timeline_id, sync_id.tenant_id) + .with_file_name(IndexPart::FILE_NAME); + storage + .upload_storage_object( + Box::new(index_part_bytes), + index_part_size, + &index_part_path, + ) + .await + .with_context(|| format!("Failed to upload index part for '{sync_id}'")) +} + +/// Timeline upload result, with extra data, needed for uploading. +#[derive(Debug)] +pub(super) enum UploadedTimeline { + /// Upload failed due to some error, the upload task is rescheduled for another retry. + FailedAndRescheduled(anyhow::Error), + /// No issues happened during the upload, all task files were put into the remote storage. + Successful(SyncData), +} + +/// Attempts to upload given layer files. +/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. +/// +/// On an error, bumps the retries count and reschedules the entire task. 
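+///
+/// A rough calling sketch; the surrounding sync loop supplies the storage
+/// handle, the queue, the remote index entry and the retry counter, so the
+/// names below are illustrative only:
+///
+/// ```ignore
+/// let task_data = SyncData::new(retries, layers_upload);
+/// match upload_timeline_layers(&storage, &sync_queue, remote_timeline, sync_id, task_data).await {
+///     UploadedTimeline::Successful(upload_data) => {
+///         // Every layer is now in remote storage; update the remote index next.
+///     }
+///     UploadedTimeline::FailedAndRescheduled(error) => {
+///         // The task has already been pushed back onto the queue with retries + 1.
+///     }
+/// }
+/// ```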
+pub(super) async fn upload_timeline_layers<'a>( + storage: &'a GenericRemoteStorage, + sync_queue: &SyncQueue, + remote_timeline: Option<&'a RemoteTimeline>, + sync_id: TenantTimelineId, + mut upload_data: SyncData, +) -> UploadedTimeline { + let upload = &mut upload_data.data; + let new_upload_lsn = upload + .metadata + .as_ref() + .map(|meta| meta.disk_consistent_lsn()); + + let already_uploaded_layers = remote_timeline + .map(|timeline| { + timeline + .stored_files() + .keys() + .cloned() + .collect::>() + }) + .unwrap_or_default(); + + let layers_to_upload = upload + .layers_to_upload + .iter() + .filter_map(|(k, v)| { + if !already_uploaded_layers.contains(k) { + Some((k.to_owned(), v.to_owned())) + } else { + None + } + }) + .collect::>(); + + if layers_to_upload.is_empty() { + debug!("No layers to upload after filtering, aborting"); + NO_LAYERS_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); + return UploadedTimeline::Successful(upload_data); + } + + debug!("Layers to upload: {layers_to_upload:?}"); + info!( + "Uploading {} timeline layers, new lsn: {new_upload_lsn:?}", + layers_to_upload.len(), + ); + + let mut upload_tasks = layers_to_upload + .into_iter() + .map(|(source_path, known_metadata)| async move { + let source_file = match fs::File::open(&source_path).await.with_context(|| { + format!( + "Failed to upen a source file for layer '{}'", + source_path.display() + ) + }) { + Ok(file) => file, + Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)), + }; + + let fs_size = source_file + .metadata() + .await + .with_context(|| { + format!( + "Failed to get the source file metadata for layer '{}'", + source_path.display() + ) + }) + .map_err(UploadError::Other)? + .len(); + + // FIXME: this looks bad + if let Some(metadata_size) = known_metadata.file_size() { + if metadata_size != fs_size { + return Err(UploadError::Other(anyhow::anyhow!( + "File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}" + ))); + } + } else { + // this is a silly state we would like to avoid + } + + let fs_size = usize::try_from(fs_size).with_context(|| format!("File {source_path:?} size {fs_size} could not be converted to usize")) + .map_err(UploadError::Other)?; + + match storage + .upload_storage_object(Box::new(source_file), fs_size, &source_path) + .await + .with_context(|| format!("Failed to upload layer file for {sync_id}")) + { + Ok(()) => Ok(source_path), + Err(e) => Err(UploadError::MissingLocalFile(source_path, e)), + } + }) + .collect::>(); + + let mut errors = Vec::new(); + while let Some(upload_result) = upload_tasks.next().await { + match upload_result { + Ok(uploaded_path) => { + let metadata = upload + .layers_to_upload + .remove(&uploaded_path) + .expect("metadata should always exist, assuming no double uploads"); + upload.uploaded_layers.insert(uploaded_path, metadata); + } + Err(e) => match e { + UploadError::Other(e) => { + error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); + errors.push(format!("{e:#}")); + } + UploadError::MissingLocalFile(source_path, e) => { + if source_path.exists() { + error!("Failed to upload a layer for timeline {sync_id}: {e:?}"); + errors.push(format!("{e:#}")); + } else { + // We have run the upload sync task, but the file we wanted to upload is gone. 
+ // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to + // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and + // run compaction/gc tasks, removing redundant files from disk. + // It's not good to pause GC/compaction because of those and we would rather skip such uploads. + // + // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance). + // We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet. + // This will create "missing" layers and make data inconsistent. + // Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result. + upload.layers_to_upload.remove(&source_path); + warn!( + "Missing locally a layer file {} scheduled for upload, skipping", + source_path.display() + ); + } + } + }, + } + } + + if errors.is_empty() { + info!("Successfully uploaded all layers"); + UploadedTimeline::Successful(upload_data) + } else { + debug!("Reenqueuing failed upload task for timeline {sync_id}"); + upload_data.retries += 1; + sync_queue.push(sync_id, SyncTask::Upload(upload_data)); + UploadedTimeline::FailedAndRescheduled(anyhow::anyhow!( + "Errors appeared during layer uploads: {:?}", + errors + )) + } +} + +enum UploadError { + MissingLocalFile(PathBuf, anyhow::Error), + Other(anyhow::Error), +} + +#[cfg(test)] +mod tests { + use std::{ + collections::{BTreeSet, HashSet}, + num::NonZeroUsize, + }; + + use remote_storage::{LocalFs, RemoteStorage}; + use tempfile::tempdir; + use utils::lsn::Lsn; + + use crate::{ + storage_sync::{ + index::RelativePath, + test_utils::{create_local_timeline, dummy_metadata}, + }, + tenant::harness::{TenantHarness, TIMELINE_ID}, + }; + + use super::{upload_index_part, *}; + + #[tokio::test] + async fn regular_layer_upload() -> anyhow::Result<()> { + let harness = TenantHarness::create("regular_layer_upload")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let layer_files = ["a", "b"]; + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_path_buf(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); + let current_retries = 3; + let metadata = dummy_metadata(Lsn(0x30)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let mut timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; + timeline_upload.metadata = None; + + assert!( + local_storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + + let upload_result = upload_timeline_layers( + &storage, + &sync_queue, + None, + sync_id, + SyncData::new(current_retries, timeline_upload.clone()), + ) + .await; + + let upload_data = match upload_result { + UploadedTimeline::Successful(upload_data) => upload_data, + wrong_result => { + panic!("Expected a successful upload for timeline, but got: {wrong_result:?}") + } + }; + + assert_eq!( + current_retries, upload_data.retries, + "On successful upload, retries are not expected to change" + ); + let upload = &upload_data.data; + assert!( + upload.layers_to_upload.is_empty(), + "Successful upload should have no layers left to upload" + ); + assert_eq!( + upload + .uploaded_layers + .keys() + .cloned() + .collect::>(), + 
layer_files + .iter() + .map(|layer_file| local_timeline_path.join(layer_file)) + .collect(), + "Successful upload should have all layers uploaded" + ); + assert_eq!( + upload.metadata, None, + "Successful upload without metadata should not have it returned either" + ); + + let storage_files = local_storage.list().await?; + assert_eq!( + storage_files.len(), + layer_files.len(), + "All layers should be uploaded" + ); + assert_eq!( + storage_files + .into_iter() + .map(|storage_path| local_storage.local_path(&storage_path)) + .collect::>>()?, + layer_files + .into_iter() + .map(|file| local_timeline_path.join(file)) + .collect(), + "Uploaded files should match with the local ones" + ); + + Ok(()) + } + + // Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario. + #[tokio::test] + async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> { + let harness = TenantHarness::create("layer_upload_after_local_fs_update")?; + let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap()); + let sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let layer_files = ["a1", "b1"]; + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); + let current_retries = 5; + let metadata = dummy_metadata(Lsn(0x40)); + + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + let layers_to_upload = { + let mut layers = layer_files.to_vec(); + layers.push("layer_to_remove"); + layers + }; + let timeline_upload = + create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone()) + .await?; + assert!( + local_storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + + fs::remove_file(local_timeline_path.join("layer_to_remove")).await?; + + let upload_result = upload_timeline_layers( + &storage, + &sync_queue, + None, + sync_id, + SyncData::new(current_retries, timeline_upload.clone()), + ) + .await; + + let upload_data = match upload_result { + UploadedTimeline::Successful(upload_data) => upload_data, + wrong_result => panic!( + "Expected a successful after local fs upload for timeline, but got: {wrong_result:?}" + ), + }; + + assert_eq!( + current_retries, upload_data.retries, + "On successful upload, retries are not expected to change" + ); + let upload = &upload_data.data; + assert!( + upload.layers_to_upload.is_empty(), + "Successful upload should have no layers left to upload, even those that were removed from the local fs" + ); + assert_eq!( + upload + .uploaded_layers + .keys() + .cloned() + .collect::>(), + layer_files + .iter() + .map(|layer_file| local_timeline_path.join(layer_file)) + .collect(), + "Successful upload should have all layers uploaded" + ); + assert_eq!( + upload.metadata, + Some(metadata), + "Successful upload should not change its metadata" + ); + + let storage_files = local_storage.list().await?; + assert_eq!( + storage_files.len(), + layer_files.len(), + "All layers should be uploaded" + ); + assert_eq!( + storage_files + .into_iter() + .map(|storage_path| local_storage.local_path(&storage_path)) + .collect::>>()?, + layer_files + .into_iter() + .map(|file| local_timeline_path.join(file)) + .collect(), + "Uploaded files should match with the local ones" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_upload_index_part() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_upload_index_part")?; + let 
sync_id = TenantTimelineId::new(harness.tenant_id, TIMELINE_ID); + + let storage = GenericRemoteStorage::new(LocalFs::new( + tempdir()?.path().to_owned(), + harness.conf.workdir.clone(), + )?); + let local_storage = storage.as_local().unwrap(); + let metadata = dummy_metadata(Lsn(0x40)); + let local_timeline_path = harness.timeline_path(&TIMELINE_ID); + + let index_part = IndexPart::new( + HashSet::from([ + RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?, + RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?, + ]), + HashSet::from([RelativePath::new( + &local_timeline_path, + local_timeline_path.join("three"), + )?]), + metadata.disk_consistent_lsn(), + metadata.to_bytes()?, + ); + + assert!( + local_storage.list().await?.is_empty(), + "Storage should be empty before any uploads are made" + ); + upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?; + + let storage_files = local_storage.list().await?; + assert_eq!( + storage_files.len(), + 1, + "Should have only the index part file uploaded" + ); + + let index_part_path = storage_files.first().unwrap(); + assert_eq!( + index_part_path.object_name(), + Some(IndexPart::FILE_NAME), + "Remote index part should have the correct name" + ); + let remote_index_part: IndexPart = serde_json::from_slice( + &fs::read(local_storage.resolve_in_storage(index_part_path)?).await?, + )?; + assert_eq!( + index_part, remote_index_part, + "Remote index part should match the local one" + ); + + Ok(()) + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs new file mode 100644 index 0000000000..dad6e0039d --- /dev/null +++ b/pageserver/src/task_mgr.rs @@ -0,0 +1,463 @@ +//! +//! This module provides centralized handling of tokio tasks in the Page Server. +//! +//! We provide a few basic facilities: +//! - A global registry of tasks that lists what kind of tasks they are, and +//! which tenant or timeline they are working on +//! +//! - The ability to request a task to shut down. +//! +//! +//! # How it works? +//! +//! There is a global hashmap of all the tasks (`TASKS`). Whenever a new +//! task is spawned, a PageServerTask entry is added there, and when a +//! task dies, it removes itself from the hashmap. If you want to kill a +//! task, you can scan the hashmap to find it. +//! +//! # Task shutdown +//! +//! To kill a task, we rely on co-operation from the victim. Each task is +//! expected to periodically call the `is_shutdown_requested()` function, and +//! if it returns true, exit gracefully. In addition to that, when waiting for +//! the network or other long-running operation, you can use +//! `shutdown_watcher()` function to get a Future that will become ready if +//! the current task has been requested to shut down. You can use that with +//! Tokio select!(). +//! +//! +//! TODO: This would be a good place to also handle panics in a somewhat sane way. +//! Depending on what task panics, we might want to kill the whole server, or +//! only a single tenant or timeline. +//! + +// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. +// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. 
+#![allow(clippy::declare_interior_mutable_const)] + +use std::collections::HashMap; +use std::future::Future; +use std::panic::AssertUnwindSafe; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; + +use futures::FutureExt; +use tokio::runtime::Runtime; +use tokio::sync::watch; +use tokio::task::JoinHandle; +use tokio::task_local; + +use tracing::{debug, error, info, warn}; + +use once_cell::sync::Lazy; + +use utils::id::{TenantId, TimelineId}; + +use crate::shutdown_pageserver; + +// +// There are four runtimes: +// +// Compute request runtime +// - used to handle connections from compute nodes. Any tasks related to satisfying +// GetPage requests, base backups, import, and other such compute node operations +// are handled by the Compute request runtime +// - page_service.rs +// - this includes layer downloads from remote storage, if a layer is needed to +// satisfy a GetPage request +// +// Management request runtime +// - used to handle HTTP API requests +// +// WAL receiver runtime: +// - used to handle WAL receiver connections. +// - and to receiver updates from etcd +// +// Background runtime +// - layer flushing +// - garbage collection +// - compaction +// - remote storage uploads +// - initial tenant loading +// +// Everything runs in a tokio task. If you spawn new tasks, spawn it using the correct +// runtime. +// +// There might be situations when one task needs to wait for a task running in another +// Runtime to finish. For example, if a background operation needs a layer from remote +// storage, it will start to download it. If a background operation needs a remote layer, +// and the download was already initiated by a GetPage request, the background task +// will wait for the download - running in the Page server runtime - to finish. +// Another example: the initial tenant loading tasks are launched in the background ops +// runtime. If a GetPage request comes in before the load of a tenant has finished, the +// GetPage request will wait for the tenant load to finish. +// +// The core Timeline code is synchronous, and uses a bunch of std Mutexes and RWLocks to +// protect data structures. Let's keep it that way. Synchronous code is easier to debug +// and analyze, and there's a lot of hairy, low-level, performance critical code there. +// +// It's nice to have different runtimes, so that you can quickly eyeball how much CPU +// time each class of operations is taking, with 'top -H' or similar. +// +// It's also good to avoid hogging all threads that would be needed to process +// other operations, if the upload tasks e.g. get blocked on locks. It shouldn't +// happen, but still. 
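+//
+// As a rough illustration only (tenant id and task body made up), a background
+// job would be spawned onto its dedicated runtime along these lines:
+//
+//     task_mgr::spawn(
+//         BACKGROUND_RUNTIME.handle(),
+//         TaskKind::Compaction,
+//         Some(tenant_id),
+//         None,
+//         "compaction loop",
+//         false,
+//         async move {
+//             // ... do the work, periodically checking is_shutdown_requested() ...
+//             Ok(())
+//         },
+//     );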
+// +pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("compute request worker") + .enable_all() + .build() + .expect("Failed to create compute request runtime") +}); + +pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("mgmt request worker") + .enable_all() + .build() + .expect("Failed to create mgmt request runtime") +}); + +pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("walreceiver worker") + .enable_all() + .build() + .expect("Failed to create walreceiver runtime") +}); + +pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name("background op worker") + .enable_all() + .build() + .expect("Failed to create background op runtime") +}); + +pub struct PageserverTaskId(u64); + +/// Each task that we track is associated with a "task ID". It's just an +/// increasing number that we assign. Note that it is different from tokio::task::Id. +static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// Global registry of tasks +static TASKS: Lazy>>> = + Lazy::new(|| Mutex::new(HashMap::new())); + +task_local! { + // There is a Tokio watch channel for each task, which can be used to signal the + // task that it needs to shut down. This task local variable holds the receiving + // end of the channel. The sender is kept in the global registry, so that anyone + // can send the signal to request task shutdown. + static SHUTDOWN_RX: watch::Receiver; + + // Each task holds reference to its own PageServerTask here. + static CURRENT_TASK: Arc; +} + +/// +/// There are many kinds of tasks in the system. Some are associated with a particular +/// tenant or timeline, while others are global. +/// +/// Note that we don't try to limit how many task of a certain kind can be running +/// at the same time. +/// +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TaskKind { + // libpq listener task. It just accepts connection and spawns a + // PageRequestHandler task for each connection. + LibpqEndpointListener, + + // HTTP endpoint listener. + HttpEndpointListener, + + // Task that handles a single connection. A PageRequestHandler task + // starts detached from any particular tenant or timeline, but it can be + // associated with one later, after receiving a command from the client. + PageRequestHandler, + + // Manages the WAL receiver connection for one timeline. It subscribes to + // events from etcd, decides which safekeeper to connect to. It spawns a + // separate WalReceiverConnection task to handle each connection. + WalReceiverManager, + + // Handles a connection to a safekeeper, to stream WAL to a timeline. + WalReceiverConnection, + + // Garbage collection worker. One per tenant + GarbageCollector, + + // Compaction. One per tenant. + Compaction, + + // Initial logical size calculation + InitialLogicalSizeCalculation, + + // Task that flushes frozen in-memory layers to disk + LayerFlushTask, + + // Task that manages the remote upload queue + StorageSync, + + // task that handles the initial downloading of all tenants + InitialLoad, + + // task that handles attaching a tenant + Attach, +} + +#[derive(Default)] +struct MutableTaskState { + /// Tenant and timeline that this task is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the task to exit. 
It can be None, if the + /// the task has already exited. + join_handle: Option>, +} + +struct PageServerTask { + #[allow(dead_code)] // unused currently + task_id: PageserverTaskId, + + kind: TaskKind, + + name: String, + + // To request task shutdown, send 'true' to the channel to notify the task. + shutdown_tx: watch::Sender, + + mutable: Mutex, +} + +/// Launch a new task +/// Note: if shutdown_process_on_error is set to true failure +/// of the task will lead to shutdown of entire process +pub fn spawn( + runtime: &tokio::runtime::Handle, + kind: TaskKind, + tenant_id: Option, + timeline_id: Option, + name: &str, + shutdown_process_on_error: bool, + future: F, +) -> PageserverTaskId +where + F: Future> + Send + 'static, +{ + let (shutdown_tx, shutdown_rx) = watch::channel(false); + let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); + let task = Arc::new(PageServerTask { + task_id: PageserverTaskId(task_id), + kind, + name: name.to_string(), + shutdown_tx, + mutable: Mutex::new(MutableTaskState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); + + TASKS.lock().unwrap().insert(task_id, Arc::clone(&task)); + + let mut task_mut = task.mutable.lock().unwrap(); + + let task_name = name.to_string(); + let task_cloned = Arc::clone(&task); + let join_handle = runtime.spawn(task_wrapper( + task_name, + task_id, + task_cloned, + shutdown_rx, + shutdown_process_on_error, + future, + )); + task_mut.join_handle = Some(join_handle); + drop(task_mut); + + // The task is now running. Nothing more to do here + PageserverTaskId(task_id) +} + +/// This wrapper function runs in a newly-spawned task. It initializes the +/// task-local variables and calls the payload function. +async fn task_wrapper( + task_name: String, + task_id: u64, + task: Arc, + shutdown_rx: watch::Receiver, + shutdown_process_on_error: bool, + future: F, +) where + F: Future> + Send + 'static, +{ + debug!("Starting task '{}'", task_name); + + let result = SHUTDOWN_RX + .scope( + shutdown_rx, + CURRENT_TASK.scope(task, { + // We use AssertUnwindSafe here so that the payload function + // doesn't need to be UnwindSafe. We don't do anything after the + // unwinding that would expose us to unwind-unsafe behavior. + AssertUnwindSafe(future).catch_unwind() + }), + ) + .await; + task_finish(result, task_name, task_id, shutdown_process_on_error).await; +} + +async fn task_finish( + result: std::result::Result< + anyhow::Result<()>, + std::boxed::Box, + >, + task_name: String, + task_id: u64, + shutdown_process_on_error: bool, +) { + // Remove our entry from the global hashmap. 
+ let task = TASKS + .lock() + .unwrap() + .remove(&task_id) + .expect("no task in registry"); + + let mut shutdown_process = false; + { + let task_mut = task.mutable.lock().unwrap(); + + match result { + Ok(Ok(())) => { + debug!("Task '{}' exited normally", task_name); + } + Ok(Err(err)) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + Err(err) => { + if shutdown_process_on_error { + error!( + "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + shutdown_process = true; + } else { + error!( + "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task_mut.tenant_id, task_mut.timeline_id, err + ); + } + } + } + } + + if shutdown_process { + shutdown_pageserver(1).await; + } +} + +// expected to be called from the task of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_TASK.with(|ct| { + let mut task_mut = ct.mutable.lock().unwrap(); + task_mut.tenant_id = tenant_id; + task_mut.timeline_id = timeline_id; + }); +} + +/// Is there a task running that matches the criteria + +/// Signal and wait for tasks to shut down. +/// +/// +/// The arguments are used to select the tasks to kill. Any None arguments are +/// ignored. For example, to shut down all WalReceiver tasks: +/// +/// shutdown_tasks(Some(TaskKind::WalReceiver), None, None) +/// +/// Or to shut down all tasks for given timeline: +/// +/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id)) +/// +pub async fn shutdown_tasks( + kind: Option, + tenant_id: Option, + timeline_id: Option, +) { + let mut victim_tasks = Vec::new(); + + { + let tasks = TASKS.lock().unwrap(); + for task in tasks.values() { + let task_mut = task.mutable.lock().unwrap(); + if (kind.is_none() || Some(task.kind) == kind) + && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) + { + let _ = task.shutdown_tx.send_replace(true); + victim_tasks.push(Arc::clone(task)); + } + } + } + + for task in victim_tasks { + let join_handle = { + let mut task_mut = task.mutable.lock().unwrap(); + info!("waiting for {} to shut down", task.name); + let join_handle = task_mut.join_handle.take(); + drop(task_mut); + join_handle + }; + if let Some(join_handle) = join_handle { + let _ = join_handle.await; + } else { + // Possibly one of: + // * The task had not even fully started yet. + // * It was shut down concurrently and already exited + } + } +} + +pub fn current_task_kind() -> Option { + CURRENT_TASK.try_with(|ct| ct.kind).ok() +} + +/// A Future that can be used to check if the current task has been requested to +/// shut down. +pub async fn shutdown_watcher() { + let mut shutdown_rx = SHUTDOWN_RX + .try_with(|rx| rx.clone()) + .expect("shutdown_requested() called in an unexpected task or thread"); + + while !*shutdown_rx.borrow() { + if shutdown_rx.changed().await.is_err() { + break; + } + } +} + +/// Has the current task been requested to shut down? 
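+///
+/// A minimal sketch of the cooperative-shutdown pattern tasks are expected to
+/// follow; `do_one_unit_of_work()` is a stand-in for the task's real work:
+///
+/// ```ignore
+/// loop {
+///     if is_shutdown_requested() {
+///         break; // exit gracefully
+///     }
+///     tokio::select! {
+///         _ = shutdown_watcher() => break,
+///         _ = do_one_unit_of_work() => {}
+///     }
+/// }
+/// ```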
+pub fn is_shutdown_requested() -> bool { + if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { + *shutdown_rx.borrow() + } else { + if !cfg!(test) { + warn!("is_shutdown_requested() called in an unexpected task or thread"); + } + false + } +} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs new file mode 100644 index 0000000000..11845ebb48 --- /dev/null +++ b/pageserver/src/tenant.rs @@ -0,0 +1,2530 @@ +//! +//! Timeline repository implementation that keeps old data in files on disk, and +//! the recent changes in memory. See tenant/*_layer.rs files. +//! The functions here are responsible for locating the correct layer for the +//! get/put call, walking back the timeline branching history as needed. +//! +//! The files are stored in the .neon/tenants//timelines/ +//! directory. See docs/pageserver-storage.md for how the files are managed. +//! In addition to the layer files, there is a metadata file in the same +//! directory that contains information about the timeline, in particular its +//! parent timeline, and the last LSN that has been written to disk. +//! + +use anyhow::{bail, Context}; +use bytes::Bytes; +use futures::Stream; +use pageserver_api::models::TimelineState; +use tokio::sync::watch; +use tokio_util::io::StreamReader; +use tokio_util::io::SyncIoBridge; +use tracing::*; +use utils::crashsafe::path_with_suffix_extension; + +use std::cmp::min; +use std::collections::hash_map::Entry; +use std::collections::BTreeSet; +use std::collections::HashMap; +use std::fs; +use std::fs::File; +use std::fs::OpenOptions; +use std::io; +use std::io::Write; +use std::ops::Bound::Included; +use std::path::Path; +use std::path::PathBuf; +use std::pin::Pin; +use std::process::Command; +use std::process::Stdio; +use std::sync::Arc; +use std::sync::MutexGuard; +use std::sync::{Mutex, RwLock}; +use std::time::{Duration, Instant}; + +use self::metadata::TimelineMetadata; +use crate::config::PageServerConf; +use crate::import_datadir; +use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::repository::GcResult; +use crate::storage_sync::index::RemoteIndex; +use crate::task_mgr; +use crate::tenant_config::TenantConfOpt; +use crate::virtual_file::VirtualFile; +use crate::walredo::WalRedoManager; +use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +pub use pageserver_api::models::TenantState; + +use toml_edit; +use utils::{ + crashsafe, + id::{TenantId, TimelineId}, + lsn::{Lsn, RecordLsn}, +}; + +mod blob_io; +pub mod block_io; +mod delta_layer; +mod disk_btree; +pub(crate) mod ephemeral_file; +pub mod filename; +mod image_layer; +mod inmemory_layer; +pub mod layer_map; + +pub mod metadata; +mod par_fsync; +pub mod storage_layer; + +mod timeline; + +pub mod size; + +use storage_layer::Layer; + +pub use timeline::Timeline; + +// re-export this function so that page_cache.rs can use it. +pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; + +// re-export for use in storage_sync.rs +pub use crate::tenant::metadata::save_metadata; + +// re-export for use in walreceiver +pub use crate::tenant::timeline::WalReceiverInfo; + +/// Parts of the `.neon/tenants//timelines/` directory prefix. +pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; + +/// +/// Tenant consists of multiple timelines. Keep them in a hash table. +/// +pub struct Tenant { + // Global pageserver config parameters + pub conf: &'static PageServerConf, + + state: watch::Sender, + + // Overridden tenant-specific config parameters. 
+ // We keep TenantConfOpt sturct here to preserve the information + // about parameters that are not set. + // This is necessary to allow global config updates. + tenant_conf: Arc>, + + tenant_id: TenantId, + timelines: Mutex>>, + // This mutex prevents creation of new timelines during GC. + // Adding yet another mutex (in addition to `timelines`) is needed because holding + // `timelines` mutex during all GC iteration (especially with enforced checkpoint) + // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations + // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn + // timeout... + gc_cs: Mutex<()>, + walredo_mgr: Arc, + + // provides access to timeline data sitting in the remote storage + // supposed to be used for retrieval of remote consistent lsn in walreceiver + remote_index: RemoteIndex, + + /// Makes every timeline to backup their files to remote storage. + upload_layers: bool, + + /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`]. + cached_logical_sizes: tokio::sync::Mutex>, +} + +/// A timeline with some of its files on disk, being initialized. +/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or +/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory +/// to be removed on next restart. +/// +/// The caller is responsible for proper timeline data filling before the final init. +#[must_use] +pub struct UninitializedTimeline<'t> { + owning_tenant: &'t Tenant, + timeline_id: TimelineId, + raw_timeline: Option<(Arc, TimelineUninitMark)>, +} + +/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, +/// or gets removed eventually. +/// +/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +#[must_use] +struct TimelineUninitMark { + uninit_mark_deleted: bool, + uninit_mark_path: PathBuf, + timeline_path: PathBuf, +} + +impl UninitializedTimeline<'_> { + /// Ensures timeline data is valid, loads it into pageserver's memory and removes uninit mark file on success. 
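+    ///
+    /// A rough sketch of the intended call pattern, assuming `uninit` came from
+    /// one of the tenant's timeline-creation helpers (not shown in this excerpt):
+    ///
+    /// ```ignore
+    /// // `uninit` must be consumed; dropping it instead cleans up the
+    /// // half-created timeline directory (see the Drop impl below).
+    /// let timeline = uninit.initialize()?;
+    /// assert!(timeline.is_active());
+    /// ```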
+    pub fn initialize(self) -> anyhow::Result<Arc<Timeline>> {
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        self.initialize_with_lock(&mut timelines, true)
+    }
+
+    fn initialize_with_lock(
+        mut self,
+        timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
+        load_layer_map: bool,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let timeline_id = self.timeline_id;
+        let tenant_id = self.owning_tenant.tenant_id;
+
+        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
+            format!("No timeline for initialization found for {tenant_id}/{timeline_id}")
+        })?;
+
+        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        // TODO it would be good to ensure that, but apparently a lot of our testing is dependent on that at least
+        // ensure!(new_disk_consistent_lsn.is_valid(),
+        //     "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn and cannot be initialized");
+
+        match timelines.entry(timeline_id) {
+            Entry::Occupied(_) => anyhow::bail!(
+                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+            ),
+            Entry::Vacant(v) => {
+                if load_layer_map {
+                    new_timeline
+                        .load_layer_map(new_disk_consistent_lsn)
+                        .with_context(|| {
+                            format!(
+                                "Failed to load layermap for timeline {tenant_id}/{timeline_id}"
+                            )
+                        })?;
+                }
+                uninit_mark.remove_uninit_mark().with_context(|| {
+                    format!(
+                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                    )
+                })?;
+                new_timeline.set_state(TimelineState::Active);
+                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+
+                new_timeline.launch_wal_receiver();
+            }
+        }
+
+        Ok(new_timeline)
+    }
+
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub async fn import_basebackup_from_tar(
+        self,
+        mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
+        base_lsn: Lsn,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let raw_timeline = self.raw_timeline()?;
+
+        // import_basebackup_from_tar() is not async, mainly because the Tar crate
+        // it uses is not async. So we need to jump through some hoops:
+        // - convert the input from the client connection to a synchronous Read
+        // - use block_in_place()
+        let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
+
+        tokio::task::block_in_place(|| {
+            import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
+                .context("Failed to import basebackup")
+        })?;
+
+        // The flush loop needs to be spawned in order for checkpoint to be able to flush.
+        // We want to run a proper checkpoint before we mark the timeline as available to the outside world.
+        // Thus we spawn the flush loop manually and skip the flush_loop setup in initialize_with_lock.
+        raw_timeline.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .checkpoint(CheckpointConfig::Flush)
+            .await
+            .context("Failed to checkpoint after basebackup import")?;
+
+        let timeline = self.initialize()?;
+
+        Ok(timeline)
+    }
+
+    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
+        Ok(&self
+            .raw_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "No raw timeline {}/{} found",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?
+ .0) + } +} + +impl Drop for UninitializedTimeline<'_> { + fn drop(&mut self) { + if let Some((_, uninit_mark)) = self.raw_timeline.take() { + let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered(); + error!("Timeline got dropped without initializing, cleaning its files"); + cleanup_timeline_directory(uninit_mark); + } + } +} + +fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { + let timeline_path = &uninit_mark.timeline_path; + match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + Ok(()) => { + info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + } + Err(e) => { + error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") + } + } + drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists +} + +impl TimelineUninitMark { + /// Useful for initializing timelines, existing on disk after the restart. + pub fn dummy() -> Self { + Self { + uninit_mark_deleted: true, + uninit_mark_path: PathBuf::new(), + timeline_path: PathBuf::new(), + } + } + + fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self { + Self { + uninit_mark_deleted: false, + uninit_mark_path, + timeline_path, + } + } + + fn remove_uninit_mark(mut self) -> anyhow::Result<()> { + if !self.uninit_mark_deleted { + self.delete_mark_file_if_present()?; + } + + Ok(()) + } + + fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { + let uninit_mark_file = &self.uninit_mark_path; + let uninit_mark_parent = uninit_mark_file + .parent() + .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; + ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") + })?; + crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; + self.uninit_mark_deleted = true; + + Ok(()) + } +} + +impl Drop for TimelineUninitMark { + fn drop(&mut self) { + if !self.uninit_mark_deleted { + if self.timeline_path.exists() { + error!( + "Uninit mark {} is not removed, timeline {} stays uninitialized", + self.uninit_mark_path.display(), + self.timeline_path.display() + ) + } else { + // unblock later timeline creation attempts + warn!( + "Removing intermediate uninit mark file {}", + self.uninit_mark_path.display() + ); + if let Err(e) = self.delete_mark_file_if_present() { + error!("Failed to remove the uninit mark file: {e}") + } + } + } + } +} + +/// A repository corresponds to one .neon directory. One repository holds multiple +/// timelines, forked off from the same initial call to 'initdb'. +impl Tenant { + pub fn tenant_id(&self) -> TenantId { + self.tenant_id + } + + /// Get Timeline handle for given Neon timeline ID. + /// This function is idempotent. It doesn't change internal state in any way. + pub fn get_timeline( + &self, + timeline_id: TimelineId, + active_only: bool, + ) -> anyhow::Result> { + let timelines_accessor = self.timelines.lock().unwrap(); + let timeline = timelines_accessor.get(&timeline_id).with_context(|| { + format!("Timeline {}/{} was not found", self.tenant_id, timeline_id) + })?; + + if active_only && !timeline.is_active() { + anyhow::bail!( + "Timeline {}/{} is not active, state: {:?}", + self.tenant_id, + timeline_id, + timeline.current_state() + ) + } else { + Ok(Arc::clone(timeline)) + } + } + + /// Lists timelines the tenant contains. 
+ /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. + pub fn list_timelines(&self) -> Vec> { + self.timelines + .lock() + .unwrap() + .values() + .map(Arc::clone) + .collect() + } + + /// This is used to create the initial 'main' timeline during bootstrapping, + /// or when importing a new base backup. The caller is expected to load an + /// initial image of the datadir to the new timeline after this. + pub fn create_empty_timeline( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result { + anyhow::ensure!( + self.is_active(), + "Cannot create empty timelines on inactive tenant" + ); + + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id, &timelines)?; + drop(timelines); + + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + initdb_lsn, + initdb_lsn, + pg_version, + ); + self.prepare_timeline( + new_timeline_id, + new_metadata, + timeline_uninit_mark, + true, + None, + ) + } + + /// Create a new timeline. + /// + /// Returns the new timeline ID and reference to its Timeline object. + /// + /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with + /// the same timeline ID already exists, returns None. If `new_timeline_id` is not given, + /// a new unique ID is generated. + pub async fn create_timeline( + &self, + new_timeline_id: Option, + ancestor_timeline_id: Option, + mut ancestor_start_lsn: Option, + pg_version: u32, + ) -> anyhow::Result>> { + anyhow::ensure!( + self.is_active(), + "Cannot create timelines on inactive tenant" + ); + + let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate); + + if self.get_timeline(new_timeline_id, false).is_ok() { + debug!("timeline {new_timeline_id} already exists"); + return Ok(None); + } + + let loaded_timeline = match ancestor_timeline_id { + Some(ancestor_timeline_id) => { + let ancestor_timeline = self + .get_timeline(ancestor_timeline_id, false) + .context("Cannot branch off the timeline that's not present in pageserver")?; + + if let Some(lsn) = ancestor_start_lsn.as_mut() { + // Wait for the WAL to arrive and be processed on the parent branch up + // to the requested branch point. The repository code itself doesn't + // require it, but if we start to receive WAL on the new timeline, + // decoding the new WAL might need to look up previous pages, relation + // sizes etc. and that would get confused if the previous page versions + // are not in the repository yet. + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn).await?; + + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + bail!( + "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", + lsn, + ancestor_timeline_id, + ancestor_ancestor_lsn, + ); + } + } + + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + } + None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, + }; + + // Have added new timeline into the tenant, now its background tasks are needed. + self.activate(true); + + Ok(Some(loaded_timeline)) + } + + /// perform one garbage collection iteration, removing old data files from disk. + /// this function is periodically called by gc task. + /// also it can be explicitly requested through page server api 'do_gc' command. 
+ /// + /// 'target_timeline_id' specifies the timeline to GC, or None for all. + /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). + /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC + /// to make tests more deterministic. + /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + pub async fn gc_iteration( + &self, + target_timeline_id: Option, + horizon: u64, + pitr: Duration, + checkpoint_before_gc: bool, + ) -> anyhow::Result { + anyhow::ensure!( + self.is_active(), + "Cannot run GC iteration on inactive tenant" + ); + + let timeline_str = target_timeline_id + .map(|x| x.to_string()) + .unwrap_or_else(|| "-".to_string()); + + { + let _timer = STORAGE_TIME + .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) + .start_timer(); + self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) + .await + } + } + + /// Perform one compaction iteration. + /// This function is periodically called by compactor task. + /// Also it can be explicitly requested per timeline through page server + /// api's 'compact' command. + pub fn compaction_iteration(&self) -> anyhow::Result<()> { + anyhow::ensure!( + self.is_active(), + "Cannot run compaction iteration on inactive tenant" + ); + + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. + let timelines = self.timelines.lock().unwrap(); + let timelines_to_compact = timelines + .iter() + .map(|(timeline_id, timeline)| (*timeline_id, timeline.clone())) + .collect::>(); + drop(timelines); + + for (timeline_id, timeline) in &timelines_to_compact { + let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered(); + timeline.compact()?; + } + + Ok(()) + } + + /// Flush all in-memory data to disk. + /// + /// Used at graceful shutdown. + /// + pub async fn checkpoint(&self) -> anyhow::Result<()> { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // checkpoints. We don't want to block everything else while the + // checkpoint runs. 
+ let timelines_to_checkpoint = { + let timelines = self.timelines.lock().unwrap(); + timelines + .iter() + .map(|(id, timeline)| (*id, Arc::clone(timeline))) + .collect::>() + }; + + for (id, timeline) in &timelines_to_checkpoint { + timeline + .checkpoint(CheckpointConfig::Flush) + .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id)) + .await?; + } + + Ok(()) + } + + /// Removes timeline-related in-memory data + pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + // in order to be retriable detach needs to be idempotent + // (or at least to a point that each time the detach is called it can make progress) + let mut timelines = self.timelines.lock().unwrap(); + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + + anyhow::ensure!( + !children_exist, + "Cannot delete timeline which has child timelines" + ); + let timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(e) => e, + Entry::Vacant(_) => bail!("timeline not found"), + }; + + let timeline = timeline_entry.get(); + timeline.set_state(TimelineState::Paused); + + let layer_removal_guard = timeline.layer_removal_guard()?; + + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + info!("detach removed files"); + + drop(layer_removal_guard); + timeline_entry.remove(); + + Ok(()) + } + + /// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn. + pub fn get_remote_index(&self) -> &RemoteIndex { + &self.remote_index + } + + pub fn current_state(&self) -> TenantState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + matches!(self.current_state(), TenantState::Active { .. }) + } + + pub fn should_run_tasks(&self) -> bool { + matches!( + self.current_state(), + TenantState::Active { + background_jobs_running: true + } + ) + } + + /// Changes tenant status to active, if it was not broken before. + /// Otherwise, ignores the state change, logging an error. + pub fn activate(&self, enable_background_jobs: bool) { + self.set_state(TenantState::Active { + background_jobs_running: enable_background_jobs, + }); + } + + pub fn set_state(&self, new_state: TenantState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TenantState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (_, new_state) => { + self.state.send_replace(new_state); + + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| timeline.current_state() != TimelineState::Broken); + match new_state { + TenantState::Active { + background_jobs_running, + } => { + if background_jobs_running { + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. 
+ crate::tenant_tasks::start_background_loops(self.tenant_id); + } + + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Active); + } + } + TenantState::Paused | TenantState::Broken => { + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Suspended); + } + } + } + } + } + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } +} + +/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), +/// perform a topological sort, so that the parent of each timeline comes +/// before the children. +fn tree_sort_timelines( + timelines: HashMap, +) -> anyhow::Result> { + let mut result = Vec::with_capacity(timelines.len()); + + let mut now = Vec::with_capacity(timelines.len()); + // (ancestor, children) + let mut later: HashMap> = + HashMap::with_capacity(timelines.len()); + + for (timeline_id, metadata) in timelines { + if let Some(ancestor_id) = metadata.ancestor_timeline() { + let children = later.entry(ancestor_id).or_default(); + children.push((timeline_id, metadata)); + } else { + now.push((timeline_id, metadata)); + } + } + + while let Some((timeline_id, metadata)) = now.pop() { + result.push((timeline_id, metadata)); + // All children of this can be loaded now + if let Some(mut children) = later.remove(&timeline_id) { + now.append(&mut children); + } + } + + // All timelines should be visited now. Unless there were timelines with missing ancestors. + if !later.is_empty() { + for (missing_id, orphan_ids) in later { + for (orphan_id, _) in orphan_ids { + error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded"); + } + } + bail!("could not load tenant because some timelines are missing ancestors"); + } + + Ok(result) +} + +/// Private functions +impl Tenant { + pub fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + pub fn get_checkpoint_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_timeout + .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) + } + + pub fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + pub fn get_compaction_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_period + .unwrap_or(self.conf.default_tenant_conf.compaction_period) + } + + pub fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + + pub fn get_gc_horizon(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_horizon + .unwrap_or(self.conf.default_tenant_conf.gc_horizon) + } + + pub fn get_gc_period(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_period + .unwrap_or(self.conf.default_tenant_conf.gc_period) + } + + pub fn get_image_creation_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + + pub fn get_pitr_interval(&self) -> Duration { + 
let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .pitr_interval + .unwrap_or(self.conf.default_tenant_conf.pitr_interval) + } + + pub fn get_trace_read_requests(&self) -> bool { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .trace_read_requests + .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) + } + + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) { + self.tenant_conf.write().unwrap().update(&new_tenant_conf); + } + + fn create_timeline_data( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + ancestor: Option>, + ) -> anyhow::Result { + if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + anyhow::ensure!( + ancestor.is_some(), + "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" + ) + } + + let pg_version = new_metadata.pg_version(); + Ok(Timeline::new( + self.conf, + Arc::clone(&self.tenant_conf), + new_metadata, + ancestor, + new_timeline_id, + self.tenant_id, + Arc::clone(&self.walredo_mgr), + self.upload_layers, + pg_version, + )) + } + + pub(super) fn new( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + walredo_mgr: Arc, + tenant_id: TenantId, + remote_index: RemoteIndex, + upload_layers: bool, + ) -> Tenant { + let (state, _) = watch::channel(TenantState::Paused); + Tenant { + tenant_id, + conf, + tenant_conf: Arc::new(RwLock::new(tenant_conf)), + timelines: Mutex::new(HashMap::new()), + gc_cs: Mutex::new(()), + walredo_mgr, + remote_index, + upload_layers, + state, + cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), + } + } + + /// Locate and load config + pub(super) fn load_tenant_config( + conf: &'static PageServerConf, + tenant_id: TenantId, + ) -> anyhow::Result { + let target_config_path = conf.tenant_config_path(tenant_id); + let target_config_display = target_config_path.display(); + + info!("loading tenantconf from {target_config_display}"); + + // FIXME If the config file is not found, assume that we're attaching + // a detached tenant and config is passed via attach command. + // https://github.com/neondatabase/neon/issues/1555 + if !target_config_path.exists() { + info!("tenant config not found in {target_config_display}"); + return Ok(TenantConfOpt::default()); + } + + // load and parse file + let config = fs::read_to_string(&target_config_path).with_context(|| { + format!("Failed to load config from path '{target_config_display}'") + })?; + + let toml = config.parse::().with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as toml file") + })?; + + let mut tenant_conf = TenantConfOpt::default(); + for (key, item) in toml.iter() { + match key { + "tenant_config" => { + tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| { + format!("Failed to parse config from file '{target_config_display}' as pageserver config") + })?; + } + _ => bail!("config file {target_config_display} has unrecognized pageserver option '{key}'"), + + } + } + + Ok(tenant_conf) + } + + pub(super) fn persist_tenant_config( + target_config_path: &Path, + tenant_conf: TenantConfOpt, + first_save: bool, + ) -> anyhow::Result<()> { + let _enter = info_span!("saving tenantconf").entered(); + info!("persisting tenantconf to {}", target_config_path.display()); + + // TODO this will prepend comments endlessly + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. 
+ +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("Failed to write toml bytes into file") + .and_then(|_| { + target_config_file + .sync_all() + .context("Faile to fsync config file") + }) + .with_context(|| { + format!( + "Failed to write config file into path '{}'", + target_config_path.display() + ) + })?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + target_config_path + .parent() + .context("Config file does not have a parent") + .and_then(|target_config_parent| { + File::open(target_config_parent).context("Failed to open config parent") + }) + .and_then(|tenant_dir| { + tenant_dir + .sync_all() + .context("Failed to fsync config parent") + }) + .with_context(|| { + format!( + "Failed to fsync on first save for config {}", + target_config_path.display() + ) + })?; + } + + Ok(()) + } + + // + // How garbage collection works: + // + // +--bar-------------> + // / + // +----+-----foo----------------> + // / + // ----main--+--------------------------> + // \ + // +-----baz--------> + // + // + // 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's + // `gc_infos` are being refreshed + // 2. Scan collected timelines, and on each timeline, make note of the + // all the points where other timelines have been branched off. + // We will refrain from removing page versions at those LSNs. + // 3. For each timeline, scan all layer files on the timeline. + // Remove all files for which a newer file exists and which + // don't cover any branch point LSNs. + // + // TODO: + // - if a relation has a non-incremental persistent layer on a child branch, then we + // don't need to keep that in the parent anymore. But currently + // we do. + async fn gc_iteration_internal( + &self, + target_timeline_id: Option, + horizon: u64, + pitr: Duration, + checkpoint_before_gc: bool, + ) -> anyhow::Result { + let mut totals: GcResult = Default::default(); + let now = Instant::now(); + + let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + + // Perform GC for each timeline. + // + // Note that we don't hold the GC lock here because we don't want + // to delay the branch creation task, which requires the GC lock. + // A timeline GC iteration can be slow because it may need to wait for + // compaction (both require `layer_removal_cs` lock), + // but the GC iteration can run concurrently with branch creation. + // + // See comments in [`Tenant::branch_timeline`] for more information + // about why branch creation task can run concurrently with timeline's GC iteration. + for timeline in gc_timelines { + if task_mgr::is_shutdown_requested() { + // We were requested to shut down. Stop and return with the progress we + // made. + break; + } + + // If requested, force flush all in-memory layers to disk first, + // so that they too can be garbage collected. That's + // used in tests, so we want as deterministic results as possible. 
+ if checkpoint_before_gc { + timeline.checkpoint(CheckpointConfig::Forced).await?; + info!( + "timeline {} checkpoint_before_gc done", + timeline.timeline_id + ); + } + + let result = timeline.gc()?; + totals += result; + } + + totals.elapsed = now.elapsed(); + Ok(totals) + } + + /// Refreshes the Timeline::gc_info for all timelines, returning the + /// vector of timelines which have [`Timeline::get_last_record_lsn`] past + /// [`Tenant::get_gc_horizon`]. + /// + /// This is usually executed as part of periodic gc, but can now be triggered more often. + pub fn refresh_gc_info(&self) -> anyhow::Result>> { + // since this method can now be called at different rates than the configured gc loop, it + // might be that these configuration values get applied faster than what it was previously, + // since these were only read from the gc task. + let horizon = self.get_gc_horizon(); + let pitr = self.get_pitr_interval(); + + // refresh all timelines + let target_timeline_id = None; + + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + } + + fn refresh_gc_info_internal( + &self, + target_timeline_id: Option, + horizon: u64, + pitr: Duration, + ) -> anyhow::Result>> { + // grab mutex to prevent new timelines from being created here. + let gc_cs = self.gc_cs.lock().unwrap(); + + let timelines = self.timelines.lock().unwrap(); + + // Scan all timelines. For each timeline, remember the timeline ID and + // the branch point where it was created. + let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; + + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + )); + } + } + // Collect branchpoints for all timelines + else { + all_branchpoints + .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); + } + } + + *timeline_id + }) + .collect::>() + }; + drop(timelines); + + // Ok, we now know all the branch points. + // Update the GC information for each timeline. + let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); + for timeline_id in timeline_ids { + // Timeline is known to be local and loaded. 
+ let timeline = self + .get_timeline(timeline_id, false) + .with_context(|| format!("Timeline {timeline_id} was not found"))?; + + // If target_timeline is specified, ignore all other timelines + if let Some(target_timeline_id) = target_timeline_id { + if timeline_id != target_timeline_id { + continue; + } + } + + if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); + timeline.update_gc_info(branchpoints, cutoff, pitr)?; + + gc_timelines.push(timeline); + } + } + drop(gc_cs); + Ok(gc_timelines) + } + + /// Branch an existing timeline + fn branch_timeline( + &self, + src: TimelineId, + dst: TimelineId, + start_lsn: Option, + ) -> anyhow::Result> { + // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn + // about timelines, so otherwise a race condition is possible, where we create new timeline and GC + // concurrently removes data that is needed by the new timeline. + let _gc_cs = self.gc_cs.lock().unwrap(); + let timelines = self.timelines.lock().unwrap(); + let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; + drop(timelines); + + // In order for the branch creation task to not wait for GC/compaction, + // we need to make sure that the starting LSN of the child branch is not out of scope midway by + // + // 1. holding the GC lock to prevent overwritting timeline's GC data + // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline + // + // Step 2 is to avoid initializing the new branch using data removed by past GC iterations + // or in-queue GC iterations. + + let src_timeline = self.get_timeline(src, false).with_context(|| { + format!( + "No ancestor {} found for timeline {}/{}", + src, self.tenant_id, dst + ) + })?; + + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + + // Check if the starting LSN is out of scope because it is less than + // 1. the latest GC cutoff LSN or + // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + src_timeline + .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) + .context(format!( + "invalid branch start lsn: less than latest GC cutoff {}", + *latest_gc_cutoff_lsn, + ))?; + { + let gc_info = src_timeline.gc_info.read().unwrap(); + let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + if start_lsn < cutoff { + bail!(format!( + "invalid branch start lsn: less than planned GC cutoff {cutoff}" + )); + } + } + + // Determine prev-LSN for the new timeline. We can only determine it if + // the timeline was branched at the current end of the source timeline. + let RecordLsn { + last: src_last, + prev: src_prev, + } = src_timeline.get_last_record_rlsn(); + let dst_prev = if src_last == start_lsn { + Some(src_prev) + } else { + None + }; + + // Create the metadata file, noting the ancestor of the new timeline. + // There is initially no data in it, but all the read-calls know to look + // into the ancestor. 
+ let metadata = TimelineMetadata::new( + start_lsn, + dst_prev, + Some(src), + start_lsn, + *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? + src_timeline.initdb_lsn, + src_timeline.pg_version, + ); + let mut timelines = self.timelines.lock().unwrap(); + let new_timeline = self + .prepare_timeline( + dst, + metadata, + timeline_uninit_mark, + false, + Some(src_timeline), + )? + .initialize_with_lock(&mut timelines, true)?; + drop(timelines); + info!("branched timeline {dst} from {src} at {start_lsn}"); + + Ok(new_timeline) + } + + /// - run initdb to init temporary instance and get bootstrap data + /// - after initialization complete, remove the temp dir. + async fn bootstrap_timeline( + &self, + timeline_id: TimelineId, + pg_version: u32, + ) -> anyhow::Result> { + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(timeline_id, &timelines)? + }; + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` + // temporary directory for basebackup files for the given timeline. + let initdb_path = path_with_suffix_extension( + self.conf + .timelines_path(&self.tenant_id) + .join(format!("basebackup-{timeline_id}")), + TEMP_FILE_SUFFIX, + ); + + // an uninit mark was placed before, nothing else can access this timeline files + // current initdb was not run yet, so remove whatever was left from the previous runs + if initdb_path.exists() { + fs::remove_dir_all(&initdb_path).with_context(|| { + format!( + "Failed to remove already existing initdb directory: {}", + initdb_path.display() + ) + })?; + } + // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path + run_initdb(self.conf, &initdb_path, pg_version)?; + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it + scopeguard::defer! { + if let Err(e) = fs::remove_dir_all(&initdb_path) { + // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call + error!("Failed to remove temporary initdb directory '{}': {}", initdb_path.display(), e); + } + } + let pgdata_path = &initdb_path; + let pgdata_lsn = import_datadir::get_lsn_from_controlfile(pgdata_path)?.align(); + + // Import the contents of the data directory at the initial checkpoint + // LSN, and any WAL after that. + // Initdb lsn will be equal to last_record_lsn which will be set after import. + // Because we know it upfront avoid having an option or dummy zero value by passing it to the metadata. + let new_metadata = TimelineMetadata::new( + Lsn(0), + None, + None, + Lsn(0), + pgdata_lsn, + pgdata_lsn, + pg_version, + ); + let raw_timeline = + self.prepare_timeline(timeline_id, new_metadata, timeline_uninit_mark, true, None)?; + + let tenant_id = raw_timeline.owning_tenant.tenant_id; + let unfinished_timeline = raw_timeline.raw_timeline()?; + + tokio::task::block_in_place(|| { + import_datadir::import_timeline_from_postgres_datadir( + unfinished_timeline, + pgdata_path, + pgdata_lsn, + ) + }) + .with_context(|| { + format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}") + })?; + + // Flush loop needs to be spawned in order for checkpoint to be able to flush. 
+ // We want to run proper checkpoint before we mark timeline as available to outside world + // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock + unfinished_timeline.maybe_spawn_flush_loop(); + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + anyhow::bail!("failpoint before-checkpoint-new-timeline"); + }); + + unfinished_timeline + .checkpoint(CheckpointConfig::Forced).await + .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; + + let timeline = { + let mut timelines = self.timelines.lock().unwrap(); + raw_timeline.initialize_with_lock(&mut timelines, false)? + }; + + info!( + "created root timeline {} timeline.lsn {}", + timeline_id, + timeline.get_last_record_lsn() + ); + + Ok(timeline) + } + + /// Creates intermediate timeline structure and its files, without loading it into memory. + /// It's up to the caller to import the necesary data and import the timeline into memory. + fn prepare_timeline( + &self, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + uninit_mark: TimelineUninitMark, + init_layers: bool, + ancestor: Option>, + ) -> anyhow::Result { + let tenant_id = self.tenant_id; + + match self.create_timeline_files( + &uninit_mark.timeline_path, + new_timeline_id, + new_metadata, + ancestor, + ) { + Ok(new_timeline) => { + if init_layers { + new_timeline.layers.write().unwrap().next_open_layer_at = + Some(new_timeline.initdb_lsn); + } + debug!( + "Successfully created initial files for timeline {tenant_id}/{new_timeline_id}" + ); + Ok(UninitializedTimeline { + owning_tenant: self, + timeline_id: new_timeline_id, + raw_timeline: Some((Arc::new(new_timeline), uninit_mark)), + }) + } + Err(e) => { + error!("Failed to create initial files for timeline {tenant_id}/{new_timeline_id}, cleaning up: {e:?}"); + cleanup_timeline_directory(uninit_mark); + Err(e) + } + } + } + + fn create_timeline_files( + &self, + timeline_path: &Path, + new_timeline_id: TimelineId, + new_metadata: TimelineMetadata, + ancestor: Option>, + ) -> anyhow::Result { + let timeline_data = self + .create_timeline_data(new_timeline_id, new_metadata.clone(), ancestor) + .context("Failed to create timeline data structure")?; + crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?; + + fail::fail_point!("after-timeline-uninit-mark-creation", |_| { + anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + }); + + save_metadata( + self.conf, + new_timeline_id, + self.tenant_id, + &new_metadata, + true, + ) + .context("Failed to create timeline metadata")?; + + Ok(timeline_data) + } + + /// Attempts to create an uninit mark file for the timeline initialization. + /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. + /// + /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. 
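+    ///
+    /// The mark file is created next to the timeline directory rather than inside it
+    /// (see [`TimelineUninitMark`]), so that cleanup can remove the directory first and
+    /// drop the mark last.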
+    fn create_timeline_uninit_mark(
+        &self,
+        timeline_id: TimelineId,
+        timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+    ) -> anyhow::Result<TimelineUninitMark> {
+        let tenant_id = self.tenant_id;
+
+        anyhow::ensure!(
+            timelines.get(&timeline_id).is_none(),
+            "Timeline {tenant_id}/{timeline_id} already exists in pageserver's memory"
+        );
+        let timeline_path = self.conf.timeline_path(&timeline_id, &tenant_id);
+        anyhow::ensure!(
+            !timeline_path.exists(),
+            "Timeline {} already exists, cannot create its uninit mark file",
+            timeline_path.display()
+        );
+
+        let uninit_mark_path = self
+            .conf
+            .timeline_uninit_mark_file_path(tenant_id, timeline_id);
+        fs::File::create(&uninit_mark_path)
+            .context("Failed to create uninit mark file")
+            .and_then(|_| {
+                crashsafe::fsync_file_and_parent(&uninit_mark_path)
+                    .context("Failed to fsync uninit mark file")
+            })
+            .with_context(|| {
+                format!("Failed to create uninit mark for timeline {tenant_id}/{timeline_id}")
+            })?;
+
+        let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
+
+        Ok(uninit_mark)
+    }
+
+    pub(super) fn init_attach_timelines(
+        &self,
+        timelines: HashMap<TimelineId, TimelineMetadata>,
+    ) -> anyhow::Result<()> {
+        let sorted_timelines = if timelines.len() == 1 {
+            timelines.into_iter().collect()
+        } else if !timelines.is_empty() {
+            tree_sort_timelines(timelines)?
+        } else {
+            warn!("No timelines to attach received");
+            return Ok(());
+        };
+
+        let tenant_id = self.tenant_id;
+        let mut timelines_accessor = self.timelines.lock().unwrap();
+        for (timeline_id, metadata) in sorted_timelines {
+            info!(
+                "Attaching timeline {}/{} pg_version {}",
+                tenant_id,
+                timeline_id,
+                metadata.pg_version()
+            );
+
+            if timelines_accessor.contains_key(&timeline_id) {
+                warn!("Timeline {tenant_id}/{timeline_id} already exists in the tenant map, skipping its initialization");
+                continue;
+            }
+
+            let ancestor = metadata
+                .ancestor_timeline()
+                .and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id))
+                .cloned();
+            let dummy_timeline = self
+                .create_timeline_data(timeline_id, metadata.clone(), ancestor.clone())
+                .with_context(|| {
+                    format!("Failed to create dummy timeline data for {tenant_id}/{timeline_id}")
+                })?;
+            let timeline = UninitializedTimeline {
+                owning_tenant: self,
+                timeline_id,
+                raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
+            };
+            match timeline.initialize_with_lock(&mut timelines_accessor, true) {
+                Ok(initialized_timeline) => {
+                    timelines_accessor.insert(timeline_id, initialized_timeline);
+                }
+                Err(e) => {
+                    error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}");
+                    let broken_timeline = self
+                        .create_timeline_data(timeline_id, metadata, ancestor)
+                        .with_context(|| {
+                            format!("Failed to create broken timeline data for {tenant_id}/{timeline_id}")
+                        })?;
+                    broken_timeline.set_state(TimelineState::Broken);
+                    timelines_accessor.insert(timeline_id, Arc::new(broken_timeline));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Gathers inputs from all of the timelines to produce a sizing model input.
+    ///
+    /// Future is cancellation safe. Only one calculation can be running at once per tenant.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
+    pub async fn gather_size_inputs(&self) -> anyhow::Result {
+        let logical_sizes_at_once = self
+            .conf
+            .concurrent_tenant_size_logical_size_queries
+            .inner();
+
+        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
+        // are for testing/experimenting, we tolerate this.
+ // + // See more for on the issue #2748 condenced out of the initial PR review. + let mut shared_cache = self.cached_logical_sizes.lock().await; + + size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await + } +} + +/// Create the cluster temporarily in 'initdbpath' directory inside the repository +/// to get bootstrap data for timeline initialization. +fn run_initdb( + conf: &'static PageServerConf, + initdb_target_dir: &Path, + pg_version: u32, +) -> anyhow::Result<()> { + let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb"); + let initdb_lib_dir = conf.pg_lib_dir(pg_version)?; + info!( + "running {} in {}, libdir: {}", + initdb_bin_path.display(), + initdb_target_dir.display(), + initdb_lib_dir.display(), + ); + + let initdb_output = Command::new(&initdb_bin_path) + .args(&["-D", &initdb_target_dir.to_string_lossy()]) + .args(&["-U", &conf.superuser]) + .args(&["-E", "utf8"]) + .arg("--no-instructions") + // This is only used for a temporary installation that is deleted shortly after, + // so no need to fsync it + .arg("--no-sync") + .env_clear() + .env("LD_LIBRARY_PATH", &initdb_lib_dir) + .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) + .stdout(Stdio::null()) + .output() + .with_context(|| { + format!( + "failed to execute {} at target dir {}", + initdb_bin_path.display(), + initdb_target_dir.display() + ) + })?; + if !initdb_output.status.success() { + bail!( + "initdb failed: '{}'", + String::from_utf8_lossy(&initdb_output.stderr) + ); + } + + Ok(()) +} + +impl Drop for Tenant { + fn drop(&mut self) { + remove_tenant_metrics(&self.tenant_id); + } +} +/// Dump contents of a layer file to stdout. +pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<()> { + use std::os::unix::fs::FileExt; + + // All layer files start with a two-byte "magic" value, to identify the kind of + // file. + let file = File::open(path)?; + let mut header_buf = [0u8; 2]; + file.read_exact_at(&mut header_buf, 0)?; + + match u16::from_be_bytes(header_buf) { + crate::IMAGE_FILE_MAGIC => { + image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? + } + crate::DELTA_FILE_MAGIC => { + delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? 
+ } + magic => bail!("unrecognized magic identifier: {:?}", magic), + } + + Ok(()) +} + +fn ignore_absent_files(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(|e| { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } + }) +} + +#[cfg(test)] +pub mod harness { + use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::{fs, path::PathBuf}; + use utils::lsn::Lsn; + + use crate::storage_sync::index::RemoteIndex; + use crate::{ + config::PageServerConf, + repository::Key, + tenant::Tenant, + walrecord::NeonWalRecord, + walredo::{WalRedoError, WalRedoManager}, + }; + + use super::*; + use crate::tenant_config::{TenantConf, TenantConfOpt}; + use hex_literal::hex; + use utils::id::{TenantId, TimelineId}; + + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + pub const NEW_TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("AA223344556677881122334455667788")); + + /// Convenience function to create a page image with given string as the only content + #[allow(non_snake_case)] + pub fn TEST_IMG(s: &str) -> Bytes { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + buf.resize(64, 0); + + buf.freeze() + } + + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + + impl From for TenantConfOpt { + fn from(tenant_conf: TenantConf) -> Self { + Self { + checkpoint_distance: Some(tenant_conf.checkpoint_distance), + checkpoint_timeout: Some(tenant_conf.checkpoint_timeout), + compaction_target_size: Some(tenant_conf.compaction_target_size), + compaction_period: Some(tenant_conf.compaction_period), + compaction_threshold: Some(tenant_conf.compaction_threshold), + gc_horizon: Some(tenant_conf.gc_horizon), + gc_period: Some(tenant_conf.gc_period), + image_creation_threshold: Some(tenant_conf.image_creation_threshold), + pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), + trace_read_requests: Some(tenant_conf.trace_read_requests), + } + } + } + + pub struct TenantHarness<'a> { + pub conf: &'static PageServerConf, + pub tenant_conf: TenantConf, + pub tenant_id: TenantId, + + pub lock_guard: ( + Option>, + Option>, + ), + } + + impl<'a> TenantHarness<'a> { + pub fn create(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; + + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. 
+ let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + + let tenant_conf = TenantConf::dummy_conf(); + + let tenant_id = TenantId::generate(); + fs::create_dir_all(conf.tenant_path(&tenant_id))?; + fs::create_dir_all(conf.timelines_path(&tenant_id))?; + + Ok(Self { + conf, + tenant_conf, + tenant_id, + lock_guard, + }) + } + + pub fn load(&self) -> Tenant { + self.try_load().expect("failed to load test tenant") + } + + pub fn try_load(&self) -> anyhow::Result { + let walredo_mgr = Arc::new(TestRedoManager); + + let tenant = Tenant::new( + self.conf, + TenantConfOpt::from(self.tenant_conf), + walredo_mgr, + self.tenant_id, + RemoteIndex::default(), + false, + ); + // populate tenant with locally available timelines + let mut timelines_to_load = HashMap::new(); + for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id)) + .expect("should be able to read timelines dir") + { + let timeline_dir_entry = timeline_dir_entry?; + let timeline_id: TimelineId = timeline_dir_entry + .path() + .file_name() + .unwrap() + .to_string_lossy() + .parse()?; + + let timeline_metadata = load_metadata(self.conf, timeline_id, self.tenant_id)?; + timelines_to_load.insert(timeline_id, timeline_metadata); + } + tenant.init_attach_timelines(timelines_to_load)?; + tenant.set_state(TenantState::Active { + background_jobs_running: false, + }); + + Ok(tenant) + } + + pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf { + self.conf.timeline_path(timeline_id, &self.tenant_id) + } + } + + fn load_metadata( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + ) -> anyhow::Result { + let metadata_path = conf.metadata_path(timeline_id, tenant_id); + let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { + format!( + "Failed to read metadata bytes from path {}", + metadata_path.display() + ) + })?; + TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { + format!( + "Failed to parse metadata bytes from path {}", + metadata_path.display() + ) + }) + } + + // Mock WAL redo manager that doesn't do much + pub struct TestRedoManager; + + impl WalRedoManager for TestRedoManager { + fn request_redo( + &self, + key: Key, + lsn: Lsn, + base_img: Option, + records: Vec<(Lsn, NeonWalRecord)>, + _pg_version: u32, + ) -> Result { + let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{s}"); + + Ok(TEST_IMG(&s)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::METADATA_FILE_NAME; + use crate::keyspace::KeySpaceAccum; + use crate::repository::{Key, Value}; + use crate::tenant::harness::*; + use crate::DEFAULT_PG_VERSION; + use bytes::BytesMut; + use hex_literal::hex; + use once_cell::sync::Lazy; + use rand::{thread_rng, Rng}; + + static TEST_KEY: Lazy = + Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001"))); + + #[test] + fn test_basic() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_basic")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); + drop(writer); + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); + + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + + Ok(()) + } + + #[test] + fn no_duplicate_timelines() -> anyhow::Result<()> { + let tenant = TenantHarness::create("no_duplicate_timelines")?.load(); + let _ = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + Ok(_) => panic!("duplicate timeline creation should fail"), + Err(e) => assert_eq!( + e.to_string(), + format!( + "Timeline {}/{} already exists in pageserver's memory", + tenant.tenant_id, TIMELINE_ID + ) + ), + } + + Ok(()) + } + + /// Convenience function to create a page image with given string as the only content + pub fn test_value(s: &str) -> Value { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + Value::Image(buf.freeze()) + } + + /// + /// Test branch creation + /// + #[test] + fn test_branch() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_branch")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + let writer = tline.writer(); + use std::str::from_utf8; + + #[allow(non_snake_case)] + let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + #[allow(non_snake_case)] + let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); + + // Insert a value on the timeline + writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; + writer.finish_write(Lsn(0x20)); + + writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; + writer.finish_write(Lsn(0x30)); + writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; + writer.finish_write(Lsn(0x40)); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + // Branch the history, modify relation differently on the new timeline + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + let new_writer = newtline.writer(); + new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; + new_writer.finish_write(Lsn(0x40)); + + // Check page contents on both branches + assert_eq!( + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + "foo at 0x40" + ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + "bar at 0x40" + ); + assert_eq!( + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + "foobar at 0x20" + ); + + //assert_current_logical_size(&tline, Lsn(0x40)); + + Ok(()) + } + + async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> { + let mut lsn = start_lsn; + #[allow(non_snake_case)] + { + let writer = tline.writer(); + // Create a relation on the timeline + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + 
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + } + tline.checkpoint(CheckpointConfig::Forced).await?; + { + let writer = tline.writer(); + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + lsn += 0x10; + writer.put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; + writer.finish_write(lsn); + } + tline.checkpoint(CheckpointConfig::Forced).await + } + + #[tokio::test] + async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // and compaction works. But it does set the 'cutoff' point so that the cross check + // below should fail. + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; + + // try to branch at lsn 25, should fail because we already garbage collected the data + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(err.to_string().contains("invalid branch start lsn")); + assert!(err + .source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data")) + } + } + + Ok(()) + } + + #[test] + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? + .initialize()?; + // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 + match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(&err.to_string().contains("invalid branch start lsn")); + assert!(&err + .source() + .unwrap() + .to_string() + .contains("is earlier than latest GC horizon")); + } + } + + Ok(()) + } + + /* + // FIXME: This currently fails to error out. Calling GC doesn't currently + // remove the old value, we'd need to work a little harder + #[tokio::test] + async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> { + let repo = + RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? 
+ .load(); + + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); + assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + match tline.get(*TEST_KEY, Lsn(0x25)) { + Ok(_) => panic!("request for page should have failed"), + Err(err) => assert!(err.to_string().contains("not found at")), + } + Ok(()) + } + */ + + #[tokio::test] + async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { + let tenant = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; + assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + + Ok(()) + } + #[tokio::test] + async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { + let tenant = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + + // run gc on parent + tenant + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .await?; + + // Check that the data is still accessible on the branch. + assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); + + Ok(()) + } + + #[tokio::test] + async fn timeline_load() -> anyhow::Result<()> { + const TEST_NAME: &str = "timeline_load"; + let harness = TenantHarness::create(TEST_NAME)?; + { + let tenant = harness.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? + .initialize()?; + make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; + } + + let tenant = harness.load(); + tenant + .get_timeline(TIMELINE_ID, true) + .expect("cannot load timeline"); + + Ok(()) + } + + #[tokio::test] + async fn timeline_load_with_ancestor() -> anyhow::Result<()> { + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = TenantHarness::create(TEST_NAME)?; + // create two timelines + { + let tenant = harness.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; + + make_some_layers(tline.as_ref(), Lsn(0x20)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; + + tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + + let newtline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("Should have a local timeline"); + + make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; + tline.checkpoint(CheckpointConfig::Forced).await?; + } + + // check that both of them are initially unloaded + let tenant = harness.load(); + + // check that both, child and ancestor are loaded + let _child_tline = tenant + .get_timeline(NEW_TIMELINE_ID, true) + .expect("cannot get child timeline loaded"); + + let _ancestor_tline = tenant + .get_timeline(TIMELINE_ID, true) + .expect("cannot get ancestor timeline loaded"); + + Ok(()) + } + + #[test] + fn corrupt_metadata() -> anyhow::Result<()> { + const TEST_NAME: &str = "corrupt_metadata"; + let harness = TenantHarness::create(TEST_NAME)?; + let tenant = harness.load(); + + tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + drop(tenant); + + let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); + + assert!(metadata_path.is_file()); + + let mut metadata_bytes = std::fs::read(&metadata_path)?; + assert_eq!(metadata_bytes.len(), 512); + metadata_bytes[8] ^= 1; + std::fs::write(metadata_path, metadata_bytes)?; + + let err = harness.try_load().err().expect("should fail"); + assert!(err + .to_string() + .starts_with("Failed to parse metadata bytes from path")); + + let mut found_error_message = false; + let mut err_source = err.source(); + while let Some(source) = err_source { + if source.to_string() == "metadata checksum mismatch" { + found_error_message = true; + break; + } + err_source = source.source(); + } + assert!( + found_error_message, + "didn't find the corrupted metadata error" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_images() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_images")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
+ .initialize()?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer.finish_write(Lsn(0x10)); + drop(writer); + + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer.finish_write(Lsn(0x20)); + drop(writer); + + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; + writer.finish_write(Lsn(0x30)); + drop(writer); + + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + + let writer = tline.writer(); + writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; + writer.finish_write(Lsn(0x40)); + drop(writer); + + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + + Ok(()) + } + + // + // Insert 1000 key-value pairs with increasing keys, checkpoint, + // repeat 50 times. + // + #[tokio::test] + async fn test_bulk_insert() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_bulk_insert")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + + let mut lsn = Lsn(0x10); + + let mut keyspace = KeySpaceAccum::new(); + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + let mut blknum = 0; + for _ in 0..50 { + for _ in 0..10000 { + test_key.field6 = blknum; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + drop(writer); + + keyspace.add_key(test_key); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + + let cutoff = tline.get_last_record_lsn(); + + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + tline.gc()?; + } + + Ok(()) + } + + #[tokio::test] + async fn test_random_updates() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_random_updates")?.load(); + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + + const NUM_KEYS: usize = 1000; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + + let mut keyspace = KeySpaceAccum::new(); + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + + keyspace.add_key(test_key); + } + + for _ in 0..50 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); + } + + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + tline.gc()?; + } + + Ok(()) + } + + #[tokio::test] + async fn test_traverse_branches() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_traverse_branches")?.load(); + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + + const NUM_KEYS: usize = 1000; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + + let mut keyspace = KeySpaceAccum::new(); + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + + keyspace.add_key(test_key); + } + + let mut tline_id = TIMELINE_ID; + for _ in 0..50 { + let new_tline_id = TimelineId::generate(); + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant + .get_timeline(new_tline_id, true) + .expect("Should have the branched timeline"); + tline_id = new_tline_id; + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + println!("updating {} at {}", blknum, lsn); + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); + } + + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; + tline.checkpoint(CheckpointConfig::Forced).await?; + tline.compact()?; + tline.gc()?; + } + + Ok(()) + } + + #[test] + fn test_traverse_ancestors() -> anyhow::Result<()> { + let tenant = TenantHarness::create("test_traverse_ancestors")?.load(); + let mut tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? + .initialize()?; + + const NUM_KEYS: usize = 100; + const NUM_TLINES: usize = 50; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + // Track page mutation lsns across different timelines. + let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; + + let mut lsn = Lsn(0); + let mut tline_id = TIMELINE_ID; + + #[allow(clippy::needless_range_loop)] + for idx in 0..NUM_TLINES { + let new_tline_id = TimelineId::generate(); + tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tline = tenant + .get_timeline(new_tline_id, true) + .expect("Should have the branched timeline"); + tline_id = new_tline_id; + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + )?; + println!("updating [{}][{}] at {}", idx, blknum, lsn); + writer.finish_write(lsn); + drop(writer); + updated[idx][blknum] = lsn; + } + } + + // Read pages from leaf timeline across all ancestors. + for (idx, lsns) in updated.iter().enumerate() { + for (blknum, lsn) in lsns.iter().enumerate() { + // Skip empty mutations. 
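+                // (Blocks that the RNG never picked for this timeline still hold
+                // the initial Lsn(0) in 'updated'.)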
+ if lsn.0 == 0 { + continue; + } + println!("checking [{idx}][{blknum}] at {lsn}"); + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, *lsn)?, + TEST_IMG(&format!("{idx} {blknum} at {lsn}")) + ); + } + } + Ok(()) + } +} diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs new file mode 100644 index 0000000000..52eafc72ee --- /dev/null +++ b/pageserver/src/tenant/blob_io.rs @@ -0,0 +1,174 @@ +//! +//! Functions for reading and writing variable-sized "blobs". +//! +//! Each blob begins with a 1- or 4-byte length field, followed by the +//! actual data. If the length is smaller than 128 bytes, the length +//! is written as a one byte. If it's larger than that, the length +//! is written as a four-byte integer, in big-endian, with the high +//! bit set. This way, we can detect whether it's 1- or 4-byte header +//! by peeking at the first byte. +//! +//! len < 128: 0XXXXXXX +//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! +use crate::page_cache::PAGE_SZ; +use crate::tenant::block_io::{BlockCursor, BlockReader}; +use std::cmp::min; +use std::io::{Error, ErrorKind}; + +/// For reading +pub trait BlobCursor { + /// Read a blob into a new buffer. + fn read_blob(&mut self, offset: u64) -> Result, std::io::Error> { + let mut buf = Vec::new(); + self.read_blob_into_buf(offset, &mut buf)?; + Ok(buf) + } + + /// Read blob into the given buffer. Any previous contents in the buffer + /// are overwritten. + fn read_blob_into_buf( + &mut self, + offset: u64, + dstbuf: &mut Vec, + ) -> Result<(), std::io::Error>; +} + +impl BlobCursor for BlockCursor +where + R: BlockReader, +{ + fn read_blob_into_buf( + &mut self, + offset: u64, + dstbuf: &mut Vec, + ) -> Result<(), std::io::Error> { + let mut blknum = (offset / PAGE_SZ as u64) as u32; + let mut off = (offset % PAGE_SZ as u64) as usize; + + let mut buf = self.read_blk(blknum)?; + + // peek at the first byte, to determine if it's a 1- or 4-byte length + let first_len_byte = buf[off]; + let len: usize = if first_len_byte < 0x80 { + // 1-byte length header + off += 1; + first_len_byte as usize + } else { + // 4-byte length header + let mut len_buf = [0u8; 4]; + let thislen = PAGE_SZ - off; + if thislen < 4 { + // it is split across two pages + len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]); + blknum += 1; + buf = self.read_blk(blknum)?; + len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]); + off = 4 - thislen; + } else { + len_buf.copy_from_slice(&buf[off..off + 4]); + off += 4; + } + len_buf[0] &= 0x7f; + u32::from_be_bytes(len_buf) as usize + }; + + dstbuf.clear(); + dstbuf.reserve(len); + + // Read the payload + let mut remain = len; + while remain > 0 { + let mut page_remain = PAGE_SZ - off; + if page_remain == 0 { + // continue on next page + blknum += 1; + buf = self.read_blk(blknum)?; + off = 0; + page_remain = PAGE_SZ; + } + let this_blk_len = min(remain, page_remain); + dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + remain -= this_blk_len; + off += this_blk_len; + } + Ok(()) + } +} + +/// +/// Abstract trait for a data sink that you can write blobs to. +/// +pub trait BlobWriter { + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + fn write_blob(&mut self, srcbuf: &[u8]) -> Result; +} + +/// +/// An implementation of BlobWriter to write blobs to anything that +/// implements std::io::Write. 
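+///
+/// A minimal usage sketch, writing into an in-memory `Vec<u8>` (which
+/// implements `std::io::Write`):
+///
+/// ```no_run
+/// # use pageserver::tenant::blob_io::{BlobWriter, WriteBlobWriter};
+/// let mut writer = WriteBlobWriter::new(Vec::<u8>::new(), 0);
+/// // A short blob gets a 1-byte length header...
+/// let off_a = writer.write_blob(b"short blob").unwrap();
+/// // ...and a blob of 128 bytes or more gets a 4-byte header.
+/// let off_b = writer.write_blob(&[0u8; 200]).unwrap();
+/// assert_eq!(off_a, 0);
+/// assert_eq!(off_b, 1 + b"short blob".len() as u64);
+/// ```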
+/// +pub struct WriteBlobWriter +where + W: std::io::Write, +{ + inner: W, + offset: u64, +} + +impl WriteBlobWriter +where + W: std::io::Write, +{ + pub fn new(inner: W, start_offset: u64) -> Self { + WriteBlobWriter { + inner, + offset: start_offset, + } + } + + pub fn size(&self) -> u64 { + self.offset + } + + /// Access the underlying Write object. + /// + /// NOTE: WriteBlobWriter keeps track of the current write offset. If + /// you write something directly to the inner Write object, it makes the + /// internally tracked 'offset' to go out of sync. So don't do that. + pub fn into_inner(self) -> W { + self.inner + } +} + +impl BlobWriter for WriteBlobWriter +where + W: std::io::Write, +{ + fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + let offset = self.offset; + + if srcbuf.len() < 128 { + // Short blob. Write a 1-byte length header + let len_buf = srcbuf.len() as u8; + self.inner.write_all(&[len_buf])?; + self.offset += 1; + } else { + // Write a 4-byte length header + if srcbuf.len() > 0x7fff_ffff { + return Err(Error::new( + ErrorKind::Other, + format!("blob too large ({} bytes)", srcbuf.len()), + )); + } + let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); + len_buf[0] |= 0x80; + self.inner.write_all(&len_buf)?; + self.offset += 4; + } + self.inner.write_all(srcbuf)?; + self.offset += srcbuf.len() as u64; + Ok(offset) + } +} diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs new file mode 100644 index 0000000000..bbcdabe1cd --- /dev/null +++ b/pageserver/src/tenant/block_io.rs @@ -0,0 +1,223 @@ +//! +//! Low-level Block-oriented I/O functions +//! + +use crate::page_cache; +use crate::page_cache::{ReadBufResult, PAGE_SZ}; +use bytes::Bytes; +use once_cell::sync::Lazy; +use std::ops::{Deref, DerefMut}; +use std::os::unix::fs::FileExt; +use std::sync::atomic::AtomicU64; + +/// This is implemented by anything that can read 8 kB (PAGE_SZ) +/// blocks, using the page cache +/// +/// There are currently two implementations: EphemeralFile, and FileBlockReader +/// below. +pub trait BlockReader { + type BlockLease: Deref + 'static; + + /// + /// Read a block. Returns a "lease" object that can be used to + /// access to the contents of the page. (For the page cache, the + /// lease object represents a lock on the buffer.) + /// + fn read_blk(&self, blknum: u32) -> Result; + + /// + /// Create a new "cursor" for reading from this reader. + /// + /// A cursor caches the last accessed page, allowing for faster + /// access if the same block is accessed repeatedly. + fn block_cursor(&self) -> BlockCursor<&Self> + where + Self: Sized, + { + BlockCursor::new(self) + } +} + +impl BlockReader for &B +where + B: BlockReader, +{ + type BlockLease = B::BlockLease; + + fn read_blk(&self, blknum: u32) -> Result { + (*self).read_blk(blknum) + } +} + +/// +/// A "cursor" for efficiently reading multiple pages from a BlockReader +/// +/// A cursor caches the last accessed page, allowing for faster access if the +/// same block is accessed repeatedly. +/// +/// You can access the last page with `*cursor`. 'read_blk' returns 'self', so +/// that in many cases you can use a BlockCursor as a drop-in replacement for +/// the underlying BlockReader. 
For example: +/// +/// ```no_run +/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; +/// # let reader: FileBlockReader = todo!(); +/// let cursor = reader.block_cursor(); +/// let buf = cursor.read_blk(1); +/// // do stuff with 'buf' +/// let buf = cursor.read_blk(2); +/// // do stuff with 'buf' +/// ``` +/// +pub struct BlockCursor +where + R: BlockReader, +{ + reader: R, + /// last accessed page + cache: Option<(u32, R::BlockLease)>, +} + +impl BlockCursor +where + R: BlockReader, +{ + pub fn new(reader: R) -> Self { + BlockCursor { + reader, + cache: None, + } + } + + pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> { + // Fast return if this is the same block as before + if let Some((cached_blk, _buf)) = &self.cache { + if *cached_blk == blknum { + return Ok(self); + } + } + + // Read the block from the underlying reader, and cache it + self.cache = None; + let buf = self.reader.read_blk(blknum)?; + self.cache = Some((blknum, buf)); + + Ok(self) + } +} + +impl Deref for BlockCursor +where + R: BlockReader, +{ + type Target = [u8; PAGE_SZ]; + + fn deref(&self) -> &::Target { + &self.cache.as_ref().unwrap().1 + } +} + +static NEXT_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); + +/// An adapter for reading a (virtual) file using the page cache. +/// +/// The file is assumed to be immutable. This doesn't provide any functions +/// for modifying the file, nor for invalidating the cache if it is modified. +pub struct FileBlockReader { + pub file: F, + + /// Unique ID of this file, used as key in the page cache. + file_id: u64, +} + +impl FileBlockReader +where + F: FileExt, +{ + pub fn new(file: F) -> Self { + let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + FileBlockReader { file_id, file } + } + + /// Read a page from the underlying file into given buffer. + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + assert!(buf.len() == PAGE_SZ); + self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + } +} + +impl BlockReader for FileBlockReader +where + F: FileExt, +{ + type BlockLease = page_cache::PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + // Look up the right page + let cache = page_cache::get(); + loop { + match cache + .read_immutable_buf(self.file_id, blknum) + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!("Failed to read immutable buf: {e:#}"), + ) + })? { + ReadBufResult::Found(guard) => break Ok(guard), + ReadBufResult::NotFound(mut write_guard) => { + // Read the page from disk into the buffer + self.fill_buffer(write_guard.deref_mut(), blknum)?; + write_guard.mark_valid(); + + // Swap for read lock + continue; + } + }; + } + } +} + +/// +/// Trait for block-oriented output +/// +pub trait BlockWriter { + /// + /// Write a page to the underlying storage. + /// + /// 'buf' must be of size PAGE_SZ. Returns the block number the page was + /// written to. + /// + fn write_blk(&mut self, buf: Bytes) -> Result; +} + +/// +/// A simple in-memory buffer of blocks. 
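+///
+/// A minimal sketch of filling a `BlockBuf` one page at a time:
+///
+/// ```no_run
+/// # use bytes::Bytes;
+/// # use pageserver::tenant::block_io::{BlockBuf, BlockWriter};
+/// let mut buf = BlockBuf::new();
+/// // Each block must be exactly PAGE_SZ (8192) bytes.
+/// let blknum = buf.write_blk(Bytes::from(vec![0u8; 8192])).unwrap();
+/// assert_eq!(blknum, 0);
+/// assert_eq!(buf.size(), 8192);
+/// ```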
+/// +pub struct BlockBuf { + pub blocks: Vec, +} +impl BlockWriter for BlockBuf { + fn write_blk(&mut self, buf: Bytes) -> Result { + assert!(buf.len() == PAGE_SZ); + let blknum = self.blocks.len(); + self.blocks.push(buf); + Ok(blknum as u32) + } +} + +impl BlockBuf { + pub fn new() -> Self { + BlockBuf { blocks: Vec::new() } + } + + pub fn size(&self) -> u64 { + (self.blocks.len() * PAGE_SZ) as u64 + } +} +impl Default for BlockBuf { + fn default() -> Self { + Self::new() + } +} diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/delta_layer.rs new file mode 100644 index 0000000000..dcd6956640 --- /dev/null +++ b/pageserver/src/tenant/delta_layer.rs @@ -0,0 +1,1014 @@ +//! A DeltaLayer represents a collection of WAL records or page images in a range of +//! LSNs, and in a range of Keys. It is stored on a file on disk. +//! +//! Usually a delta layer only contains differences, in the form of WAL records +//! against a base LSN. However, if a relation extended or a whole new relation +//! is created, there would be no base for the new pages. The entries for them +//! must be page images or WAL records with the 'will_init' flag set, so that +//! they can be replayed without referring to an older page version. +//! +//! The delta files are stored in timelines/ directory. Currently, +//! there are no subdirectories, and each delta file is named like this: +//! +//! -__-, + lsn_range: Range, + + /// Block number where the 'index' part of the file begins. + index_start_blk: u32, + /// Block within the 'index', where the B-tree root page is stored + index_root_blk: u32, +} + +impl From<&DeltaLayer> for Summary { + fn from(layer: &DeltaLayer) -> Self { + Self { + magic: DELTA_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, + + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, + key_range: layer.key_range.clone(), + lsn_range: layer.lsn_range.clone(), + + index_start_blk: 0, + index_root_blk: 0, + } + } +} + +// Flag indicating that this version initialize the page +const WILL_INIT: u64 = 1; + +/// +/// Struct representing reference to BLOB in layers. Reference contains BLOB +/// offset, and for WAL records it also contains `will_init` flag. The flag +/// helps to determine the range of records that needs to be applied, without +/// reading/deserializing records themselves. +/// +#[derive(Debug, Serialize, Deserialize, Copy, Clone)] +struct BlobRef(u64); + +impl BlobRef { + pub fn will_init(&self) -> bool { + (self.0 & WILL_INIT) != 0 + } + + pub fn pos(&self) -> u64 { + self.0 >> 1 + } + + pub fn new(pos: u64, will_init: bool) -> BlobRef { + let mut blob_ref = pos << 1; + if will_init { + blob_ref |= WILL_INIT; + } + BlobRef(blob_ref) + } +} + +const DELTA_KEY_SIZE: usize = KEY_SIZE + 8; +struct DeltaKey([u8; DELTA_KEY_SIZE]); + +/// +/// This is the key of the B-tree index stored in the delta layer. It consists +/// of the serialized representation of a Key and LSN. 
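+///
+/// The first KEY_SIZE bytes hold the serialized `Key`, followed by the `Lsn`
+/// as a big-endian `u64`, which is what lets the B-tree order and compare
+/// entries byte-wise.
+///
+/// A round-trip sketch (`DeltaKey` is internal, so this is not a doctest):
+///
+/// ```ignore
+/// let dk = DeltaKey::from_key_lsn(&key, Lsn(0x10));
+/// assert_eq!(dk.lsn(), Lsn(0x10));
+/// assert_eq!(dk.key(), key);
+/// ```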
+/// +impl DeltaKey { + fn from_slice(buf: &[u8]) -> Self { + let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; + bytes.copy_from_slice(buf); + DeltaKey(bytes) + } + + fn from_key_lsn(key: &Key, lsn: Lsn) -> Self { + let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE]; + key.write_to_byte_slice(&mut bytes[0..KEY_SIZE]); + bytes[KEY_SIZE..].copy_from_slice(&u64::to_be_bytes(lsn.0)); + DeltaKey(bytes) + } + + fn key(&self) -> Key { + Key::from_slice(&self.0) + } + + fn lsn(&self) -> Lsn { + Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap())) + } + + fn extract_key_from_buf(buf: &[u8]) -> Key { + Key::from_slice(&buf[..KEY_SIZE]) + } + + fn extract_lsn_from_buf(buf: &[u8]) -> Lsn { + let mut lsn_buf = [0u8; 8]; + lsn_buf.copy_from_slice(&buf[KEY_SIZE..]); + Lsn(u64::from_be_bytes(lsn_buf)) + } +} + +/// +/// DeltaLayer is the in-memory data structure associated with an +/// on-disk delta file. We keep a DeltaLayer in memory for each +/// file, in the LayerMap. If a layer is in "loaded" state, we have a +/// copy of the index in memory, in 'inner'. Otherwise the struct is +/// just a placeholder for a file that exists on disk, and it needs to +/// be loaded before using it in queries. +/// +pub struct DeltaLayer { + path_or_conf: PathOrConf, + + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + pub lsn_range: Range, + + inner: RwLock, +} + +pub struct DeltaLayerInner { + /// If false, the fields below have not been loaded into memory yet. + loaded: bool, + + // values copied from summary + index_start_blk: u32, + index_root_blk: u32, + + /// Reader object for reading blocks from the file. (None if not loaded yet) + file: Option>, +} + +impl Layer for DeltaLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn filename(&self) -> PathBuf { + PathBuf::from(self.layer_name().to_string()) + } + + fn local_path(&self) -> Option { + Some(self.path()) + } + + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + ensure!(lsn_range.start >= self.lsn_range.start); + let mut need_image = true; + + ensure!(self.key_range.contains(&key)); + + { + // Open the file and lock the metadata in memory + let inner = self.load()?; + + // Scan the page versions backwards, starting from `lsn`. 
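+            // The index is scanned backwards (newest LSN first) so the loop can
+            // stop at the first page image or at a WAL record with 'will_init'
+            // set; older entries are not needed to reconstruct the page.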
+ let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); + + let mut offsets: Vec<(Lsn, u64)> = Vec::new(); + + tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| { + let blob_ref = BlobRef(value); + if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { + return false; + } + let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } + offsets.push((entry_lsn, blob_ref.pos())); + + !blob_ref.will_init() + })?; + + // Ok, 'offsets' now contains the offsets of all the entries we need to read + let mut cursor = file.block_cursor(); + let mut buf = Vec::new(); + for (entry_lsn, pos) in offsets { + cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + file.file.path.display() + ) + })?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((entry_lsn, img)); + need_image = false; + break; + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } + } + } + // release metadata lock and close the file + } + + // If an older page image is needed to reconstruct the page, let the + // caller know. + if need_image { + Ok(ValueReconstructResult::Continue) + } else { + Ok(ValueReconstructResult::Complete) + } + } + + fn iter<'a>(&'a self) -> Box> + 'a> { + let inner = match self.load() { + Ok(inner) => inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; + + match DeltaValueIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(err) => Box::new(std::iter::once(Err(err))), + } + } + + fn key_iter<'a>(&'a self) -> Box + 'a> { + let inner = match self.load() { + Ok(inner) => inner, + Err(e) => panic!("Failed to load a delta layer: {e:?}"), + }; + + match DeltaKeyIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(e) => panic!("Layer index is corrupted: {e:?}"), + } + } + + fn delete(&self) -> Result<()> { + // delete underlying file + fs::remove_file(self.path())?; + Ok(()) + } + + fn is_incremental(&self) -> bool { + true + } + + fn is_in_memory(&self) -> bool { + false + } + + /// debugging function to print out the contents of the layer + fn dump(&self, verbose: bool) -> Result<()> { + println!( + "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenant_id, + self.timeline_id, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + if !verbose { + return Ok(()); + } + + let inner = self.load()?; + + println!( + "index_start_blk: {}, root {}", + inner.index_start_blk, inner.index_root_blk + ); + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + tree_reader.dump()?; + + let mut cursor = file.block_cursor(); + + // A subroutine to dump a single blob + let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { + let buf = cursor.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + let desc = match val { + Value::Image(img) => { + 
format!(" img {} bytes", img.len()) + } + Value::WalRecord(rec) => { + let wal_desc = walrecord::describe_wal_record(&rec)?; + format!( + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + ) + } + }; + Ok(desc) + }; + + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |delta_key, val| { + let blob_ref = BlobRef(val); + let key = DeltaKey::extract_key_from_buf(delta_key); + let lsn = DeltaKey::extract_lsn_from_buf(delta_key); + + let desc = match dump_blob(blob_ref) { + Ok(desc) => desc, + Err(err) => format!("ERROR: {}", err), + }; + println!(" key {} at {}: {}", key, lsn, desc); + true + }, + )?; + + Ok(()) + } +} + +impl DeltaLayer { + fn path_for( + path_or_conf: &PathOrConf, + timeline_id: TimelineId, + tenant_id: TenantId, + fname: &DeltaFileName, + ) -> PathBuf { + match path_or_conf { + PathOrConf::Path(path) => path.clone(), + PathOrConf::Conf(conf) => conf + .timeline_path(&timeline_id, &tenant_id) + .join(fname.to_string()), + } + } + + fn temp_path_for( + conf: &PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_start: Key, + lsn_range: &Range, + ) -> PathBuf { + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); + + conf.timeline_path(&timeline_id, &tenant_id).join(format!( + "{}-XXX__{:016X}-{:016X}.{}.{}", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end), + rand_string, + TEMP_FILE_SUFFIX, + )) + } + + /// + /// Open the underlying file and read the metadata into memory, if it's + /// not loaded already. + /// + fn load(&self) -> Result> { + loop { + // Quick exit if already loaded + let inner = self.inner.read().unwrap(); + if inner.loaded { + return Ok(inner); + } + + // Need to open the file and load the metadata. Upgrade our lock to + // a write lock. (Or rather, release and re-lock in write mode.) + drop(inner); + let inner = self.inner.write().unwrap(); + if !inner.loaded { + self.load_inner(inner).with_context(|| { + format!("Failed to load delta layer {}", self.path().display()) + })?; + } else { + // Another thread loaded it while we were not holding the lock. + } + + // We now have the file open and loaded. There's no function to do + // that in the std library RwLock, so we have to release and re-lock + // in read mode. (To be precise, the lock guard was moved in the + // above call to `load_inner`, so it's already been released). And + // while we do that, another thread could unload again, so we have + // to re-check and retry if that happens. + } + } + + fn load_inner(&self, mut inner: RwLockWriteGuard) -> Result<()> { + let path = self.path(); + + // Open the file if it's not open already. + if inner.file.is_none() { + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + inner.file = Some(FileBlockReader::new(file)); + } + let file = inner.file.as_mut().unwrap(); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + + match &self.path_or_conf { + PathOrConf::Conf(_) => { + let mut expected_summary = Summary::from(self); + expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; + if actual_summary != expected_summary { + bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); + } + } + PathOrConf::Path(path) => { + let actual_filename = Path::new(path.file_name().unwrap()); + let expected_filename = self.filename(); + + if actual_filename != expected_filename { + println!( + "warning: filename does not match what is expected from in-file summary" + ); + println!("actual: {:?}", actual_filename); + println!("expected: {:?}", expected_filename); + } + } + } + + inner.index_start_blk = actual_summary.index_start_blk; + inner.index_root_blk = actual_summary.index_root_blk; + + debug!("loaded from {}", &path.display()); + + inner.loaded = true; + Ok(()) + } + + /// Create a DeltaLayer struct representing an existing file on disk. + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + filename: &DeltaFileName, + ) -> DeltaLayer { + DeltaLayer { + path_or_conf: PathOrConf::Conf(conf), + timeline_id, + tenant_id, + key_range: filename.key_range.clone(), + lsn_range: filename.lsn_range.clone(), + inner: RwLock::new(DeltaLayerInner { + loaded: false, + file: None, + index_start_blk: 0, + index_root_blk: 0, + }), + } + } + + /// Create a DeltaLayer struct representing an existing file on disk. + /// + /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + pub fn new_for_path(path: &Path, file: F) -> Result + where + F: FileExt, + { + let mut summary_buf = Vec::new(); + summary_buf.resize(PAGE_SZ, 0); + file.read_exact_at(&mut summary_buf, 0)?; + let summary = Summary::des_prefix(&summary_buf)?; + + Ok(DeltaLayer { + path_or_conf: PathOrConf::Path(path.to_path_buf()), + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, + key_range: summary.key_range, + lsn_range: summary.lsn_range, + inner: RwLock::new(DeltaLayerInner { + loaded: false, + file: None, + index_start_blk: 0, + index_root_blk: 0, + }), + }) + } + + fn layer_name(&self) -> DeltaFileName { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + } + + /// Path to the layer file in pageserver workdir. + pub fn path(&self) -> PathBuf { + Self::path_for( + &self.path_or_conf, + self.timeline_id, + self.tenant_id, + &self.layer_name(), + ) + } +} + +/// A builder object for constructing a new delta layer. +/// +/// Usage: +/// +/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) +/// +/// 2. Write the contents by calling `put_value` for every page +/// version to store in the layer. +/// +/// 3. Call `finish`. +/// +struct DeltaLayerWriterInner { + conf: &'static PageServerConf, + pub path: PathBuf, + timeline_id: TimelineId, + tenant_id: TenantId, + + key_start: Key, + lsn_range: Range, + + tree: DiskBtreeBuilder, + + blob_writer: WriteBlobWriter>, +} + +impl DeltaLayerWriterInner { + /// + /// Start building a new delta layer. + /// + fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_start: Key, + lsn_range: Range, + ) -> anyhow::Result { + // Create the file initially with a temporary filename. We don't know + // the end key yet, so we cannot form the final filename yet. We will + // rename it when we're done. + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? 
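+        // The temporary name keeps the start key and LSN range of the final
+        // layer name, uses "XXX" in place of the still-unknown end key, and adds
+        // a random string plus TEMP_FILE_SUFFIX to mark it as in-progress.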
+ let path = DeltaLayer::temp_path_for(conf, timeline_id, tenant_id, key_start, &lsn_range); + + let mut file = VirtualFile::create(&path)?; + // make room for the header block + file.seek(SeekFrom::Start(PAGE_SZ as u64))?; + let buf_writer = BufWriter::new(file); + let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64); + + // Initialize the b-tree index builder + let block_buf = BlockBuf::new(); + let tree_builder = DiskBtreeBuilder::new(block_buf); + + Ok(Self { + conf, + path, + timeline_id, + tenant_id, + key_start, + lsn_range, + tree: tree_builder, + blob_writer, + }) + } + + /// + /// Append a key-value pair to the file. + /// + /// The values must be appended in key, lsn order. + /// + fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) + } + + fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> anyhow::Result<()> { + assert!(self.lsn_range.start <= lsn); + + let off = self.blob_writer.write_blob(val)?; + + let blob_ref = BlobRef::new(off, will_init); + + let delta_key = DeltaKey::from_key_lsn(&key, lsn); + self.tree.append(&delta_key.0, blob_ref.0)?; + + Ok(()) + } + + fn size(&self) -> u64 { + self.blob_writer.size() + self.tree.borrow_writer().size() + } + + /// + /// Finish writing the delta layer. + /// + fn finish(self, key_end: Key) -> anyhow::Result { + let index_start_blk = + ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + + let buf_writer = self.blob_writer.into_inner(); + let mut file = buf_writer.into_inner()?; + + // Write out the index + let (index_root_blk, block_buf) = self.tree.finish()?; + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + for buf in block_buf.blocks { + file.write_all(buf.as_ref())?; + } + assert!(self.lsn_range.start < self.lsn_range.end); + // Fill in the summary on blk 0 + let summary = Summary { + magic: DELTA_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), + index_start_blk, + index_root_blk, + }; + file.seek(SeekFrom::Start(0))?; + Summary::ser_into(&summary, &mut file)?; + + // Note: Because we opened the file in write-only mode, we cannot + // reuse the same VirtualFile for reading later. That's why we don't + // set inner.file here. The first read will have to re-open it. + let layer = DeltaLayer { + path_or_conf: PathOrConf::Conf(self.conf), + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), + inner: RwLock::new(DeltaLayerInner { + loaded: false, + file: None, + index_start_blk, + index_root_blk, + }), + }; + + // fsync the file + file.sync_all()?; + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? + let final_path = DeltaLayer::path_for( + &PathOrConf::Conf(self.conf), + self.timeline_id, + self.tenant_id, + &DeltaFileName { + key_range: self.key_start..key_end, + lsn_range: self.lsn_range, + }, + ); + std::fs::rename(self.path, &final_path)?; + + trace!("created delta layer {}", final_path.display()); + + Ok(layer) + } +} + +/// A builder object for constructing a new delta layer. +/// +/// Usage: +/// +/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) +/// +/// 2. 
Write the contents by calling `put_value` for every page +/// version to store in the layer. +/// +/// 3. Call `finish`. +/// +/// # Note +/// +/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// possible for the writer to drop before `finish` is actually called. So this +/// could lead to odd temporary files in the directory, exhausting file system. +/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop` +/// implementation that cleans up the temporary file in failure. It's not +/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves +/// out some fields, making it impossible to implement `Drop`. +/// +#[must_use] +pub struct DeltaLayerWriter { + inner: Option, +} + +impl DeltaLayerWriter { + /// + /// Start building a new delta layer. + /// + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_start: Key, + lsn_range: Range, + ) -> anyhow::Result { + Ok(Self { + inner: Some(DeltaLayerWriterInner::new( + conf, + timeline_id, + tenant_id, + key_start, + lsn_range, + )?), + }) + } + + /// + /// Append a key-value pair to the file. + /// + /// The values must be appended in key, lsn order. + /// + pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_value(key, lsn, val) + } + + pub fn put_value_bytes( + &mut self, + key: Key, + lsn: Lsn, + val: &[u8], + will_init: bool, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value_bytes(key, lsn, val, will_init) + } + + pub fn size(&self) -> u64 { + self.inner.as_ref().unwrap().size() + } + + /// + /// Finish writing the delta layer. + /// + pub fn finish(mut self, key_end: Key) -> anyhow::Result { + self.inner.take().unwrap().finish(key_end) + } +} + +impl Drop for DeltaLayerWriter { + fn drop(&mut self) { + if let Some(inner) = self.inner.take() { + match inner.blob_writer.into_inner().into_inner() { + Ok(vfile) => vfile.remove(), + Err(err) => warn!( + "error while flushing buffer of image layer temporary file: {}", + err + ), + } + } + } +} + +/// +/// Iterator over all key-value pairse stored in a delta layer +/// +/// FIXME: This creates a Vector to hold the offsets of all key value pairs. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. 
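+///
+/// A consumption sketch, going through the `Layer::iter` entry point that
+/// constructs this iterator internally:
+///
+/// ```ignore
+/// for entry in delta_layer.iter() {
+///     let (key, lsn, value) = entry?;
+///     // handle one (key, lsn, value) triple
+/// }
+/// ```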
+/// +struct DeltaValueIter<'a> { + all_offsets: Vec<(DeltaKey, BlobRef)>, + next_idx: usize, + reader: BlockCursor>, +} + +struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>); + +impl<'a> BlockReader for Adapter<'a> { + type BlockLease = PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + self.0.file.as_ref().unwrap().read_blk(blknum) + } +} + +impl<'a> Iterator for DeltaValueIter<'a> { + type Item = Result<(Key, Lsn, Value)>; + + fn next(&mut self) -> Option { + self.next_res().transpose() + } +} + +impl<'a> DeltaValueIter<'a> { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + all_offsets.push((DeltaKey::from_slice(key), BlobRef(value))); + true + }, + )?; + + let iter = DeltaValueIter { + all_offsets, + next_idx: 0, + reader: BlockCursor::new(Adapter(inner)), + }; + + Ok(iter) + } + + fn next_res(&mut self) -> Result> { + if self.next_idx < self.all_offsets.len() { + let (delta_key, blob_ref) = &self.all_offsets[self.next_idx]; + + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + let buf = self.reader.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + self.next_idx += 1; + Ok(Some((key, lsn, val))) + } else { + Ok(None) + } + } +} +/// +/// Iterator over all keys stored in a delta layer +/// +/// FIXME: This creates a Vector to hold all keys. +/// That takes up quite a lot of memory. Should do this in a more streaming +/// fashion. +/// +struct DeltaKeyIter { + all_keys: Vec<(DeltaKey, u64)>, + next_idx: usize, +} + +impl Iterator for DeltaKeyIter { + type Item = (Key, Lsn, u64); + + fn next(&mut self) -> Option { + if self.next_idx < self.all_keys.len() { + let (delta_key, size) = &self.all_keys[self.next_idx]; + + let key = delta_key.key(); + let lsn = delta_key.lsn(); + + self.next_idx += 1; + Some((key, lsn, *size)) + } else { + None + } + } +} + +impl<'a> DeltaKeyIter { + fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new(); + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let pos = BlobRef(value).pos(); + if let Some(last) = all_keys.last_mut() { + if last.0.key() == delta_key.key() { + return true; + } else { + // subtract offset of new key BLOB and first blob of this key + // to get total size if values associated with this key + let first_pos = last.1; + last.1 = pos - first_pos; + } + } + all_keys.push((delta_key, pos)); + true + }, + )?; + if let Some(last) = all_keys.last_mut() { + // Last key occupies all space till end of layer + last.1 = std::fs::metadata(&file.file.path)?.len() - last.1; + } + let iter = DeltaKeyIter { + all_keys, + next_idx: 0, + }; + + Ok(iter) + } +} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs new file mode 100644 index 0000000000..33255dbd82 --- /dev/null +++ b/pageserver/src/tenant/disk_btree.rs @@ -0,0 +1,1033 @@ +//! +//! Simple on-disk B-tree implementation +//! +//! 
This is used as the index structure within image and delta layers +//! +//! Features: +//! - Fixed-width keys +//! - Fixed-width values (VALUE_SZ) +//! - The tree is created in a bulk operation. Insert/deletion after creation +//! is not supported +//! - page-oriented +//! +//! TODO: +//! - maybe something like an Adaptive Radix Tree would be more efficient? +//! - the values stored by image and delta layers are offsets into the file, +//! and they are in monotonically increasing order. Prefix compression would +//! be very useful for them, too. +//! - An Iterator interface would be more convenient for the callers than the +//! 'visit' function +//! +use byteorder::{ReadBytesExt, BE}; +use bytes::{BufMut, Bytes, BytesMut}; +use hex; +use std::{cmp::Ordering, io, result}; +use thiserror::Error; +use tracing::error; + +use crate::tenant::block_io::{BlockReader, BlockWriter}; + +// The maximum size of a value stored in the B-tree. 5 bytes is enough currently. +pub const VALUE_SZ: usize = 5; +pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; + +#[allow(dead_code)] +pub const PAGE_SZ: usize = 8192; + +#[derive(Clone, Copy, Debug)] +struct Value([u8; VALUE_SZ]); + +impl Value { + fn from_slice(slice: &[u8]) -> Value { + let mut b = [0u8; VALUE_SZ]; + b.copy_from_slice(slice); + Value(b) + } + + fn from_u64(x: u64) -> Value { + assert!(x <= 0x007f_ffff_ffff); + Value([ + (x >> 32) as u8, + (x >> 24) as u8, + (x >> 16) as u8, + (x >> 8) as u8, + x as u8, + ]) + } + + fn from_blknum(x: u32) -> Value { + Value([ + 0x80, + (x >> 24) as u8, + (x >> 16) as u8, + (x >> 8) as u8, + x as u8, + ]) + } + + #[allow(dead_code)] + fn is_offset(self) -> bool { + self.0[0] & 0x80 != 0 + } + + fn to_u64(self) -> u64 { + let b = &self.0; + (b[0] as u64) << 32 + | (b[1] as u64) << 24 + | (b[2] as u64) << 16 + | (b[3] as u64) << 8 + | b[4] as u64 + } + + fn to_blknum(self) -> u32 { + let b = &self.0; + assert!(b[0] == 0x80); + (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32 + } +} + +#[derive(Error, Debug)] +pub enum DiskBtreeError { + #[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)] + AppendOverflow(u64), + + #[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")] + UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> }, + + #[error("Could not push to new leaf node")] + FailedToPushToNewLeafNode, + + #[error("IoError: {0}")] + Io(#[from] io::Error), +} + +pub type Result = result::Result; + +/// This is the on-disk representation. +struct OnDiskNode<'a, const L: usize> { + // Fixed-width fields + num_children: u16, + level: u8, + prefix_len: u8, + suffix_len: u8, + + // Variable-length fields. These are stored on-disk after the fixed-width + // fields, in this order. In the in-memory representation, these point to + // the right parts in the page buffer. + prefix: &'a [u8], + keys: &'a [u8], + values: &'a [u8], +} + +impl<'a, const L: usize> OnDiskNode<'a, L> { + /// + /// Interpret a PAGE_SZ page as a node. 
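+    ///
+    /// The byte layout parsed here is, roughly:
+    ///
+    /// ```text
+    /// num_children   u16 (big-endian)
+    /// level          u8
+    /// prefix_len     u8
+    /// suffix_len     u8
+    /// prefix         prefix_len bytes
+    /// keys           num_children * suffix_len bytes
+    /// values         num_children * VALUE_SZ bytes
+    /// ```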
+ /// + fn deparse(buf: &[u8]) -> Result> { + let mut cursor = std::io::Cursor::new(buf); + let num_children = cursor.read_u16::()?; + let level = cursor.read_u8()?; + let prefix_len = cursor.read_u8()?; + let suffix_len = cursor.read_u8()?; + + let mut off = cursor.position(); + let prefix_off = off as usize; + off += prefix_len as u64; + + let keys_off = off as usize; + let keys_len = num_children as usize * suffix_len as usize; + off += keys_len as u64; + + let values_off = off as usize; + let values_len = num_children as usize * VALUE_SZ as usize; + //off += values_len as u64; + + let prefix = &buf[prefix_off..prefix_off + prefix_len as usize]; + let keys = &buf[keys_off..keys_off + keys_len]; + let values = &buf[values_off..values_off + values_len]; + + Ok(OnDiskNode { + num_children, + level, + prefix_len, + suffix_len, + prefix, + keys, + values, + }) + } + + /// + /// Read a value at 'idx' + /// + fn value(&self, idx: usize) -> Value { + let value_off = idx * VALUE_SZ; + let value_slice = &self.values[value_off..value_off + VALUE_SZ]; + Value::from_slice(value_slice) + } + + fn binary_search( + &self, + search_key: &[u8; L], + keybuf: &mut [u8], + ) -> result::Result { + let mut size = self.num_children as usize; + let mut low = 0; + let mut high = size; + while low < high { + let mid = low + size / 2; + + let key_off = mid as usize * self.suffix_len as usize; + let suffix = &self.keys[key_off..key_off + self.suffix_len as usize]; + // Does this match? + keybuf[self.prefix_len as usize..].copy_from_slice(suffix); + + let cmp = keybuf[..].cmp(search_key); + + if cmp == Ordering::Less { + low = mid + 1; + } else if cmp == Ordering::Greater { + high = mid; + } else { + return Ok(mid); + } + size = high - low; + } + Err(low) + } +} + +/// +/// Public reader object, to search the tree. +/// +pub struct DiskBtreeReader +where + R: BlockReader, +{ + start_blk: u32, + root_blk: u32, + reader: R, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VisitDirection { + Forwards, + Backwards, +} + +impl DiskBtreeReader +where + R: BlockReader, +{ + pub fn new(start_blk: u32, root_blk: u32, reader: R) -> Self { + DiskBtreeReader { + start_blk, + root_blk, + reader, + } + } + + /// + /// Read the value for given key. Returns the value, or None if it doesn't exist. + /// + pub fn get(&self, search_key: &[u8; L]) -> Result> { + let mut result: Option = None; + self.visit(search_key, VisitDirection::Forwards, |key, value| { + if key == search_key { + result = Some(value); + } + false + })?; + Ok(result) + } + + /// + /// Scan the tree, starting from 'search_key', in the given direction. 'visitor' + /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning + /// backwards) + /// + pub fn visit( + &self, + search_key: &[u8; L], + dir: VisitDirection, + mut visitor: V, + ) -> Result + where + V: FnMut(&[u8], u64) -> bool, + { + self.search_recurse(self.root_blk, search_key, dir, &mut visitor) + } + + fn search_recurse( + &self, + node_blknum: u32, + search_key: &[u8; L], + dir: VisitDirection, + visitor: &mut V, + ) -> Result + where + V: FnMut(&[u8], u64) -> bool, + { + // Locate the node. 
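+        // Block numbers inside the tree are relative to 'start_blk', which is
+        // where the caller placed the first B-tree page (for delta and image
+        // layers, that is right after the data section of the file).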
+ let blk = self.reader.read_blk(self.start_blk + node_blknum)?; + + // Search all entries on this node + self.search_node(blk.as_ref(), search_key, dir, visitor) + } + + fn search_node( + &self, + node_buf: &[u8], + search_key: &[u8; L], + dir: VisitDirection, + visitor: &mut V, + ) -> Result + where + V: FnMut(&[u8], u64) -> bool, + { + let node = OnDiskNode::deparse(node_buf)?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + if dir == VisitDirection::Forwards { + // Locate the first match + let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + // idx points to the first match now. Keep going from there + let mut key_off = idx * suffix_len; + while idx < node.num_children as usize { + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx as usize); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + if !visitor(&keybuf, value.to_u64()) { + return Ok(false); + } + } else { + #[allow(clippy::collapsible_if)] + if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { + return Ok(false); + } + } + idx += 1; + key_off += suffix_len; + } + } else { + let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => { + // Exact match. That's the first entry to return, and walk + // backwards from there. (The loop below starts from 'idx - + // 1', so add one here to compensate.) + idx + 1 + } + Err(idx) => { + // No exact match. The binary search returned the index of the + // first key that's > search_key. Back off by one, and walk + // backwards from there. (The loop below starts from idx - 1, + // so we don't need to subtract one here) + idx + } + }; + + // idx points to the first match + 1 now. Keep going from there. + let mut key_off = idx * suffix_len; + while idx > 0 { + idx -= 1; + key_off -= suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx as usize); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + if !visitor(&keybuf, value.to_u64()) { + return Ok(false); + } + } else { + #[allow(clippy::collapsible_if)] + if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? 
{ + return Ok(false); + } + } + if idx == 0 { + break; + } + } + } + Ok(true) + } + + #[allow(dead_code)] + pub fn dump(&self) -> Result<()> { + self.dump_recurse(self.root_blk, &[], 0) + } + + fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> { + let blk = self.reader.read_blk(self.start_blk + blknum)?; + let buf: &[u8] = blk.as_ref(); + + let node = OnDiskNode::::deparse(buf)?; + + print!("{:indent$}", "", indent = depth * 2); + println!( + "blk #{}: path {}: prefix {}, suffix_len {}", + blknum, + hex::encode(path), + hex::encode(node.prefix), + node.suffix_len + ); + + let mut idx = 0; + let mut key_off = 0; + while idx < node.num_children { + let key = &node.keys[key_off..key_off + node.suffix_len as usize]; + let val = node.value(idx as usize); + print!("{:indent$}", "", indent = depth * 2 + 2); + println!("{}: {}", hex::encode(key), hex::encode(val.0)); + + if node.level > 0 { + let child_path = [path, node.prefix].concat(); + self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?; + } + idx += 1; + key_off += node.suffix_len as usize; + } + Ok(()) + } +} + +/// +/// Public builder object, for creating a new tree. +/// +/// Usage: Create a builder object by calling 'new', load all the data into the +/// tree by calling 'append' for each key-value pair, and then call 'finish' +/// +/// 'L' is the key length in bytes +pub struct DiskBtreeBuilder +where + W: BlockWriter, +{ + writer: W, + + /// + /// stack[0] is the current root page, stack.last() is the leaf. + /// + /// We maintain the length of the stack to be always greater than zero. + /// Two exceptions are: + /// 1. `Self::flush_node`. The method will push the new node if it extracted the last one. + /// So because other methods cannot see the intermediate state invariant still holds. + /// 2. `Self::finish`. It consumes self and does not return it back, + /// which means that this is where the structure is destroyed. + /// Thus stack of zero length cannot be observed by other methods. + stack: Vec>, + + /// Last key that was appended to the tree. Used to sanity check that append + /// is called in increasing key order. + last_key: Option<[u8; L]>, +} + +impl DiskBtreeBuilder +where + W: BlockWriter, +{ + pub fn new(writer: W) -> Self { + DiskBtreeBuilder { + writer, + last_key: None, + stack: vec![BuildNode::new(0)], + } + } + + pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> { + if value > MAX_VALUE { + return Err(DiskBtreeError::AppendOverflow(value)); + } + if let Some(last_key) = &self.last_key { + if key <= last_key { + return Err(DiskBtreeError::UnsortedInput { + key: key.as_slice().into(), + last_key: last_key.as_slice().into(), + }); + } + } + self.last_key = Some(*key); + + self.append_internal(key, Value::from_u64(value)) + } + + fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> { + // Try to append to the current leaf buffer + let last = self + .stack + .last_mut() + .expect("should always have at least one item"); + let level = last.level; + if last.push(key, value) { + return Ok(()); + } + + // It did not fit. Try to compress, and if it succeeds to make + // some room on the node, try appending to it again. + #[allow(clippy::collapsible_if)] + if last.compress() { + if last.push(key, value) { + return Ok(()); + } + } + + // Could not append to the current leaf. Flush it and create a new one. + self.flush_node()?; + + // Replace the node we flushed with an empty one and append the new + // key to it. 
+ let mut last = BuildNode::new(level); + if !last.push(key, value) { + return Err(DiskBtreeError::FailedToPushToNewLeafNode); + } + + self.stack.push(last); + + Ok(()) + } + + /// Flush the bottommost node in the stack to disk. Appends a downlink to its parent, + /// and recursively flushes the parent too, if it becomes full. If the root page becomes full, + /// creates a new root page, increasing the height of the tree. + fn flush_node(&mut self) -> Result<()> { + // Get the current bottommost node in the stack and flush it to disk. + let last = self + .stack + .pop() + .expect("should always have at least one item"); + let buf = last.pack(); + let downlink_key = last.first_key(); + let downlink_ptr = self.writer.write_blk(buf)?; + + // Append the downlink to the parent. If there is no parent, ie. this was the root page, + // create a new root page, increasing the height of the tree. + if self.stack.is_empty() { + self.stack.push(BuildNode::new(last.level + 1)); + } + self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr)) + } + + /// + /// Flushes everything to disk, and returns the block number of the root page. + /// The caller must store the root block number "out-of-band", and pass it + /// to the DiskBtreeReader::new() when you want to read the tree again. + /// (In the image and delta layers, it is stored in the beginning of the file, + /// in the summary header) + /// + pub fn finish(mut self) -> Result<(u32, W)> { + // flush all levels, except the root. + while self.stack.len() > 1 { + self.flush_node()?; + } + + let root = self + .stack + .first() + .expect("by the check above we left one item there"); + let buf = root.pack(); + let root_blknum = self.writer.write_blk(buf)?; + + Ok((root_blknum, self.writer)) + } + + pub fn borrow_writer(&self) -> &W { + &self.writer + } +} + +/// +/// BuildNode represesnts an incomplete page that we are appending to. +/// +#[derive(Clone, Debug)] +struct BuildNode { + num_children: u16, + level: u8, + prefix: Vec, + suffix_len: usize, + + keys: Vec, + values: Vec, + + size: usize, // physical size of this node, if it was written to disk like this +} + +const NODE_SIZE: usize = PAGE_SZ; + +const NODE_HDR_SIZE: usize = 2 + 1 + 1 + 1; + +impl BuildNode { + fn new(level: u8) -> Self { + BuildNode { + num_children: 0, + level, + prefix: Vec::new(), + suffix_len: 0, + keys: Vec::new(), + values: Vec::new(), + size: NODE_HDR_SIZE, + } + } + + /// Try to append a key-value pair to this node. Returns 'true' on + /// success, 'false' if the page was full or the key was + /// incompatible with the prefix of the existing keys. + fn push(&mut self, key: &[u8; L], value: Value) -> bool { + // If we have already performed prefix-compression on the page, + // check that the incoming key has the same prefix. + if self.num_children > 0 { + // does the prefix allow it? + if !key.starts_with(&self.prefix) { + return false; + } + } else { + self.suffix_len = key.len(); + } + + // Is the node too full? + if self.size + self.suffix_len + VALUE_SZ >= NODE_SIZE { + return false; + } + + // All clear + self.num_children += 1; + self.keys.extend(&key[self.prefix.len()..]); + self.values.extend(value.0); + + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + + self.size += self.suffix_len + VALUE_SZ; + + true + } + + /// + /// Perform prefix-compression. + /// + /// Returns 'true' on success, 'false' if no compression was possible. 
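+    ///
+    /// For example, if the node holds only the suffixes "aaabbb" and "aaaccc",
+    /// the common prefix "aaa" is appended to 'prefix' and the stored suffixes
+    /// shrink to "bbb" and "ccc", freeing prefix_len * (num_children - 1) bytes
+    /// on the page.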
+ /// + fn compress(&mut self) -> bool { + let first_suffix = self.first_suffix(); + let last_suffix = self.last_suffix(); + + // Find the common prefix among all keys + let mut prefix_len = 0; + while prefix_len < self.suffix_len { + if first_suffix[prefix_len] != last_suffix[prefix_len] { + break; + } + prefix_len += 1; + } + if prefix_len == 0 { + return false; + } + + // Can compress. Rewrite the keys without the common prefix. + self.prefix.extend(&self.keys[..prefix_len]); + + let mut new_keys = Vec::new(); + let mut key_off = 0; + while key_off < self.keys.len() { + let next_key_off = key_off + self.suffix_len; + new_keys.extend(&self.keys[key_off + prefix_len..next_key_off]); + key_off = next_key_off; + } + self.keys = new_keys; + self.suffix_len -= prefix_len; + + self.size -= prefix_len * self.num_children as usize; + self.size += prefix_len; + + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + + true + } + + /// + /// Serialize the node to on-disk format. + /// + fn pack(&self) -> Bytes { + assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.values.len() == self.num_children as usize * VALUE_SZ); + assert!(self.num_children > 0); + + let mut buf = BytesMut::new(); + + buf.put_u16(self.num_children); + buf.put_u8(self.level); + buf.put_u8(self.prefix.len() as u8); + buf.put_u8(self.suffix_len as u8); + buf.put(&self.prefix[..]); + buf.put(&self.keys[..]); + buf.put(&self.values[..]); + + assert!(buf.len() == self.size); + + assert!(buf.len() <= PAGE_SZ); + buf.resize(PAGE_SZ, 0); + buf.freeze() + } + + fn first_suffix(&self) -> &[u8] { + &self.keys[..self.suffix_len] + } + fn last_suffix(&self) -> &[u8] { + &self.keys[self.keys.len() - self.suffix_len..] + } + + /// Return the full first key of the page, including the prefix + fn first_key(&self) -> [u8; L] { + let mut key = [0u8; L]; + key[..self.prefix.len()].copy_from_slice(&self.prefix); + key[self.prefix.len()..].copy_from_slice(self.first_suffix()); + key + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::Rng; + use std::collections::BTreeMap; + use std::sync::atomic::{AtomicUsize, Ordering}; + + #[derive(Clone, Default)] + struct TestDisk { + blocks: Vec, + } + impl TestDisk { + fn new() -> Self { + Self::default() + } + } + impl BlockReader for TestDisk { + type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>; + + fn read_blk(&self, blknum: u32) -> io::Result { + let mut buf = [0u8; PAGE_SZ]; + buf.copy_from_slice(&self.blocks[blknum as usize]); + Ok(std::rc::Rc::new(buf)) + } + } + impl BlockWriter for &mut TestDisk { + fn write_blk(&mut self, buf: Bytes) -> io::Result { + let blknum = self.blocks.len(); + self.blocks.push(buf); + Ok(blknum as u32) + } + } + + #[test] + fn basic() -> Result<()> { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); + + let all_keys: Vec<&[u8; 6]> = vec![ + b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb", + ]; + let all_data: Vec<(&[u8; 6], u64)> = all_keys + .iter() + .enumerate() + .map(|(idx, key)| (*key, idx as u64)) + .collect(); + for (key, val) in all_data.iter() { + writer.append(key, *val)?; + } + + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + reader.dump()?; + + // Test the `get` function on all the keys. 
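+        // get() is a point lookup: it returns Some(value) only on an exact key
+        // match and None otherwise (internally it is a forward visit() that
+        // stops at the first key it sees).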
+ for (key, val) in all_data.iter() { + assert_eq!(reader.get(key)?, Some(*val)); + } + // And on some keys that don't exist + assert_eq!(reader.get(b"aaaaaa")?, None); + assert_eq!(reader.get(b"zzzzzz")?, None); + assert_eq!(reader.get(b"xaaabx")?, None); + + // Test search with `visit` function + let search_key = b"xabaaa"; + let expected: Vec<(Vec, u64)> = all_data + .iter() + .filter(|(key, _value)| key[..] >= search_key[..]) + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + + let mut data = Vec::new(); + reader.visit(search_key, VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + // Test a backwards scan + let mut expected: Vec<(Vec, u64)> = all_data + .iter() + .filter(|(key, _value)| key[..] <= search_key[..]) + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + expected.reverse(); + let mut data = Vec::new(); + reader.visit(search_key, VisitDirection::Backwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + // Backward scan where nothing matches + reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| { + panic!("found unexpected key {}: {}", hex::encode(key), value); + })?; + + // Full scan + let expected: Vec<(Vec, u64)> = all_data + .iter() + .map(|(key, value)| (key.to_vec(), *value)) + .collect(); + let mut data = Vec::new(); + reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + })?; + assert_eq!(data, expected); + + Ok(()) + } + + #[test] + fn lots_of_keys() -> Result<()> { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); + + const NUM_KEYS: u64 = 1000; + + let mut all_data: BTreeMap = BTreeMap::new(); + + for idx in 0..NUM_KEYS { + let key_int: u64 = 1 + idx * 2; + let key = u64::to_be_bytes(key_int); + writer.append(&key, idx)?; + + all_data.insert(key_int, idx); + } + + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + reader.dump()?; + + use std::sync::Mutex; + + let result = Mutex::new(Vec::new()); + let limit: AtomicUsize = AtomicUsize::new(10); + let take_ten = |key: &[u8], value: u64| { + let mut keybuf = [0u8; 8]; + keybuf.copy_from_slice(key); + let key_int = u64::from_be_bytes(keybuf); + + let mut result = result.lock().unwrap(); + result.push((key_int, value)); + + // keep going until we have 10 matches + result.len() < limit.load(Ordering::Relaxed) + }; + + for search_key_int in 0..(NUM_KEYS * 2 + 10) { + let search_key = u64::to_be_bytes(search_key_int); + assert_eq!( + reader.get(&search_key)?, + all_data.get(&search_key_int).cloned() + ); + + // Test a forward scan starting with this key + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + let expected = all_data + .range(search_key_int..) 
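+                // BTreeMap::range(search_key_int..) yields keys >= search_key_int
+                // in ascending order, mirroring the forward visit() above.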
+ .take(10) + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + // And a backwards scan + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + let expected = all_data + .range(..=search_key_int) + .rev() + .take(10) + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + } + + // full scan + let search_key = u64::to_be_bytes(0); + limit.store(usize::MAX, Ordering::Relaxed); + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + let expected = all_data + .iter() + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + // full scan + let search_key = u64::to_be_bytes(u64::MAX); + limit.store(usize::MAX, Ordering::Relaxed); + result.lock().unwrap().clear(); + reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + let expected = all_data + .iter() + .rev() + .map(|(&key, &val)| (key, val)) + .collect::>(); + assert_eq!(*result.lock().unwrap(), expected); + + Ok(()) + } + + #[test] + fn random_data() -> Result<()> { + // Generate random keys with exponential distribution, to + // exercise the prefix compression + const NUM_KEYS: usize = 100000; + let mut all_data: BTreeMap = BTreeMap::new(); + for idx in 0..NUM_KEYS { + let u: f64 = rand::thread_rng().gen_range(0.0..1.0); + let t = -(f64::ln(u)); + let key_int = (t * 1000000.0) as u128; + + all_data.insert(key_int as u128, idx as u64); + } + + // Build a tree from it + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 16>::new(&mut disk); + + for (&key, &val) in all_data.iter() { + writer.append(&u128::to_be_bytes(key), val)?; + } + let (root_offset, _writer) = writer.finish()?; + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + // Test get() operation on all the keys + for (&key, &val) in all_data.iter() { + let search_key = u128::to_be_bytes(key); + assert_eq!(reader.get(&search_key)?, Some(val)); + } + + // Test get() operations on random keys, most of which will not exist + for _ in 0..100000 { + let key_int = rand::thread_rng().gen::(); + let search_key = u128::to_be_bytes(key_int); + assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned()); + } + + // Test boundary cases + assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned()); + assert!(reader.get(&u128::to_be_bytes(u128::MAX))? 
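+            // u128::MAX encodes to [0xff; 16], the lexicographically largest
+            // 16-byte key, so this probes the upper edge of the keyspace.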
== all_data.get(&u128::MAX).cloned()); + + Ok(()) + } + + #[test] + fn unsorted_input() { + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk); + + let _ = writer.append(b"ba", 1); + let _ = writer.append(b"bb", 2); + let err = writer.append(b"aa", 3).expect_err("should've failed"); + match err { + DiskBtreeError::UnsortedInput { key, last_key } => { + assert_eq!(key.as_ref(), b"aa".as_slice()); + assert_eq!(last_key.as_ref(), b"bb".as_slice()); + } + _ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"), + } + } + + /// + /// This test contains a particular data set, see disk_btree_test_data.rs + /// + #[test] + fn particular_data() -> Result<()> { + // Build a tree from it + let mut disk = TestDisk::new(); + let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); + + for (key, val) in disk_btree_test_data::TEST_DATA { + writer.append(&key, val)?; + } + let (root_offset, writer) = writer.finish()?; + + println!("SIZE: {} blocks", writer.blocks.len()); + + let reader = DiskBtreeReader::new(0, root_offset, disk); + + // Test get() operation on all the keys + for (key, val) in disk_btree_test_data::TEST_DATA { + assert_eq!(reader.get(&key)?, Some(val)); + } + + // Test full scan + let mut count = 0; + reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| { + count += 1; + true + })?; + assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); + + reader.dump()?; + + Ok(()) + } +} + +#[cfg(test)] +#[path = "disk_btree_test_data.rs"] +mod disk_btree_test_data; diff --git a/pageserver/src/tenant/disk_btree_test_data.rs b/pageserver/src/tenant/disk_btree_test_data.rs new file mode 100644 index 0000000000..9462573f03 --- /dev/null +++ b/pageserver/src/tenant/disk_btree_test_data.rs @@ -0,0 +1,2013 @@ +use hex_literal::hex; + +/// Test data set for the 'particular_data' test in disk_btree.rs +/// +/// This test contains a particular data set, representing all the keys +/// generated by the 'test_random_updates' unit test. I extracted this while +/// trying to debug a failure in that test. The bug turned out to be +/// elsewhere, and I'm not sure if this is still useful, but keeping it for +/// now... Maybe it's a useful data set to show the typical key-values used +/// by a delta layer, for evaluating how well the prefix compression works. 
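+///
+/// Each entry is a raw 26-byte key paired with a u64 value. All keys share a
+/// long common prefix, which is what makes the set useful for seeing how much
+/// the builder's prefix compression saves on a realistic delta-layer index.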
+#[rustfmt::skip] +pub static TEST_DATA: [([u8; 26], u64); 2000] = [ + (hex!("0122222222333333334444444455000000000000000000000010"), 0x004001), + (hex!("0122222222333333334444444455000000000000000000007cb0"), 0x0040a1), + (hex!("0122222222333333334444444455000000010000000000000020"), 0x004141), + (hex!("0122222222333333334444444455000000020000000000000030"), 0x0041e1), + (hex!("01222222223333333344444444550000000200000000000051a0"), 0x004281), + (hex!("0122222222333333334444444455000000030000000000000040"), 0x004321), + (hex!("0122222222333333334444444455000000030000000000006cf0"), 0x0043c1), + (hex!("0122222222333333334444444455000000030000000000007140"), 0x004461), + (hex!("0122222222333333334444444455000000040000000000000050"), 0x004501), + (hex!("01222222223333333344444444550000000400000000000047f0"), 0x0045a1), + (hex!("01222222223333333344444444550000000400000000000072b0"), 0x004641), + (hex!("0122222222333333334444444455000000050000000000000060"), 0x0046e1), + (hex!("0122222222333333334444444455000000050000000000005550"), 0x004781), + (hex!("0122222222333333334444444455000000060000000000000070"), 0x004821), + (hex!("01222222223333333344444444550000000600000000000044a0"), 0x0048c1), + (hex!("0122222222333333334444444455000000060000000000006870"), 0x004961), + (hex!("0122222222333333334444444455000000070000000000000080"), 0x004a01), + (hex!("0122222222333333334444444455000000080000000000000090"), 0x004aa1), + (hex!("0122222222333333334444444455000000080000000000004150"), 0x004b41), + (hex!("01222222223333333344444444550000000900000000000000a0"), 0x004be1), + (hex!("01222222223333333344444444550000000a00000000000000b0"), 0x004c81), + (hex!("01222222223333333344444444550000000a0000000000006680"), 0x004d21), + (hex!("01222222223333333344444444550000000b00000000000000c0"), 0x004dc1), + (hex!("01222222223333333344444444550000000b0000000000006230"), 0x004e61), + (hex!("01222222223333333344444444550000000c00000000000000d0"), 0x004f01), + (hex!("01222222223333333344444444550000000d00000000000000e0"), 0x004fa1), + (hex!("01222222223333333344444444550000000e00000000000000f0"), 0x005041), + (hex!("01222222223333333344444444550000000e0000000000006000"), 0x0050e1), + (hex!("01222222223333333344444444550000000f0000000000000100"), 0x005181), + (hex!("01222222223333333344444444550000000f00000000000053c0"), 0x005221), + (hex!("01222222223333333344444444550000000f0000000000006580"), 0x0052c1), + (hex!("0122222222333333334444444455000000100000000000000110"), 0x005361), + (hex!("01222222223333333344444444550000001000000000000046c0"), 0x005401), + (hex!("0122222222333333334444444455000000100000000000004e40"), 0x0054a1), + (hex!("0122222222333333334444444455000000110000000000000120"), 0x005541), + (hex!("0122222222333333334444444455000000120000000000000130"), 0x0055e1), + (hex!("01222222223333333344444444550000001200000000000066d0"), 0x005681), + (hex!("0122222222333333334444444455000000130000000000000140"), 0x005721), + (hex!("0122222222333333334444444455000000130000000000007710"), 0x0057c1), + (hex!("0122222222333333334444444455000000140000000000000150"), 0x005861), + (hex!("0122222222333333334444444455000000140000000000006c40"), 0x005901), + (hex!("0122222222333333334444444455000000150000000000000160"), 0x0059a1), + (hex!("0122222222333333334444444455000000150000000000005990"), 0x005a41), + (hex!("0122222222333333334444444455000000160000000000000170"), 0x005ae1), + (hex!("0122222222333333334444444455000000160000000000005530"), 0x005b81), + (hex!("0122222222333333334444444455000000170000000000000180"), 
0x005c21), + (hex!("0122222222333333334444444455000000170000000000004290"), 0x005cc1), + (hex!("0122222222333333334444444455000000180000000000000190"), 0x005d61), + (hex!("01222222223333333344444444550000001800000000000051c0"), 0x005e01), + (hex!("01222222223333333344444444550000001900000000000001a0"), 0x005ea1), + (hex!("0122222222333333334444444455000000190000000000005420"), 0x005f41), + (hex!("0122222222333333334444444455000000190000000000005770"), 0x005fe1), + (hex!("01222222223333333344444444550000001900000000000079d0"), 0x006081), + (hex!("01222222223333333344444444550000001a00000000000001b0"), 0x006121), + (hex!("01222222223333333344444444550000001a0000000000006f70"), 0x0061c1), + (hex!("01222222223333333344444444550000001a0000000000007150"), 0x006261), + (hex!("01222222223333333344444444550000001b00000000000001c0"), 0x006301), + (hex!("01222222223333333344444444550000001b0000000000005070"), 0x0063a1), + (hex!("01222222223333333344444444550000001c00000000000001d0"), 0x006441), + (hex!("01222222223333333344444444550000001d00000000000001e0"), 0x0064e1), + (hex!("01222222223333333344444444550000001e00000000000001f0"), 0x006581), + (hex!("01222222223333333344444444550000001e0000000000005650"), 0x006621), + (hex!("01222222223333333344444444550000001f0000000000000200"), 0x0066c1), + (hex!("01222222223333333344444444550000001f0000000000006ca0"), 0x006761), + (hex!("0122222222333333334444444455000000200000000000000210"), 0x006801), + (hex!("0122222222333333334444444455000000200000000000005fc0"), 0x0068a1), + (hex!("0122222222333333334444444455000000210000000000000220"), 0x006941), + (hex!("0122222222333333334444444455000000210000000000006430"), 0x0069e1), + (hex!("0122222222333333334444444455000000220000000000000230"), 0x006a81), + (hex!("01222222223333333344444444550000002200000000000040e0"), 0x006b21), + (hex!("0122222222333333334444444455000000230000000000000240"), 0x006bc1), + (hex!("01222222223333333344444444550000002300000000000042d0"), 0x006c61), + (hex!("0122222222333333334444444455000000240000000000000250"), 0x006d01), + (hex!("0122222222333333334444444455000000250000000000000260"), 0x006da1), + (hex!("01222222223333333344444444550000002500000000000058c0"), 0x006e41), + (hex!("0122222222333333334444444455000000260000000000000270"), 0x006ee1), + (hex!("0122222222333333334444444455000000260000000000004020"), 0x006f81), + (hex!("0122222222333333334444444455000000270000000000000280"), 0x007021), + (hex!("0122222222333333334444444455000000280000000000000290"), 0x0070c1), + (hex!("0122222222333333334444444455000000280000000000007c00"), 0x007161), + (hex!("01222222223333333344444444550000002900000000000002a0"), 0x007201), + (hex!("01222222223333333344444444550000002a00000000000002b0"), 0x0072a1), + (hex!("01222222223333333344444444550000002b00000000000002c0"), 0x007341), + (hex!("01222222223333333344444444550000002c00000000000002d0"), 0x0073e1), + (hex!("01222222223333333344444444550000002c00000000000041b0"), 0x007481), + (hex!("01222222223333333344444444550000002c0000000000004c30"), 0x007521), + (hex!("01222222223333333344444444550000002d00000000000002e0"), 0x0075c1), + (hex!("01222222223333333344444444550000002d0000000000005e40"), 0x007661), + (hex!("01222222223333333344444444550000002d0000000000006990"), 0x007701), + (hex!("01222222223333333344444444550000002e00000000000002f0"), 0x0077a1), + (hex!("01222222223333333344444444550000002f0000000000000300"), 0x007841), + (hex!("01222222223333333344444444550000002f0000000000004a70"), 0x0078e1), + 
(hex!("01222222223333333344444444550000002f0000000000006b40"), 0x007981), + (hex!("0122222222333333334444444455000000300000000000000310"), 0x007a21), + (hex!("0122222222333333334444444455000000310000000000000320"), 0x007ac1), + (hex!("0122222222333333334444444455000000320000000000000330"), 0x007b61), + (hex!("01222222223333333344444444550000003200000000000041a0"), 0x007c01), + (hex!("0122222222333333334444444455000000320000000000007340"), 0x007ca1), + (hex!("0122222222333333334444444455000000320000000000007730"), 0x007d41), + (hex!("0122222222333333334444444455000000330000000000000340"), 0x007de1), + (hex!("01222222223333333344444444550000003300000000000055a0"), 0x007e81), + (hex!("0122222222333333334444444455000000340000000000000350"), 0x007f21), + (hex!("0122222222333333334444444455000000350000000000000360"), 0x007fc1), + (hex!("01222222223333333344444444550000003500000000000077a0"), 0x008061), + (hex!("0122222222333333334444444455000000360000000000000370"), 0x008101), + (hex!("0122222222333333334444444455000000370000000000000380"), 0x0081a1), + (hex!("0122222222333333334444444455000000380000000000000390"), 0x008241), + (hex!("01222222223333333344444444550000003900000000000003a0"), 0x0082e1), + (hex!("01222222223333333344444444550000003a00000000000003b0"), 0x008381), + (hex!("01222222223333333344444444550000003a00000000000071c0"), 0x008421), + (hex!("01222222223333333344444444550000003b00000000000003c0"), 0x0084c1), + (hex!("01222222223333333344444444550000003c00000000000003d0"), 0x008561), + (hex!("01222222223333333344444444550000003d00000000000003e0"), 0x008601), + (hex!("01222222223333333344444444550000003e00000000000003f0"), 0x0086a1), + (hex!("01222222223333333344444444550000003e00000000000062e0"), 0x008741), + (hex!("01222222223333333344444444550000003f0000000000000400"), 0x0087e1), + (hex!("0122222222333333334444444455000000400000000000000410"), 0x008881), + (hex!("0122222222333333334444444455000000400000000000004460"), 0x008921), + (hex!("0122222222333333334444444455000000400000000000005b90"), 0x0089c1), + (hex!("01222222223333333344444444550000004000000000000079b0"), 0x008a61), + (hex!("0122222222333333334444444455000000410000000000000420"), 0x008b01), + (hex!("0122222222333333334444444455000000420000000000000430"), 0x008ba1), + (hex!("0122222222333333334444444455000000420000000000005640"), 0x008c41), + (hex!("0122222222333333334444444455000000430000000000000440"), 0x008ce1), + (hex!("01222222223333333344444444550000004300000000000072a0"), 0x008d81), + (hex!("0122222222333333334444444455000000440000000000000450"), 0x008e21), + (hex!("0122222222333333334444444455000000450000000000000460"), 0x008ec1), + (hex!("0122222222333333334444444455000000450000000000005750"), 0x008f61), + (hex!("01222222223333333344444444550000004500000000000077b0"), 0x009001), + (hex!("0122222222333333334444444455000000460000000000000470"), 0x0090a1), + (hex!("0122222222333333334444444455000000470000000000000480"), 0x009141), + (hex!("0122222222333333334444444455000000480000000000000490"), 0x0091e1), + (hex!("01222222223333333344444444550000004800000000000069e0"), 0x009281), + (hex!("01222222223333333344444444550000004900000000000004a0"), 0x009321), + (hex!("0122222222333333334444444455000000490000000000007370"), 0x0093c1), + (hex!("01222222223333333344444444550000004a00000000000004b0"), 0x009461), + (hex!("01222222223333333344444444550000004a0000000000005cb0"), 0x009501), + (hex!("01222222223333333344444444550000004b00000000000004c0"), 0x0095a1), + 
(hex!("01222222223333333344444444550000004c00000000000004d0"), 0x009641), + (hex!("01222222223333333344444444550000004c0000000000004880"), 0x0096e1), + (hex!("01222222223333333344444444550000004c0000000000007a40"), 0x009781), + (hex!("01222222223333333344444444550000004d00000000000004e0"), 0x009821), + (hex!("01222222223333333344444444550000004d0000000000006390"), 0x0098c1), + (hex!("01222222223333333344444444550000004e00000000000004f0"), 0x009961), + (hex!("01222222223333333344444444550000004e0000000000004db0"), 0x009a01), + (hex!("01222222223333333344444444550000004f0000000000000500"), 0x009aa1), + (hex!("0122222222333333334444444455000000500000000000000510"), 0x009b41), + (hex!("0122222222333333334444444455000000510000000000000520"), 0x009be1), + (hex!("01222222223333333344444444550000005100000000000069c0"), 0x009c81), + (hex!("0122222222333333334444444455000000520000000000000530"), 0x009d21), + (hex!("0122222222333333334444444455000000520000000000006e60"), 0x009dc1), + (hex!("01222222223333333344444444550000005200000000000070c0"), 0x009e61), + (hex!("0122222222333333334444444455000000530000000000000540"), 0x009f01), + (hex!("0122222222333333334444444455000000530000000000005840"), 0x009fa1), + (hex!("0122222222333333334444444455000000540000000000000550"), 0x00a041), + (hex!("01222222223333333344444444550000005400000000000043e0"), 0x00a0e1), + (hex!("01222222223333333344444444550000005400000000000074e0"), 0x00a181), + (hex!("0122222222333333334444444455000000550000000000000560"), 0x00a221), + (hex!("0122222222333333334444444455000000550000000000003ee0"), 0x00a2c1), + (hex!("0122222222333333334444444455000000560000000000000570"), 0x00a361), + (hex!("0122222222333333334444444455000000570000000000000580"), 0x00a401), + (hex!("0122222222333333334444444455000000570000000000007030"), 0x00a4a1), + (hex!("0122222222333333334444444455000000580000000000000590"), 0x00a541), + (hex!("0122222222333333334444444455000000580000000000005340"), 0x00a5e1), + (hex!("01222222223333333344444444550000005800000000000059f0"), 0x00a681), + (hex!("0122222222333333334444444455000000580000000000006930"), 0x00a721), + (hex!("01222222223333333344444444550000005900000000000005a0"), 0x00a7c1), + (hex!("0122222222333333334444444455000000590000000000003f90"), 0x00a861), + (hex!("01222222223333333344444444550000005a00000000000005b0"), 0x00a901), + (hex!("01222222223333333344444444550000005b00000000000005c0"), 0x00a9a1), + (hex!("01222222223333333344444444550000005b00000000000062c0"), 0x00aa41), + (hex!("01222222223333333344444444550000005c00000000000005d0"), 0x00aae1), + (hex!("01222222223333333344444444550000005c0000000000005a70"), 0x00ab81), + (hex!("01222222223333333344444444550000005c0000000000005dd0"), 0x00ac21), + (hex!("01222222223333333344444444550000005d00000000000005e0"), 0x00acc1), + (hex!("01222222223333333344444444550000005d0000000000005730"), 0x00ad61), + (hex!("01222222223333333344444444550000005e00000000000005f0"), 0x00ae01), + (hex!("01222222223333333344444444550000005e0000000000004f40"), 0x00aea1), + (hex!("01222222223333333344444444550000005f0000000000000600"), 0x00af41), + (hex!("0122222222333333334444444455000000600000000000000610"), 0x00afe1), + (hex!("0122222222333333334444444455000000600000000000007c40"), 0x00b081), + (hex!("0122222222333333334444444455000000610000000000000620"), 0x00b121), + (hex!("0122222222333333334444444455000000610000000000007860"), 0x00b1c1), + (hex!("0122222222333333334444444455000000620000000000000630"), 0x00b261), + 
(hex!("0122222222333333334444444455000000620000000000005050"), 0x00b301), + (hex!("0122222222333333334444444455000000630000000000000640"), 0x00b3a1), + (hex!("0122222222333333334444444455000000640000000000000650"), 0x00b441), + (hex!("0122222222333333334444444455000000650000000000000660"), 0x00b4e1), + (hex!("0122222222333333334444444455000000650000000000005330"), 0x00b581), + (hex!("0122222222333333334444444455000000660000000000000670"), 0x00b621), + (hex!("0122222222333333334444444455000000660000000000004e20"), 0x00b6c1), + (hex!("0122222222333333334444444455000000660000000000005ee0"), 0x00b761), + (hex!("0122222222333333334444444455000000660000000000006360"), 0x00b801), + (hex!("0122222222333333334444444455000000670000000000000680"), 0x00b8a1), + (hex!("0122222222333333334444444455000000670000000000004040"), 0x00b941), + (hex!("0122222222333333334444444455000000680000000000000690"), 0x00b9e1), + (hex!("0122222222333333334444444455000000680000000000003f80"), 0x00ba81), + (hex!("01222222223333333344444444550000006800000000000041e0"), 0x00bb21), + (hex!("01222222223333333344444444550000006900000000000006a0"), 0x00bbc1), + (hex!("0122222222333333334444444455000000690000000000006080"), 0x00bc61), + (hex!("01222222223333333344444444550000006a00000000000006b0"), 0x00bd01), + (hex!("01222222223333333344444444550000006a00000000000042f0"), 0x00bda1), + (hex!("01222222223333333344444444550000006b00000000000006c0"), 0x00be41), + (hex!("01222222223333333344444444550000006b00000000000052f0"), 0x00bee1), + (hex!("01222222223333333344444444550000006b0000000000005980"), 0x00bf81), + (hex!("01222222223333333344444444550000006b0000000000006170"), 0x00c021), + (hex!("01222222223333333344444444550000006c00000000000006d0"), 0x00c0c1), + (hex!("01222222223333333344444444550000006d00000000000006e0"), 0x00c161), + (hex!("01222222223333333344444444550000006d0000000000006fb0"), 0x00c201), + (hex!("01222222223333333344444444550000006e00000000000006f0"), 0x00c2a1), + (hex!("01222222223333333344444444550000006e00000000000065b0"), 0x00c341), + (hex!("01222222223333333344444444550000006e0000000000007970"), 0x00c3e1), + (hex!("01222222223333333344444444550000006f0000000000000700"), 0x00c481), + (hex!("01222222223333333344444444550000006f0000000000005900"), 0x00c521), + (hex!("01222222223333333344444444550000006f0000000000006d90"), 0x00c5c1), + (hex!("0122222222333333334444444455000000700000000000000710"), 0x00c661), + (hex!("01222222223333333344444444550000007000000000000045c0"), 0x00c701), + (hex!("0122222222333333334444444455000000700000000000004d40"), 0x00c7a1), + (hex!("0122222222333333334444444455000000710000000000000720"), 0x00c841), + (hex!("0122222222333333334444444455000000710000000000004dc0"), 0x00c8e1), + (hex!("0122222222333333334444444455000000710000000000007550"), 0x00c981), + (hex!("0122222222333333334444444455000000720000000000000730"), 0x00ca21), + (hex!("0122222222333333334444444455000000720000000000003ec0"), 0x00cac1), + (hex!("01222222223333333344444444550000007200000000000045a0"), 0x00cb61), + (hex!("0122222222333333334444444455000000720000000000006770"), 0x00cc01), + (hex!("0122222222333333334444444455000000720000000000006bc0"), 0x00cca1), + (hex!("0122222222333333334444444455000000730000000000000740"), 0x00cd41), + (hex!("0122222222333333334444444455000000730000000000005250"), 0x00cde1), + (hex!("01222222223333333344444444550000007300000000000075f0"), 0x00ce81), + (hex!("0122222222333333334444444455000000740000000000000750"), 0x00cf21), + 
(hex!("0122222222333333334444444455000000740000000000003ff0"), 0x00cfc1), + (hex!("01222222223333333344444444550000007400000000000079e0"), 0x00d061), + (hex!("0122222222333333334444444455000000750000000000000760"), 0x00d101), + (hex!("0122222222333333334444444455000000750000000000004310"), 0x00d1a1), + (hex!("0122222222333333334444444455000000760000000000000770"), 0x00d241), + (hex!("0122222222333333334444444455000000770000000000000780"), 0x00d2e1), + (hex!("01222222223333333344444444550000007700000000000062f0"), 0x00d381), + (hex!("0122222222333333334444444455000000770000000000006940"), 0x00d421), + (hex!("0122222222333333334444444455000000780000000000000790"), 0x00d4c1), + (hex!("01222222223333333344444444550000007900000000000007a0"), 0x00d561), + (hex!("0122222222333333334444444455000000790000000000007af0"), 0x00d601), + (hex!("01222222223333333344444444550000007a00000000000007b0"), 0x00d6a1), + (hex!("01222222223333333344444444550000007b00000000000007c0"), 0x00d741), + (hex!("01222222223333333344444444550000007b00000000000067e0"), 0x00d7e1), + (hex!("01222222223333333344444444550000007b0000000000007890"), 0x00d881), + (hex!("01222222223333333344444444550000007c00000000000007d0"), 0x00d921), + (hex!("01222222223333333344444444550000007d00000000000007e0"), 0x00d9c1), + (hex!("01222222223333333344444444550000007e00000000000007f0"), 0x00da61), + (hex!("01222222223333333344444444550000007f0000000000000800"), 0x00db01), + (hex!("01222222223333333344444444550000007f0000000000005be0"), 0x00dba1), + (hex!("0122222222333333334444444455000000800000000000000810"), 0x00dc41), + (hex!("0122222222333333334444444455000000810000000000000820"), 0x00dce1), + (hex!("0122222222333333334444444455000000810000000000007190"), 0x00dd81), + (hex!("0122222222333333334444444455000000820000000000000830"), 0x00de21), + (hex!("0122222222333333334444444455000000820000000000004ab0"), 0x00dec1), + (hex!("0122222222333333334444444455000000830000000000000840"), 0x00df61), + (hex!("0122222222333333334444444455000000830000000000006720"), 0x00e001), + (hex!("0122222222333333334444444455000000840000000000000850"), 0x00e0a1), + (hex!("0122222222333333334444444455000000850000000000000860"), 0x00e141), + (hex!("01222222223333333344444444550000008500000000000054f0"), 0x00e1e1), + (hex!("0122222222333333334444444455000000850000000000007920"), 0x00e281), + (hex!("0122222222333333334444444455000000860000000000000870"), 0x00e321), + (hex!("01222222223333333344444444550000008600000000000060e0"), 0x00e3c1), + (hex!("0122222222333333334444444455000000860000000000006be0"), 0x00e461), + (hex!("0122222222333333334444444455000000870000000000000880"), 0x00e501), + (hex!("0122222222333333334444444455000000870000000000006820"), 0x00e5a1), + (hex!("0122222222333333334444444455000000880000000000000890"), 0x00e641), + (hex!("01222222223333333344444444550000008900000000000008a0"), 0x00e6e1), + (hex!("0122222222333333334444444455000000890000000000007c30"), 0x00e781), + (hex!("01222222223333333344444444550000008a00000000000008b0"), 0x00e821), + (hex!("01222222223333333344444444550000008b00000000000008c0"), 0x00e8c1), + (hex!("01222222223333333344444444550000008b0000000000005910"), 0x00e961), + (hex!("01222222223333333344444444550000008b0000000000006fe0"), 0x00ea01), + (hex!("01222222223333333344444444550000008c00000000000008d0"), 0x00eaa1), + (hex!("01222222223333333344444444550000008c0000000000006800"), 0x00eb41), + (hex!("01222222223333333344444444550000008d00000000000008e0"), 0x00ebe1), + 
(hex!("01222222223333333344444444550000008d0000000000005810"), 0x00ec81), + (hex!("01222222223333333344444444550000008d0000000000007c90"), 0x00ed21), + (hex!("01222222223333333344444444550000008e00000000000008f0"), 0x00edc1), + (hex!("01222222223333333344444444550000008e00000000000058f0"), 0x00ee61), + (hex!("01222222223333333344444444550000008f0000000000000900"), 0x00ef01), + (hex!("01222222223333333344444444550000008f0000000000005a30"), 0x00efa1), + (hex!("0122222222333333334444444455000000900000000000000910"), 0x00f041), + (hex!("0122222222333333334444444455000000900000000000006130"), 0x00f0e1), + (hex!("0122222222333333334444444455000000900000000000006550"), 0x00f181), + (hex!("0122222222333333334444444455000000910000000000000920"), 0x00f221), + (hex!("01222222223333333344444444550000009100000000000079f0"), 0x00f2c1), + (hex!("0122222222333333334444444455000000920000000000000930"), 0x00f361), + (hex!("0122222222333333334444444455000000920000000000005620"), 0x00f401), + (hex!("0122222222333333334444444455000000920000000000005e90"), 0x00f4a1), + (hex!("01222222223333333344444444550000009200000000000063d0"), 0x00f541), + (hex!("01222222223333333344444444550000009200000000000076c0"), 0x00f5e1), + (hex!("0122222222333333334444444455000000930000000000000940"), 0x00f681), + (hex!("01222222223333333344444444550000009300000000000044e0"), 0x00f721), + (hex!("0122222222333333334444444455000000940000000000000950"), 0x00f7c1), + (hex!("0122222222333333334444444455000000940000000000007a30"), 0x00f861), + (hex!("0122222222333333334444444455000000950000000000000960"), 0x00f901), + (hex!("0122222222333333334444444455000000950000000000007a70"), 0x00f9a1), + (hex!("0122222222333333334444444455000000960000000000000970"), 0x00fa41), + (hex!("0122222222333333334444444455000000970000000000000980"), 0x00fae1), + (hex!("0122222222333333334444444455000000970000000000007330"), 0x00fb81), + (hex!("0122222222333333334444444455000000980000000000000990"), 0x00fc21), + (hex!("0122222222333333334444444455000000980000000000005af0"), 0x00fcc1), + (hex!("0122222222333333334444444455000000980000000000007ae0"), 0x00fd61), + (hex!("01222222223333333344444444550000009900000000000009a0"), 0x00fe01), + (hex!("0122222222333333334444444455000000990000000000005160"), 0x00fea1), + (hex!("0122222222333333334444444455000000990000000000006850"), 0x00ff41), + (hex!("01222222223333333344444444550000009a00000000000009b0"), 0x00ffe1), + (hex!("01222222223333333344444444550000009b00000000000009c0"), 0x010081), + (hex!("01222222223333333344444444550000009b0000000000005010"), 0x010121), + (hex!("01222222223333333344444444550000009c00000000000009d0"), 0x0101c1), + (hex!("01222222223333333344444444550000009c00000000000042e0"), 0x010261), + (hex!("01222222223333333344444444550000009d00000000000009e0"), 0x010301), + (hex!("01222222223333333344444444550000009d00000000000057f0"), 0x0103a1), + (hex!("01222222223333333344444444550000009e00000000000009f0"), 0x010441), + (hex!("01222222223333333344444444550000009e0000000000004ef0"), 0x0104e1), + (hex!("01222222223333333344444444550000009f0000000000000a00"), 0x010581), + (hex!("01222222223333333344444444550000009f0000000000006110"), 0x010621), + (hex!("0122222222333333334444444455000000a00000000000000a10"), 0x0106c1), + (hex!("0122222222333333334444444455000000a10000000000000a20"), 0x010761), + (hex!("0122222222333333334444444455000000a100000000000040d0"), 0x010801), + (hex!("0122222222333333334444444455000000a10000000000007670"), 0x0108a1), + 
(hex!("0122222222333333334444444455000000a20000000000000a30"), 0x010941), + (hex!("0122222222333333334444444455000000a200000000000074d0"), 0x0109e1), + (hex!("0122222222333333334444444455000000a30000000000000a40"), 0x010a81), + (hex!("0122222222333333334444444455000000a30000000000004c90"), 0x010b21), + (hex!("0122222222333333334444444455000000a40000000000000a50"), 0x010bc1), + (hex!("0122222222333333334444444455000000a50000000000000a60"), 0x010c61), + (hex!("0122222222333333334444444455000000a60000000000000a70"), 0x010d01), + (hex!("0122222222333333334444444455000000a60000000000006d80"), 0x010da1), + (hex!("0122222222333333334444444455000000a60000000000007830"), 0x010e41), + (hex!("0122222222333333334444444455000000a70000000000000a80"), 0x010ee1), + (hex!("0122222222333333334444444455000000a700000000000064f0"), 0x010f81), + (hex!("0122222222333333334444444455000000a80000000000000a90"), 0x011021), + (hex!("0122222222333333334444444455000000a90000000000000aa0"), 0x0110c1), + (hex!("0122222222333333334444444455000000a90000000000005e30"), 0x011161), + (hex!("0122222222333333334444444455000000aa0000000000000ab0"), 0x011201), + (hex!("0122222222333333334444444455000000ab0000000000000ac0"), 0x0112a1), + (hex!("0122222222333333334444444455000000ac0000000000000ad0"), 0x011341), + (hex!("0122222222333333334444444455000000ac0000000000006d20"), 0x0113e1), + (hex!("0122222222333333334444444455000000ac0000000000007000"), 0x011481), + (hex!("0122222222333333334444444455000000ad0000000000000ae0"), 0x011521), + (hex!("0122222222333333334444444455000000ae0000000000000af0"), 0x0115c1), + (hex!("0122222222333333334444444455000000ae0000000000004a10"), 0x011661), + (hex!("0122222222333333334444444455000000af0000000000000b00"), 0x011701), + (hex!("0122222222333333334444444455000000af0000000000004e10"), 0x0117a1), + (hex!("0122222222333333334444444455000000b00000000000000b10"), 0x011841), + (hex!("0122222222333333334444444455000000b00000000000004280"), 0x0118e1), + (hex!("0122222222333333334444444455000000b000000000000077e0"), 0x011981), + (hex!("0122222222333333334444444455000000b10000000000000b20"), 0x011a21), + (hex!("0122222222333333334444444455000000b20000000000000b30"), 0x011ac1), + (hex!("0122222222333333334444444455000000b30000000000000b40"), 0x011b61), + (hex!("0122222222333333334444444455000000b30000000000004bc0"), 0x011c01), + (hex!("0122222222333333334444444455000000b40000000000000b50"), 0x011ca1), + (hex!("0122222222333333334444444455000000b50000000000000b60"), 0x011d41), + (hex!("0122222222333333334444444455000000b50000000000004fa0"), 0x011de1), + (hex!("0122222222333333334444444455000000b50000000000006a60"), 0x011e81), + (hex!("0122222222333333334444444455000000b60000000000000b70"), 0x011f21), + (hex!("0122222222333333334444444455000000b60000000000005630"), 0x011fc1), + (hex!("0122222222333333334444444455000000b70000000000000b80"), 0x012061), + (hex!("0122222222333333334444444455000000b80000000000000b90"), 0x012101), + (hex!("0122222222333333334444444455000000b80000000000006f80"), 0x0121a1), + (hex!("0122222222333333334444444455000000b90000000000000ba0"), 0x012241), + (hex!("0122222222333333334444444455000000ba0000000000000bb0"), 0x0122e1), + (hex!("0122222222333333334444444455000000bb0000000000000bc0"), 0x012381), + (hex!("0122222222333333334444444455000000bb00000000000047c0"), 0x012421), + (hex!("0122222222333333334444444455000000bb0000000000006060"), 0x0124c1), + (hex!("0122222222333333334444444455000000bc0000000000000bd0"), 0x012561), + 
(hex!("0122222222333333334444444455000000bd0000000000000be0"), 0x012601), + (hex!("0122222222333333334444444455000000bd0000000000004e80"), 0x0126a1), + (hex!("0122222222333333334444444455000000be0000000000000bf0"), 0x012741), + (hex!("0122222222333333334444444455000000bf0000000000000c00"), 0x0127e1), + (hex!("0122222222333333334444444455000000bf00000000000047a0"), 0x012881), + (hex!("0122222222333333334444444455000000bf0000000000006da0"), 0x012921), + (hex!("0122222222333333334444444455000000c00000000000000c10"), 0x0129c1), + (hex!("0122222222333333334444444455000000c10000000000000c20"), 0x012a61), + (hex!("0122222222333333334444444455000000c20000000000000c30"), 0x012b01), + (hex!("0122222222333333334444444455000000c20000000000004bd0"), 0x012ba1), + (hex!("0122222222333333334444444455000000c20000000000006ac0"), 0x012c41), + (hex!("0122222222333333334444444455000000c30000000000000c40"), 0x012ce1), + (hex!("0122222222333333334444444455000000c30000000000004660"), 0x012d81), + (hex!("0122222222333333334444444455000000c40000000000000c50"), 0x012e21), + (hex!("0122222222333333334444444455000000c50000000000000c60"), 0x012ec1), + (hex!("0122222222333333334444444455000000c60000000000000c70"), 0x012f61), + (hex!("0122222222333333334444444455000000c60000000000005880"), 0x013001), + (hex!("0122222222333333334444444455000000c60000000000006b70"), 0x0130a1), + (hex!("0122222222333333334444444455000000c70000000000000c80"), 0x013141), + (hex!("0122222222333333334444444455000000c80000000000000c90"), 0x0131e1), + (hex!("0122222222333333334444444455000000c80000000000005310"), 0x013281), + (hex!("0122222222333333334444444455000000c80000000000005db0"), 0x013321), + (hex!("0122222222333333334444444455000000c80000000000007040"), 0x0133c1), + (hex!("0122222222333333334444444455000000c80000000000007290"), 0x013461), + (hex!("0122222222333333334444444455000000c90000000000000ca0"), 0x013501), + (hex!("0122222222333333334444444455000000c90000000000004fe0"), 0x0135a1), + (hex!("0122222222333333334444444455000000ca0000000000000cb0"), 0x013641), + (hex!("0122222222333333334444444455000000ca0000000000006140"), 0x0136e1), + (hex!("0122222222333333334444444455000000ca0000000000007700"), 0x013781), + (hex!("0122222222333333334444444455000000cb0000000000000cc0"), 0x013821), + (hex!("0122222222333333334444444455000000cc0000000000000cd0"), 0x0138c1), + (hex!("0122222222333333334444444455000000cd0000000000000ce0"), 0x013961), + (hex!("0122222222333333334444444455000000cd0000000000003f20"), 0x013a01), + (hex!("0122222222333333334444444455000000cd00000000000040f0"), 0x013aa1), + (hex!("0122222222333333334444444455000000cd0000000000004ec0"), 0x013b41), + (hex!("0122222222333333334444444455000000ce0000000000000cf0"), 0x013be1), + (hex!("0122222222333333334444444455000000ce0000000000007200"), 0x013c81), + (hex!("0122222222333333334444444455000000cf0000000000000d00"), 0x013d21), + (hex!("0122222222333333334444444455000000cf00000000000046a0"), 0x013dc1), + (hex!("0122222222333333334444444455000000cf0000000000005960"), 0x013e61), + (hex!("0122222222333333334444444455000000d00000000000000d10"), 0x013f01), + (hex!("0122222222333333334444444455000000d00000000000005f30"), 0x013fa1), + (hex!("0122222222333333334444444455000000d10000000000000d20"), 0x014041), + (hex!("0122222222333333334444444455000000d10000000000007a00"), 0x0140e1), + (hex!("0122222222333333334444444455000000d20000000000000d30"), 0x014181), + (hex!("0122222222333333334444444455000000d30000000000000d40"), 0x014221), + 
(hex!("0122222222333333334444444455000000d40000000000000d50"), 0x0142c1), + (hex!("0122222222333333334444444455000000d50000000000000d60"), 0x014361), + (hex!("0122222222333333334444444455000000d50000000000004960"), 0x014401), + (hex!("0122222222333333334444444455000000d500000000000055d0"), 0x0144a1), + (hex!("0122222222333333334444444455000000d500000000000067d0"), 0x014541), + (hex!("0122222222333333334444444455000000d60000000000000d70"), 0x0145e1), + (hex!("0122222222333333334444444455000000d70000000000000d80"), 0x014681), + (hex!("0122222222333333334444444455000000d80000000000000d90"), 0x014721), + (hex!("0122222222333333334444444455000000d800000000000065f0"), 0x0147c1), + (hex!("0122222222333333334444444455000000d90000000000000da0"), 0x014861), + (hex!("0122222222333333334444444455000000d90000000000004980"), 0x014901), + (hex!("0122222222333333334444444455000000da0000000000000db0"), 0x0149a1), + (hex!("0122222222333333334444444455000000da00000000000048c0"), 0x014a41), + (hex!("0122222222333333334444444455000000da00000000000072c0"), 0x014ae1), + (hex!("0122222222333333334444444455000000da00000000000076b0"), 0x014b81), + (hex!("0122222222333333334444444455000000db0000000000000dc0"), 0x014c21), + (hex!("0122222222333333334444444455000000dc0000000000000dd0"), 0x014cc1), + (hex!("0122222222333333334444444455000000dc00000000000040a0"), 0x014d61), + (hex!("0122222222333333334444444455000000dc00000000000074c0"), 0x014e01), + (hex!("0122222222333333334444444455000000dd0000000000000de0"), 0x014ea1), + (hex!("0122222222333333334444444455000000dd0000000000004e50"), 0x014f41), + (hex!("0122222222333333334444444455000000dd0000000000007270"), 0x014fe1), + (hex!("0122222222333333334444444455000000de0000000000000df0"), 0x015081), + (hex!("0122222222333333334444444455000000de00000000000078d0"), 0x015121), + (hex!("0122222222333333334444444455000000df0000000000000e00"), 0x0151c1), + (hex!("0122222222333333334444444455000000df0000000000004d30"), 0x015261), + (hex!("0122222222333333334444444455000000df0000000000006c30"), 0x015301), + (hex!("0122222222333333334444444455000000e00000000000000e10"), 0x0153a1), + (hex!("0122222222333333334444444455000000e00000000000005d30"), 0x015441), + (hex!("0122222222333333334444444455000000e10000000000000e20"), 0x0154e1), + (hex!("0122222222333333334444444455000000e10000000000004610"), 0x015581), + (hex!("0122222222333333334444444455000000e100000000000051d0"), 0x015621), + (hex!("0122222222333333334444444455000000e10000000000005f10"), 0x0156c1), + (hex!("0122222222333333334444444455000000e20000000000000e30"), 0x015761), + (hex!("0122222222333333334444444455000000e20000000000007a90"), 0x015801), + (hex!("0122222222333333334444444455000000e30000000000000e40"), 0x0158a1), + (hex!("0122222222333333334444444455000000e30000000000005ae0"), 0x015941), + (hex!("0122222222333333334444444455000000e40000000000000e50"), 0x0159e1), + (hex!("0122222222333333334444444455000000e50000000000000e60"), 0x015a81), + (hex!("0122222222333333334444444455000000e50000000000004700"), 0x015b21), + (hex!("0122222222333333334444444455000000e500000000000065d0"), 0x015bc1), + (hex!("0122222222333333334444444455000000e60000000000000e70"), 0x015c61), + (hex!("0122222222333333334444444455000000e60000000000004fd0"), 0x015d01), + (hex!("0122222222333333334444444455000000e70000000000000e80"), 0x015da1), + (hex!("0122222222333333334444444455000000e70000000000005150"), 0x015e41), + (hex!("0122222222333333334444444455000000e70000000000005920"), 0x015ee1), + 
(hex!("0122222222333333334444444455000000e80000000000000e90"), 0x015f81), + (hex!("0122222222333333334444444455000000e80000000000004320"), 0x016021), + (hex!("0122222222333333334444444455000000e80000000000005ec0"), 0x0160c1), + (hex!("0122222222333333334444444455000000e90000000000000ea0"), 0x016161), + (hex!("0122222222333333334444444455000000e900000000000043b0"), 0x016201), + (hex!("0122222222333333334444444455000000ea0000000000000eb0"), 0x0162a1), + (hex!("0122222222333333334444444455000000ea0000000000003ea0"), 0x016341), + (hex!("0122222222333333334444444455000000ea0000000000004f50"), 0x0163e1), + (hex!("0122222222333333334444444455000000ea0000000000007520"), 0x016481), + (hex!("0122222222333333334444444455000000eb0000000000000ec0"), 0x016521), + (hex!("0122222222333333334444444455000000ec0000000000000ed0"), 0x0165c1), + (hex!("0122222222333333334444444455000000ec0000000000006670"), 0x016661), + (hex!("0122222222333333334444444455000000ed0000000000000ee0"), 0x016701), + (hex!("0122222222333333334444444455000000ee0000000000000ef0"), 0x0167a1), + (hex!("0122222222333333334444444455000000ee0000000000004d10"), 0x016841), + (hex!("0122222222333333334444444455000000ef0000000000000f00"), 0x0168e1), + (hex!("0122222222333333334444444455000000f00000000000000f10"), 0x016981), + (hex!("0122222222333333334444444455000000f00000000000007220"), 0x016a21), + (hex!("0122222222333333334444444455000000f00000000000007540"), 0x016ac1), + (hex!("0122222222333333334444444455000000f10000000000000f20"), 0x016b61), + (hex!("0122222222333333334444444455000000f100000000000066f0"), 0x016c01), + (hex!("0122222222333333334444444455000000f20000000000000f30"), 0x016ca1), + (hex!("0122222222333333334444444455000000f20000000000007810"), 0x016d41), + (hex!("0122222222333333334444444455000000f30000000000000f40"), 0x016de1), + (hex!("0122222222333333334444444455000000f30000000000007b70"), 0x016e81), + (hex!("0122222222333333334444444455000000f40000000000000f50"), 0x016f21), + (hex!("0122222222333333334444444455000000f400000000000059c0"), 0x016fc1), + (hex!("0122222222333333334444444455000000f50000000000000f60"), 0x017061), + (hex!("0122222222333333334444444455000000f50000000000003fb0"), 0x017101), + (hex!("0122222222333333334444444455000000f50000000000005740"), 0x0171a1), + (hex!("0122222222333333334444444455000000f500000000000064d0"), 0x017241), + (hex!("0122222222333333334444444455000000f50000000000006960"), 0x0172e1), + (hex!("0122222222333333334444444455000000f60000000000000f70"), 0x017381), + (hex!("0122222222333333334444444455000000f60000000000006d00"), 0x017421), + (hex!("0122222222333333334444444455000000f70000000000000f80"), 0x0174c1), + (hex!("0122222222333333334444444455000000f80000000000000f90"), 0x017561), + (hex!("0122222222333333334444444455000000f90000000000000fa0"), 0x017601), + (hex!("0122222222333333334444444455000000fa0000000000000fb0"), 0x0176a1), + (hex!("0122222222333333334444444455000000fa00000000000067b0"), 0x017741), + (hex!("0122222222333333334444444455000000fb0000000000000fc0"), 0x0177e1), + (hex!("0122222222333333334444444455000000fb0000000000004eb0"), 0x017881), + (hex!("0122222222333333334444444455000000fb0000000000006ef0"), 0x017921), + (hex!("0122222222333333334444444455000000fc0000000000000fd0"), 0x0179c1), + (hex!("0122222222333333334444444455000000fc0000000000004470"), 0x017a61), + (hex!("0122222222333333334444444455000000fc0000000000005940"), 0x017b01), + (hex!("0122222222333333334444444455000000fd0000000000000fe0"), 0x017ba1), + 
(hex!("0122222222333333334444444455000000fe0000000000000ff0"), 0x017c41), + (hex!("0122222222333333334444444455000000ff0000000000001000"), 0x017ce1), + (hex!("0122222222333333334444444455000000ff0000000000005690"), 0x017d81), + (hex!("0122222222333333334444444455000001000000000000001010"), 0x017e21), + (hex!("0122222222333333334444444455000001000000000000005210"), 0x017ec1), + (hex!("01222222223333333344444444550000010000000000000070a0"), 0x017f61), + (hex!("0122222222333333334444444455000001010000000000001020"), 0x018001), + (hex!("0122222222333333334444444455000001010000000000006b80"), 0x0180a1), + (hex!("0122222222333333334444444455000001020000000000001030"), 0x018141), + (hex!("0122222222333333334444444455000001030000000000001040"), 0x0181e1), + (hex!("0122222222333333334444444455000001030000000000004c80"), 0x018281), + (hex!("0122222222333333334444444455000001040000000000001050"), 0x018321), + (hex!("0122222222333333334444444455000001040000000000004850"), 0x0183c1), + (hex!("01222222223333333344444444550000010400000000000057b0"), 0x018461), + (hex!("0122222222333333334444444455000001050000000000001060"), 0x018501), + (hex!("01222222223333333344444444550000010500000000000048d0"), 0x0185a1), + (hex!("0122222222333333334444444455000001050000000000007870"), 0x018641), + (hex!("0122222222333333334444444455000001060000000000001070"), 0x0186e1), + (hex!("0122222222333333334444444455000001060000000000004f90"), 0x018781), + (hex!("0122222222333333334444444455000001060000000000006270"), 0x018821), + (hex!("0122222222333333334444444455000001070000000000001080"), 0x0188c1), + (hex!("01222222223333333344444444550000010700000000000063b0"), 0x018961), + (hex!("0122222222333333334444444455000001080000000000001090"), 0x018a01), + (hex!("01222222223333333344444444550000010900000000000010a0"), 0x018aa1), + (hex!("0122222222333333334444444455000001090000000000006f40"), 0x018b41), + (hex!("01222222223333333344444444550000010a00000000000010b0"), 0x018be1), + (hex!("01222222223333333344444444550000010a0000000000006640"), 0x018c81), + (hex!("01222222223333333344444444550000010b00000000000010c0"), 0x018d21), + (hex!("01222222223333333344444444550000010c00000000000010d0"), 0x018dc1), + (hex!("01222222223333333344444444550000010d00000000000010e0"), 0x018e61), + (hex!("01222222223333333344444444550000010e00000000000010f0"), 0x018f01), + (hex!("01222222223333333344444444550000010e0000000000005c40"), 0x018fa1), + (hex!("01222222223333333344444444550000010e0000000000007ba0"), 0x019041), + (hex!("01222222223333333344444444550000010f0000000000001100"), 0x0190e1), + (hex!("01222222223333333344444444550000010f0000000000005c30"), 0x019181), + (hex!("0122222222333333334444444455000001100000000000001110"), 0x019221), + (hex!("0122222222333333334444444455000001100000000000007640"), 0x0192c1), + (hex!("0122222222333333334444444455000001110000000000001120"), 0x019361), + (hex!("01222222223333333344444444550000011100000000000052c0"), 0x019401), + (hex!("0122222222333333334444444455000001110000000000005710"), 0x0194a1), + (hex!("0122222222333333334444444455000001110000000000006a00"), 0x019541), + (hex!("0122222222333333334444444455000001120000000000001130"), 0x0195e1), + (hex!("0122222222333333334444444455000001130000000000001140"), 0x019681), + (hex!("0122222222333333334444444455000001140000000000001150"), 0x019721), + (hex!("0122222222333333334444444455000001140000000000003fa0"), 0x0197c1), + (hex!("01222222223333333344444444550000011400000000000054b0"), 0x019861), + 
(hex!("0122222222333333334444444455000001140000000000006070"), 0x019901), + (hex!("0122222222333333334444444455000001150000000000001160"), 0x0199a1), + (hex!("0122222222333333334444444455000001150000000000005320"), 0x019a41), + (hex!("0122222222333333334444444455000001150000000000006600"), 0x019ae1), + (hex!("0122222222333333334444444455000001150000000000006df0"), 0x019b81), + (hex!("01222222223333333344444444550000011500000000000079c0"), 0x019c21), + (hex!("0122222222333333334444444455000001160000000000001170"), 0x019cc1), + (hex!("0122222222333333334444444455000001170000000000001180"), 0x019d61), + (hex!("0122222222333333334444444455000001170000000000004a60"), 0x019e01), + (hex!("01222222223333333344444444550000011700000000000063c0"), 0x019ea1), + (hex!("0122222222333333334444444455000001180000000000001190"), 0x019f41), + (hex!("0122222222333333334444444455000001180000000000004530"), 0x019fe1), + (hex!("01222222223333333344444444550000011800000000000077c0"), 0x01a081), + (hex!("01222222223333333344444444550000011900000000000011a0"), 0x01a121), + (hex!("01222222223333333344444444550000011a00000000000011b0"), 0x01a1c1), + (hex!("01222222223333333344444444550000011a00000000000041c0"), 0x01a261), + (hex!("01222222223333333344444444550000011a00000000000061e0"), 0x01a301), + (hex!("01222222223333333344444444550000011b00000000000011c0"), 0x01a3a1), + (hex!("01222222223333333344444444550000011c00000000000011d0"), 0x01a441), + (hex!("01222222223333333344444444550000011c0000000000005f90"), 0x01a4e1), + (hex!("01222222223333333344444444550000011d00000000000011e0"), 0x01a581), + (hex!("01222222223333333344444444550000011d0000000000004160"), 0x01a621), + (hex!("01222222223333333344444444550000011e00000000000011f0"), 0x01a6c1), + (hex!("01222222223333333344444444550000011e00000000000056d0"), 0x01a761), + (hex!("01222222223333333344444444550000011f0000000000001200"), 0x01a801), + (hex!("01222222223333333344444444550000011f0000000000004510"), 0x01a8a1), + (hex!("0122222222333333334444444455000001200000000000001210"), 0x01a941), + (hex!("0122222222333333334444444455000001210000000000001220"), 0x01a9e1), + (hex!("0122222222333333334444444455000001210000000000005140"), 0x01aa81), + (hex!("0122222222333333334444444455000001210000000000006710"), 0x01ab21), + (hex!("0122222222333333334444444455000001210000000000006f50"), 0x01abc1), + (hex!("0122222222333333334444444455000001220000000000001230"), 0x01ac61), + (hex!("0122222222333333334444444455000001220000000000005570"), 0x01ad01), + (hex!("0122222222333333334444444455000001220000000000007ac0"), 0x01ada1), + (hex!("0122222222333333334444444455000001230000000000001240"), 0x01ae41), + (hex!("0122222222333333334444444455000001240000000000001250"), 0x01aee1), + (hex!("0122222222333333334444444455000001240000000000006cd0"), 0x01af81), + (hex!("0122222222333333334444444455000001250000000000001260"), 0x01b021), + (hex!("01222222223333333344444444550000012500000000000046b0"), 0x01b0c1), + (hex!("0122222222333333334444444455000001250000000000005eb0"), 0x01b161), + (hex!("0122222222333333334444444455000001260000000000001270"), 0x01b201), + (hex!("0122222222333333334444444455000001260000000000004630"), 0x01b2a1), + (hex!("0122222222333333334444444455000001270000000000001280"), 0x01b341), + (hex!("0122222222333333334444444455000001270000000000004ff0"), 0x01b3e1), + (hex!("0122222222333333334444444455000001270000000000006ec0"), 0x01b481), + (hex!("0122222222333333334444444455000001280000000000001290"), 0x01b521), + 
(hex!("01222222223333333344444444550000012900000000000012a0"), 0x01b5c1), + (hex!("0122222222333333334444444455000001290000000000005f60"), 0x01b661), + (hex!("01222222223333333344444444550000012a00000000000012b0"), 0x01b701), + (hex!("01222222223333333344444444550000012a0000000000005480"), 0x01b7a1), + (hex!("01222222223333333344444444550000012b00000000000012c0"), 0x01b841), + (hex!("01222222223333333344444444550000012b00000000000065a0"), 0x01b8e1), + (hex!("01222222223333333344444444550000012b00000000000066c0"), 0x01b981), + (hex!("01222222223333333344444444550000012c00000000000012d0"), 0x01ba21), + (hex!("01222222223333333344444444550000012c00000000000064b0"), 0x01bac1), + (hex!("01222222223333333344444444550000012d00000000000012e0"), 0x01bb61), + (hex!("01222222223333333344444444550000012d00000000000049c0"), 0x01bc01), + (hex!("01222222223333333344444444550000012d0000000000004bf0"), 0x01bca1), + (hex!("01222222223333333344444444550000012e00000000000012f0"), 0x01bd41), + (hex!("01222222223333333344444444550000012e0000000000005ed0"), 0x01bde1), + (hex!("01222222223333333344444444550000012f0000000000001300"), 0x01be81), + (hex!("01222222223333333344444444550000012f00000000000049a0"), 0x01bf21), + (hex!("0122222222333333334444444455000001300000000000001310"), 0x01bfc1), + (hex!("0122222222333333334444444455000001300000000000007840"), 0x01c061), + (hex!("0122222222333333334444444455000001310000000000001320"), 0x01c101), + (hex!("0122222222333333334444444455000001310000000000005f70"), 0x01c1a1), + (hex!("0122222222333333334444444455000001320000000000001330"), 0x01c241), + (hex!("0122222222333333334444444455000001320000000000005a00"), 0x01c2e1), + (hex!("0122222222333333334444444455000001330000000000001340"), 0x01c381), + (hex!("0122222222333333334444444455000001330000000000006c70"), 0x01c421), + (hex!("0122222222333333334444444455000001340000000000001350"), 0x01c4c1), + (hex!("0122222222333333334444444455000001340000000000005c60"), 0x01c561), + (hex!("0122222222333333334444444455000001350000000000001360"), 0x01c601), + (hex!("0122222222333333334444444455000001350000000000004f10"), 0x01c6a1), + (hex!("0122222222333333334444444455000001360000000000001370"), 0x01c741), + (hex!("0122222222333333334444444455000001360000000000004c60"), 0x01c7e1), + (hex!("0122222222333333334444444455000001370000000000001380"), 0x01c881), + (hex!("0122222222333333334444444455000001380000000000001390"), 0x01c921), + (hex!("01222222223333333344444444550000013900000000000013a0"), 0x01c9c1), + (hex!("0122222222333333334444444455000001390000000000004ea0"), 0x01ca61), + (hex!("01222222223333333344444444550000013a00000000000013b0"), 0x01cb01), + (hex!("01222222223333333344444444550000013a0000000000007350"), 0x01cba1), + (hex!("01222222223333333344444444550000013b00000000000013c0"), 0x01cc41), + (hex!("01222222223333333344444444550000013c00000000000013d0"), 0x01cce1), + (hex!("01222222223333333344444444550000013c0000000000007050"), 0x01cd81), + (hex!("01222222223333333344444444550000013d00000000000013e0"), 0x01ce21), + (hex!("01222222223333333344444444550000013d0000000000006bd0"), 0x01cec1), + (hex!("01222222223333333344444444550000013e00000000000013f0"), 0x01cf61), + (hex!("01222222223333333344444444550000013e00000000000058e0"), 0x01d001), + (hex!("01222222223333333344444444550000013f0000000000001400"), 0x01d0a1), + (hex!("01222222223333333344444444550000013f0000000000004740"), 0x01d141), + (hex!("0122222222333333334444444455000001400000000000001410"), 0x01d1e1), + 
(hex!("0122222222333333334444444455000001400000000000003f10"), 0x01d281), + (hex!("0122222222333333334444444455000001400000000000006d40"), 0x01d321), + (hex!("01222222223333333344444444550000014000000000000072d0"), 0x01d3c1), + (hex!("0122222222333333334444444455000001410000000000001420"), 0x01d461), + (hex!("0122222222333333334444444455000001420000000000001430"), 0x01d501), + (hex!("0122222222333333334444444455000001430000000000001440"), 0x01d5a1), + (hex!("0122222222333333334444444455000001440000000000001450"), 0x01d641), + (hex!("0122222222333333334444444455000001450000000000001460"), 0x01d6e1), + (hex!("0122222222333333334444444455000001460000000000001470"), 0x01d781), + (hex!("01222222223333333344444444550000014600000000000055c0"), 0x01d821), + (hex!("0122222222333333334444444455000001470000000000001480"), 0x01d8c1), + (hex!("0122222222333333334444444455000001470000000000004570"), 0x01d961), + (hex!("0122222222333333334444444455000001470000000000004be0"), 0x01da01), + (hex!("0122222222333333334444444455000001480000000000001490"), 0x01daa1), + (hex!("0122222222333333334444444455000001480000000000005360"), 0x01db41), + (hex!("01222222223333333344444444550000014900000000000014a0"), 0x01dbe1), + (hex!("01222222223333333344444444550000014a00000000000014b0"), 0x01dc81), + (hex!("01222222223333333344444444550000014a00000000000053d0"), 0x01dd21), + (hex!("01222222223333333344444444550000014b00000000000014c0"), 0x01ddc1), + (hex!("01222222223333333344444444550000014b0000000000005950"), 0x01de61), + (hex!("01222222223333333344444444550000014c00000000000014d0"), 0x01df01), + (hex!("01222222223333333344444444550000014c0000000000004f60"), 0x01dfa1), + (hex!("01222222223333333344444444550000014d00000000000014e0"), 0x01e041), + (hex!("01222222223333333344444444550000014d0000000000004520"), 0x01e0e1), + (hex!("01222222223333333344444444550000014d0000000000005200"), 0x01e181), + (hex!("01222222223333333344444444550000014e00000000000014f0"), 0x01e221), + (hex!("01222222223333333344444444550000014e0000000000005bd0"), 0x01e2c1), + (hex!("01222222223333333344444444550000014f0000000000001500"), 0x01e361), + (hex!("01222222223333333344444444550000014f00000000000060d0"), 0x01e401), + (hex!("0122222222333333334444444455000001500000000000001510"), 0x01e4a1), + (hex!("01222222223333333344444444550000015000000000000075e0"), 0x01e541), + (hex!("0122222222333333334444444455000001510000000000001520"), 0x01e5e1), + (hex!("0122222222333333334444444455000001510000000000005c00"), 0x01e681), + (hex!("0122222222333333334444444455000001510000000000006af0"), 0x01e721), + (hex!("0122222222333333334444444455000001510000000000007b80"), 0x01e7c1), + (hex!("0122222222333333334444444455000001520000000000001530"), 0x01e861), + (hex!("0122222222333333334444444455000001520000000000004c70"), 0x01e901), + (hex!("0122222222333333334444444455000001530000000000001540"), 0x01e9a1), + (hex!("0122222222333333334444444455000001540000000000001550"), 0x01ea41), + (hex!("0122222222333333334444444455000001540000000000007cd0"), 0x01eae1), + (hex!("0122222222333333334444444455000001550000000000001560"), 0x01eb81), + (hex!("0122222222333333334444444455000001550000000000004ae0"), 0x01ec21), + (hex!("01222222223333333344444444550000015500000000000068c0"), 0x01ecc1), + (hex!("0122222222333333334444444455000001560000000000001570"), 0x01ed61), + (hex!("01222222223333333344444444550000015600000000000064a0"), 0x01ee01), + (hex!("0122222222333333334444444455000001570000000000001580"), 0x01eea1), + 
(hex!("0122222222333333334444444455000001580000000000001590"), 0x01ef41), + (hex!("0122222222333333334444444455000001580000000000006d30"), 0x01efe1), + (hex!("01222222223333333344444444550000015800000000000074f0"), 0x01f081), + (hex!("01222222223333333344444444550000015900000000000015a0"), 0x01f121), + (hex!("01222222223333333344444444550000015900000000000053a0"), 0x01f1c1), + (hex!("01222222223333333344444444550000015900000000000055e0"), 0x01f261), + (hex!("0122222222333333334444444455000001590000000000006210"), 0x01f301), + (hex!("01222222223333333344444444550000015900000000000067c0"), 0x01f3a1), + (hex!("01222222223333333344444444550000015a00000000000015b0"), 0x01f441), + (hex!("01222222223333333344444444550000015b00000000000015c0"), 0x01f4e1), + (hex!("01222222223333333344444444550000015c00000000000015d0"), 0x01f581), + (hex!("01222222223333333344444444550000015c0000000000004d80"), 0x01f621), + (hex!("01222222223333333344444444550000015c00000000000073f0"), 0x01f6c1), + (hex!("01222222223333333344444444550000015d00000000000015e0"), 0x01f761), + (hex!("01222222223333333344444444550000015e00000000000015f0"), 0x01f801), + (hex!("01222222223333333344444444550000015e0000000000004120"), 0x01f8a1), + (hex!("01222222223333333344444444550000015e0000000000004350"), 0x01f941), + (hex!("01222222223333333344444444550000015e0000000000007c50"), 0x01f9e1), + (hex!("01222222223333333344444444550000015f0000000000001600"), 0x01fa81), + (hex!("0122222222333333334444444455000001600000000000001610"), 0x01fb21), + (hex!("0122222222333333334444444455000001600000000000004840"), 0x01fbc1), + (hex!("0122222222333333334444444455000001600000000000004b10"), 0x01fc61), + (hex!("0122222222333333334444444455000001600000000000007060"), 0x01fd01), + (hex!("0122222222333333334444444455000001610000000000001620"), 0x01fda1), + (hex!("0122222222333333334444444455000001610000000000005300"), 0x01fe41), + (hex!("0122222222333333334444444455000001620000000000001630"), 0x01fee1), + (hex!("0122222222333333334444444455000001620000000000006530"), 0x01ff81), + (hex!("0122222222333333334444444455000001630000000000001640"), 0x020021), + (hex!("0122222222333333334444444455000001640000000000001650"), 0x0200c1), + (hex!("0122222222333333334444444455000001650000000000001660"), 0x020161), + (hex!("0122222222333333334444444455000001660000000000001670"), 0x020201), + (hex!("0122222222333333334444444455000001670000000000001680"), 0x0202a1), + (hex!("0122222222333333334444444455000001670000000000007310"), 0x020341), + (hex!("0122222222333333334444444455000001680000000000001690"), 0x0203e1), + (hex!("0122222222333333334444444455000001680000000000007b50"), 0x020481), + (hex!("01222222223333333344444444550000016900000000000016a0"), 0x020521), + (hex!("01222222223333333344444444550000016900000000000049d0"), 0x0205c1), + (hex!("01222222223333333344444444550000016a00000000000016b0"), 0x020661), + (hex!("01222222223333333344444444550000016a00000000000078b0"), 0x020701), + (hex!("01222222223333333344444444550000016b00000000000016c0"), 0x0207a1), + (hex!("01222222223333333344444444550000016b0000000000004100"), 0x020841), + (hex!("01222222223333333344444444550000016c00000000000016d0"), 0x0208e1), + (hex!("01222222223333333344444444550000016c0000000000006e00"), 0x020981), + (hex!("01222222223333333344444444550000016d00000000000016e0"), 0x020a21), + (hex!("01222222223333333344444444550000016e00000000000016f0"), 0x020ac1), + (hex!("01222222223333333344444444550000016e0000000000004ac0"), 0x020b61), + 
(hex!("01222222223333333344444444550000016e0000000000007820"), 0x020c01), + (hex!("01222222223333333344444444550000016f0000000000001700"), 0x020ca1), + (hex!("0122222222333333334444444455000001700000000000001710"), 0x020d41), + (hex!("0122222222333333334444444455000001700000000000005830"), 0x020de1), + (hex!("0122222222333333334444444455000001710000000000001720"), 0x020e81), + (hex!("01222222223333333344444444550000017100000000000072f0"), 0x020f21), + (hex!("0122222222333333334444444455000001720000000000001730"), 0x020fc1), + (hex!("0122222222333333334444444455000001720000000000004870"), 0x021061), + (hex!("01222222223333333344444444550000017200000000000070b0"), 0x021101), + (hex!("0122222222333333334444444455000001730000000000001740"), 0x0211a1), + (hex!("0122222222333333334444444455000001740000000000001750"), 0x021241), + (hex!("0122222222333333334444444455000001750000000000001760"), 0x0212e1), + (hex!("0122222222333333334444444455000001750000000000005670"), 0x021381), + (hex!("0122222222333333334444444455000001750000000000005870"), 0x021421), + (hex!("0122222222333333334444444455000001760000000000001770"), 0x0214c1), + (hex!("0122222222333333334444444455000001770000000000001780"), 0x021561), + (hex!("0122222222333333334444444455000001770000000000005000"), 0x021601), + (hex!("0122222222333333334444444455000001770000000000007090"), 0x0216a1), + (hex!("0122222222333333334444444455000001780000000000001790"), 0x021741), + (hex!("01222222223333333344444444550000017800000000000048a0"), 0x0217e1), + (hex!("0122222222333333334444444455000001780000000000006bf0"), 0x021881), + (hex!("01222222223333333344444444550000017900000000000017a0"), 0x021921), + (hex!("01222222223333333344444444550000017900000000000057d0"), 0x0219c1), + (hex!("0122222222333333334444444455000001790000000000006660"), 0x021a61), + (hex!("01222222223333333344444444550000017a00000000000017b0"), 0x021b01), + (hex!("01222222223333333344444444550000017a0000000000004970"), 0x021ba1), + (hex!("01222222223333333344444444550000017a0000000000005dc0"), 0x021c41), + (hex!("01222222223333333344444444550000017b00000000000017c0"), 0x021ce1), + (hex!("01222222223333333344444444550000017b0000000000004ee0"), 0x021d81), + (hex!("01222222223333333344444444550000017b00000000000054c0"), 0x021e21), + (hex!("01222222223333333344444444550000017c00000000000017d0"), 0x021ec1), + (hex!("01222222223333333344444444550000017c0000000000003fc0"), 0x021f61), + (hex!("01222222223333333344444444550000017c00000000000063e0"), 0x022001), + (hex!("01222222223333333344444444550000017c0000000000006520"), 0x0220a1), + (hex!("01222222223333333344444444550000017d00000000000017e0"), 0x022141), + (hex!("01222222223333333344444444550000017d0000000000006220"), 0x0221e1), + (hex!("01222222223333333344444444550000017d0000000000007120"), 0x022281), + (hex!("01222222223333333344444444550000017e00000000000017f0"), 0x022321), + (hex!("01222222223333333344444444550000017f0000000000001800"), 0x0223c1), + (hex!("0122222222333333334444444455000001800000000000001810"), 0x022461), + (hex!("0122222222333333334444444455000001810000000000001820"), 0x022501), + (hex!("01222222223333333344444444550000018100000000000041f0"), 0x0225a1), + (hex!("0122222222333333334444444455000001810000000000007590"), 0x022641), + (hex!("0122222222333333334444444455000001820000000000001830"), 0x0226e1), + (hex!("0122222222333333334444444455000001820000000000004ce0"), 0x022781), + (hex!("0122222222333333334444444455000001830000000000001840"), 0x022821), + 
(hex!("01222222223333333344444444550000018300000000000042c0"), 0x0228c1), + (hex!("0122222222333333334444444455000001840000000000001850"), 0x022961), + (hex!("0122222222333333334444444455000001840000000000004f70"), 0x022a01), + (hex!("0122222222333333334444444455000001850000000000001860"), 0x022aa1), + (hex!("0122222222333333334444444455000001850000000000006470"), 0x022b41), + (hex!("0122222222333333334444444455000001850000000000007500"), 0x022be1), + (hex!("0122222222333333334444444455000001860000000000001870"), 0x022c81), + (hex!("0122222222333333334444444455000001860000000000004770"), 0x022d21), + (hex!("0122222222333333334444444455000001870000000000001880"), 0x022dc1), + (hex!("0122222222333333334444444455000001870000000000006a30"), 0x022e61), + (hex!("0122222222333333334444444455000001880000000000001890"), 0x022f01), + (hex!("0122222222333333334444444455000001880000000000007410"), 0x022fa1), + (hex!("01222222223333333344444444550000018900000000000018a0"), 0x023041), + (hex!("01222222223333333344444444550000018900000000000044d0"), 0x0230e1), + (hex!("0122222222333333334444444455000001890000000000005ac0"), 0x023181), + (hex!("01222222223333333344444444550000018a00000000000018b0"), 0x023221), + (hex!("01222222223333333344444444550000018a0000000000006260"), 0x0232c1), + (hex!("01222222223333333344444444550000018a0000000000006d70"), 0x023361), + (hex!("01222222223333333344444444550000018b00000000000018c0"), 0x023401), + (hex!("01222222223333333344444444550000018b0000000000004aa0"), 0x0234a1), + (hex!("01222222223333333344444444550000018b0000000000006fd0"), 0x023541), + (hex!("01222222223333333344444444550000018c00000000000018d0"), 0x0235e1), + (hex!("01222222223333333344444444550000018c00000000000051b0"), 0x023681), + (hex!("01222222223333333344444444550000018c0000000000006650"), 0x023721), + (hex!("01222222223333333344444444550000018d00000000000018e0"), 0x0237c1), + (hex!("01222222223333333344444444550000018e00000000000018f0"), 0x023861), + (hex!("01222222223333333344444444550000018e00000000000041d0"), 0x023901), + (hex!("01222222223333333344444444550000018f0000000000001900"), 0x0239a1), + (hex!("01222222223333333344444444550000018f0000000000007600"), 0x023a41), + (hex!("0122222222333333334444444455000001900000000000001910"), 0x023ae1), + (hex!("0122222222333333334444444455000001900000000000005410"), 0x023b81), + (hex!("0122222222333333334444444455000001900000000000006760"), 0x023c21), + (hex!("0122222222333333334444444455000001910000000000001920"), 0x023cc1), + (hex!("0122222222333333334444444455000001920000000000001930"), 0x023d61), + (hex!("0122222222333333334444444455000001920000000000004ca0"), 0x023e01), + (hex!("0122222222333333334444444455000001920000000000005d80"), 0x023ea1), + (hex!("0122222222333333334444444455000001920000000000005fd0"), 0x023f41), + (hex!("01222222223333333344444444550000019200000000000070d0"), 0x023fe1), + (hex!("0122222222333333334444444455000001930000000000001940"), 0x024081), + (hex!("0122222222333333334444444455000001930000000000004010"), 0x024121), + (hex!("0122222222333333334444444455000001930000000000007ca0"), 0x0241c1), + (hex!("0122222222333333334444444455000001940000000000001950"), 0x024261), + (hex!("0122222222333333334444444455000001950000000000001960"), 0x024301), + (hex!("0122222222333333334444444455000001950000000000005380"), 0x0243a1), + (hex!("0122222222333333334444444455000001960000000000001970"), 0x024441), + (hex!("0122222222333333334444444455000001960000000000006de0"), 0x0244e1), + 
(hex!("0122222222333333334444444455000001970000000000001980"), 0x024581), + (hex!("01222222223333333344444444550000019700000000000048f0"), 0x024621), + (hex!("0122222222333333334444444455000001980000000000001990"), 0x0246c1), + (hex!("0122222222333333334444444455000001980000000000006510"), 0x024761), + (hex!("01222222223333333344444444550000019900000000000019a0"), 0x024801), + (hex!("0122222222333333334444444455000001990000000000007570"), 0x0248a1), + (hex!("0122222222333333334444444455000001990000000000007580"), 0x024941), + (hex!("01222222223333333344444444550000019a00000000000019b0"), 0x0249e1), + (hex!("01222222223333333344444444550000019a0000000000004050"), 0x024a81), + (hex!("01222222223333333344444444550000019a0000000000004ba0"), 0x024b21), + (hex!("01222222223333333344444444550000019a0000000000005540"), 0x024bc1), + (hex!("01222222223333333344444444550000019a00000000000061c0"), 0x024c61), + (hex!("01222222223333333344444444550000019a0000000000007c60"), 0x024d01), + (hex!("01222222223333333344444444550000019b00000000000019c0"), 0x024da1), + (hex!("01222222223333333344444444550000019b0000000000006240"), 0x024e41), + (hex!("01222222223333333344444444550000019c00000000000019d0"), 0x024ee1), + (hex!("01222222223333333344444444550000019d00000000000019e0"), 0x024f81), + (hex!("01222222223333333344444444550000019d0000000000004640"), 0x025021), + (hex!("01222222223333333344444444550000019d00000000000052a0"), 0x0250c1), + (hex!("01222222223333333344444444550000019d00000000000052b0"), 0x025161), + (hex!("01222222223333333344444444550000019e00000000000019f0"), 0x025201), + (hex!("01222222223333333344444444550000019f0000000000001a00"), 0x0252a1), + (hex!("01222222223333333344444444550000019f0000000000006b20"), 0x025341), + (hex!("0122222222333333334444444455000001a00000000000001a10"), 0x0253e1), + (hex!("0122222222333333334444444455000001a10000000000001a20"), 0x025481), + (hex!("0122222222333333334444444455000001a10000000000005460"), 0x025521), + (hex!("0122222222333333334444444455000001a10000000000005d20"), 0x0255c1), + (hex!("0122222222333333334444444455000001a100000000000068f0"), 0x025661), + (hex!("0122222222333333334444444455000001a20000000000001a30"), 0x025701), + (hex!("0122222222333333334444444455000001a20000000000007170"), 0x0257a1), + (hex!("0122222222333333334444444455000001a30000000000001a40"), 0x025841), + (hex!("0122222222333333334444444455000001a40000000000001a50"), 0x0258e1), + (hex!("0122222222333333334444444455000001a50000000000001a60"), 0x025981), + (hex!("0122222222333333334444444455000001a60000000000001a70"), 0x025a21), + (hex!("0122222222333333334444444455000001a70000000000001a80"), 0x025ac1), + (hex!("0122222222333333334444444455000001a70000000000005a90"), 0x025b61), + (hex!("0122222222333333334444444455000001a70000000000006440"), 0x025c01), + (hex!("0122222222333333334444444455000001a80000000000001a90"), 0x025ca1), + (hex!("0122222222333333334444444455000001a80000000000004800"), 0x025d41), + (hex!("0122222222333333334444444455000001a90000000000001aa0"), 0x025de1), + (hex!("0122222222333333334444444455000001aa0000000000001ab0"), 0x025e81), + (hex!("0122222222333333334444444455000001aa0000000000005b60"), 0x025f21), + (hex!("0122222222333333334444444455000001ab0000000000001ac0"), 0x025fc1), + (hex!("0122222222333333334444444455000001ab0000000000006700"), 0x026061), + (hex!("0122222222333333334444444455000001ab00000000000071d0"), 0x026101), + (hex!("0122222222333333334444444455000001ac0000000000001ad0"), 0x0261a1), + 
(hex!("0122222222333333334444444455000001ac0000000000007380"), 0x026241), + (hex!("0122222222333333334444444455000001ad0000000000001ae0"), 0x0262e1), + (hex!("0122222222333333334444444455000001ad0000000000006350"), 0x026381), + (hex!("0122222222333333334444444455000001ae0000000000001af0"), 0x026421), + (hex!("0122222222333333334444444455000001af0000000000001b00"), 0x0264c1), + (hex!("0122222222333333334444444455000001af0000000000007390"), 0x026561), + (hex!("0122222222333333334444444455000001b00000000000001b10"), 0x026601), + (hex!("0122222222333333334444444455000001b10000000000001b20"), 0x0266a1), + (hex!("0122222222333333334444444455000001b10000000000005cc0"), 0x026741), + (hex!("0122222222333333334444444455000001b20000000000001b30"), 0x0267e1), + (hex!("0122222222333333334444444455000001b20000000000004fb0"), 0x026881), + (hex!("0122222222333333334444444455000001b30000000000001b40"), 0x026921), + (hex!("0122222222333333334444444455000001b40000000000001b50"), 0x0269c1), + (hex!("0122222222333333334444444455000001b50000000000001b60"), 0x026a61), + (hex!("0122222222333333334444444455000001b60000000000001b70"), 0x026b01), + (hex!("0122222222333333334444444455000001b600000000000048e0"), 0x026ba1), + (hex!("0122222222333333334444444455000001b70000000000001b80"), 0x026c41), + (hex!("0122222222333333334444444455000001b70000000000005ca0"), 0x026ce1), + (hex!("0122222222333333334444444455000001b70000000000007900"), 0x026d81), + (hex!("0122222222333333334444444455000001b80000000000001b90"), 0x026e21), + (hex!("0122222222333333334444444455000001b80000000000004d90"), 0x026ec1), + (hex!("0122222222333333334444444455000001b90000000000001ba0"), 0x026f61), + (hex!("0122222222333333334444444455000001b90000000000003f40"), 0x027001), + (hex!("0122222222333333334444444455000001ba0000000000001bb0"), 0x0270a1), + (hex!("0122222222333333334444444455000001ba00000000000042a0"), 0x027141), + (hex!("0122222222333333334444444455000001ba00000000000067f0"), 0x0271e1), + (hex!("0122222222333333334444444455000001ba00000000000073a0"), 0x027281), + (hex!("0122222222333333334444444455000001bb0000000000001bc0"), 0x027321), + (hex!("0122222222333333334444444455000001bb0000000000004a00"), 0x0273c1), + (hex!("0122222222333333334444444455000001bb0000000000005e00"), 0x027461), + (hex!("0122222222333333334444444455000001bc0000000000001bd0"), 0x027501), + (hex!("0122222222333333334444444455000001bc0000000000004230"), 0x0275a1), + (hex!("0122222222333333334444444455000001bc0000000000005860"), 0x027641), + (hex!("0122222222333333334444444455000001bd0000000000001be0"), 0x0276e1), + (hex!("0122222222333333334444444455000001bd0000000000007c70"), 0x027781), + (hex!("0122222222333333334444444455000001be0000000000001bf0"), 0x027821), + (hex!("0122222222333333334444444455000001be0000000000007770"), 0x0278c1), + (hex!("0122222222333333334444444455000001be0000000000007cf0"), 0x027961), + (hex!("0122222222333333334444444455000001bf0000000000001c00"), 0x027a01), + (hex!("0122222222333333334444444455000001bf0000000000006490"), 0x027aa1), + (hex!("0122222222333333334444444455000001c00000000000001c10"), 0x027b41), + (hex!("0122222222333333334444444455000001c10000000000001c20"), 0x027be1), + (hex!("0122222222333333334444444455000001c10000000000004600"), 0x027c81), + (hex!("0122222222333333334444444455000001c20000000000001c30"), 0x027d21), + (hex!("0122222222333333334444444455000001c20000000000006e30"), 0x027dc1), + (hex!("0122222222333333334444444455000001c30000000000001c40"), 0x027e61), + 
(hex!("0122222222333333334444444455000001c40000000000001c50"), 0x027f01), + (hex!("0122222222333333334444444455000001c50000000000001c60"), 0x027fa1), + (hex!("0122222222333333334444444455000001c60000000000001c70"), 0x028041), + (hex!("0122222222333333334444444455000001c60000000000004240"), 0x0280e1), + (hex!("0122222222333333334444444455000001c60000000000005bb0"), 0x028181), + (hex!("0122222222333333334444444455000001c70000000000001c80"), 0x028221), + (hex!("0122222222333333334444444455000001c80000000000001c90"), 0x0282c1), + (hex!("0122222222333333334444444455000001c90000000000001ca0"), 0x028361), + (hex!("0122222222333333334444444455000001c90000000000006730"), 0x028401), + (hex!("0122222222333333334444444455000001ca0000000000001cb0"), 0x0284a1), + (hex!("0122222222333333334444444455000001ca00000000000070f0"), 0x028541), + (hex!("0122222222333333334444444455000001cb0000000000001cc0"), 0x0285e1), + (hex!("0122222222333333334444444455000001cb00000000000071a0"), 0x028681), + (hex!("0122222222333333334444444455000001cc0000000000001cd0"), 0x028721), + (hex!("0122222222333333334444444455000001cc0000000000005280"), 0x0287c1), + (hex!("0122222222333333334444444455000001cc0000000000005d90"), 0x028861), + (hex!("0122222222333333334444444455000001cd0000000000001ce0"), 0x028901), + (hex!("0122222222333333334444444455000001cd00000000000069b0"), 0x0289a1), + (hex!("0122222222333333334444444455000001ce0000000000001cf0"), 0x028a41), + (hex!("0122222222333333334444444455000001ce0000000000004540"), 0x028ae1), + (hex!("0122222222333333334444444455000001cf0000000000001d00"), 0x028b81), + (hex!("0122222222333333334444444455000001cf00000000000076a0"), 0x028c21), + (hex!("0122222222333333334444444455000001d00000000000001d10"), 0x028cc1), + (hex!("0122222222333333334444444455000001d000000000000060a0"), 0x028d61), + (hex!("0122222222333333334444444455000001d10000000000001d20"), 0x028e01), + (hex!("0122222222333333334444444455000001d20000000000001d30"), 0x028ea1), + (hex!("0122222222333333334444444455000001d30000000000001d40"), 0x028f41), + (hex!("0122222222333333334444444455000001d30000000000004000"), 0x028fe1), + (hex!("0122222222333333334444444455000001d30000000000004140"), 0x029081), + (hex!("0122222222333333334444444455000001d30000000000006790"), 0x029121), + (hex!("0122222222333333334444444455000001d40000000000001d50"), 0x0291c1), + (hex!("0122222222333333334444444455000001d50000000000001d60"), 0x029261), + (hex!("0122222222333333334444444455000001d60000000000001d70"), 0x029301), + (hex!("0122222222333333334444444455000001d60000000000004b50"), 0x0293a1), + (hex!("0122222222333333334444444455000001d60000000000007430"), 0x029441), + (hex!("0122222222333333334444444455000001d70000000000001d80"), 0x0294e1), + (hex!("0122222222333333334444444455000001d70000000000006920"), 0x029581), + (hex!("0122222222333333334444444455000001d80000000000001d90"), 0x029621), + (hex!("0122222222333333334444444455000001d80000000000005b30"), 0x0296c1), + (hex!("0122222222333333334444444455000001d90000000000001da0"), 0x029761), + (hex!("0122222222333333334444444455000001da0000000000001db0"), 0x029801), + (hex!("0122222222333333334444444455000001da0000000000004af0"), 0x0298a1), + (hex!("0122222222333333334444444455000001da0000000000007240"), 0x029941), + (hex!("0122222222333333334444444455000001da0000000000007470"), 0x0299e1), + (hex!("0122222222333333334444444455000001db0000000000001dc0"), 0x029a81), + (hex!("0122222222333333334444444455000001db00000000000045d0"), 0x029b21), + 
(hex!("0122222222333333334444444455000001dc0000000000001dd0"), 0x029bc1), + (hex!("0122222222333333334444444455000001dd0000000000001de0"), 0x029c61), + (hex!("0122222222333333334444444455000001dd0000000000004bb0"), 0x029d01), + (hex!("0122222222333333334444444455000001dd0000000000004cd0"), 0x029da1), + (hex!("0122222222333333334444444455000001dd0000000000006100"), 0x029e41), + (hex!("0122222222333333334444444455000001dd0000000000007bb0"), 0x029ee1), + (hex!("0122222222333333334444444455000001de0000000000001df0"), 0x029f81), + (hex!("0122222222333333334444444455000001de0000000000004260"), 0x02a021), + (hex!("0122222222333333334444444455000001de0000000000006040"), 0x02a0c1), + (hex!("0122222222333333334444444455000001df0000000000001e00"), 0x02a161), + (hex!("0122222222333333334444444455000001df0000000000005fa0"), 0x02a201), + (hex!("0122222222333333334444444455000001df0000000000006a70"), 0x02a2a1), + (hex!("0122222222333333334444444455000001df0000000000006dc0"), 0x02a341), + (hex!("0122222222333333334444444455000001e00000000000001e10"), 0x02a3e1), + (hex!("0122222222333333334444444455000001e00000000000007010"), 0x02a481), + (hex!("0122222222333333334444444455000001e10000000000001e20"), 0x02a521), + (hex!("0122222222333333334444444455000001e10000000000005720"), 0x02a5c1), + (hex!("0122222222333333334444444455000001e10000000000006830"), 0x02a661), + (hex!("0122222222333333334444444455000001e20000000000001e30"), 0x02a701), + (hex!("0122222222333333334444444455000001e20000000000005100"), 0x02a7a1), + (hex!("0122222222333333334444444455000001e30000000000001e40"), 0x02a841), + (hex!("0122222222333333334444444455000001e40000000000001e50"), 0x02a8e1), + (hex!("0122222222333333334444444455000001e40000000000003f30"), 0x02a981), + (hex!("0122222222333333334444444455000001e40000000000005220"), 0x02aa21), + (hex!("0122222222333333334444444455000001e50000000000001e60"), 0x02aac1), + (hex!("0122222222333333334444444455000001e50000000000006f60"), 0x02ab61), + (hex!("0122222222333333334444444455000001e60000000000001e70"), 0x02ac01), + (hex!("0122222222333333334444444455000001e60000000000006c80"), 0x02aca1), + (hex!("0122222222333333334444444455000001e70000000000001e80"), 0x02ad41), + (hex!("0122222222333333334444444455000001e80000000000001e90"), 0x02ade1), + (hex!("0122222222333333334444444455000001e80000000000004e30"), 0x02ae81), + (hex!("0122222222333333334444444455000001e90000000000001ea0"), 0x02af21), + (hex!("0122222222333333334444444455000001e90000000000005470"), 0x02afc1), + (hex!("0122222222333333334444444455000001ea0000000000001eb0"), 0x02b061), + (hex!("0122222222333333334444444455000001ea0000000000007980"), 0x02b101), + (hex!("0122222222333333334444444455000001eb0000000000001ec0"), 0x02b1a1), + (hex!("0122222222333333334444444455000001eb0000000000004390"), 0x02b241), + (hex!("0122222222333333334444444455000001eb0000000000005970"), 0x02b2e1), + (hex!("0122222222333333334444444455000001ec0000000000001ed0"), 0x02b381), + (hex!("0122222222333333334444444455000001ec0000000000005d50"), 0x02b421), + (hex!("0122222222333333334444444455000001ec00000000000076e0"), 0x02b4c1), + (hex!("0122222222333333334444444455000001ed0000000000001ee0"), 0x02b561), + (hex!("0122222222333333334444444455000001ed0000000000006190"), 0x02b601), + (hex!("0122222222333333334444444455000001ee0000000000001ef0"), 0x02b6a1), + (hex!("0122222222333333334444444455000001ee0000000000004900"), 0x02b741), + (hex!("0122222222333333334444444455000001ef0000000000001f00"), 0x02b7e1), + 
(hex!("0122222222333333334444444455000001ef0000000000006c60"), 0x02b881), + (hex!("0122222222333333334444444455000001f00000000000001f10"), 0x02b921), + (hex!("0122222222333333334444444455000001f00000000000006950"), 0x02b9c1), + (hex!("0122222222333333334444444455000001f10000000000001f20"), 0x02ba61), + (hex!("0122222222333333334444444455000001f10000000000006400"), 0x02bb01), + (hex!("0122222222333333334444444455000001f20000000000001f30"), 0x02bba1), + (hex!("0122222222333333334444444455000001f20000000000006f00"), 0x02bc41), + (hex!("0122222222333333334444444455000001f20000000000007b10"), 0x02bce1), + (hex!("0122222222333333334444444455000001f30000000000001f40"), 0x02bd81), + (hex!("0122222222333333334444444455000001f40000000000001f50"), 0x02be21), + (hex!("0122222222333333334444444455000001f50000000000001f60"), 0x02bec1), + (hex!("0122222222333333334444444455000001f500000000000044f0"), 0x02bf61), + (hex!("0122222222333333334444444455000001f60000000000001f70"), 0x02c001), + (hex!("0122222222333333334444444455000001f70000000000001f80"), 0x02c0a1), + (hex!("0122222222333333334444444455000001f70000000000004ad0"), 0x02c141), + (hex!("0122222222333333334444444455000001f80000000000001f90"), 0x02c1e1), + (hex!("0122222222333333334444444455000001f90000000000001fa0"), 0x02c281), + (hex!("0122222222333333334444444455000001f90000000000003f60"), 0x02c321), + (hex!("0122222222333333334444444455000001f90000000000004a80"), 0x02c3c1), + (hex!("0122222222333333334444444455000001fa0000000000001fb0"), 0x02c461), + (hex!("0122222222333333334444444455000001fa0000000000006f90"), 0x02c501), + (hex!("0122222222333333334444444455000001fb0000000000001fc0"), 0x02c5a1), + (hex!("0122222222333333334444444455000001fc0000000000001fd0"), 0x02c641), + (hex!("0122222222333333334444444455000001fc0000000000004a90"), 0x02c6e1), + (hex!("0122222222333333334444444455000001fd0000000000001fe0"), 0x02c781), + (hex!("0122222222333333334444444455000001fd0000000000005f50"), 0x02c821), + (hex!("0122222222333333334444444455000001fe0000000000001ff0"), 0x02c8c1), + (hex!("0122222222333333334444444455000001ff0000000000002000"), 0x02c961), + (hex!("0122222222333333334444444455000002000000000000002010"), 0x02ca01), + (hex!("0122222222333333334444444455000002000000000000005f00"), 0x02caa1), + (hex!("0122222222333333334444444455000002000000000000006840"), 0x02cb41), + (hex!("0122222222333333334444444455000002010000000000002020"), 0x02cbe1), + (hex!("0122222222333333334444444455000002020000000000002030"), 0x02cc81), + (hex!("0122222222333333334444444455000002030000000000002040"), 0x02cd21), + (hex!("0122222222333333334444444455000002040000000000002050"), 0x02cdc1), + (hex!("01222222223333333344444444550000020400000000000051f0"), 0x02ce61), + (hex!("0122222222333333334444444455000002050000000000002060"), 0x02cf01), + (hex!("0122222222333333334444444455000002060000000000002070"), 0x02cfa1), + (hex!("0122222222333333334444444455000002060000000000005c80"), 0x02d041), + (hex!("01222222223333333344444444550000020600000000000061d0"), 0x02d0e1), + (hex!("01222222223333333344444444550000020600000000000078c0"), 0x02d181), + (hex!("0122222222333333334444444455000002070000000000002080"), 0x02d221), + (hex!("0122222222333333334444444455000002070000000000006ba0"), 0x02d2c1), + (hex!("0122222222333333334444444455000002080000000000002090"), 0x02d361), + (hex!("01222222223333333344444444550000020900000000000020a0"), 0x02d401), + (hex!("01222222223333333344444444550000020900000000000067a0"), 0x02d4a1), + 
(hex!("01222222223333333344444444550000020a00000000000020b0"), 0x02d541), + (hex!("01222222223333333344444444550000020a0000000000004950"), 0x02d5e1), + (hex!("01222222223333333344444444550000020a0000000000004de0"), 0x02d681), + (hex!("01222222223333333344444444550000020b00000000000020c0"), 0x02d721), + (hex!("01222222223333333344444444550000020b0000000000004b00"), 0x02d7c1), + (hex!("01222222223333333344444444550000020c00000000000020d0"), 0x02d861), + (hex!("01222222223333333344444444550000020d00000000000020e0"), 0x02d901), + (hex!("01222222223333333344444444550000020e00000000000020f0"), 0x02d9a1), + (hex!("01222222223333333344444444550000020f0000000000002100"), 0x02da41), + (hex!("0122222222333333334444444455000002100000000000002110"), 0x02dae1), + (hex!("0122222222333333334444444455000002110000000000002120"), 0x02db81), + (hex!("0122222222333333334444444455000002110000000000004490"), 0x02dc21), + (hex!("0122222222333333334444444455000002120000000000002130"), 0x02dcc1), + (hex!("0122222222333333334444444455000002130000000000002140"), 0x02dd61), + (hex!("01222222223333333344444444550000021300000000000046d0"), 0x02de01), + (hex!("01222222223333333344444444550000021300000000000046e0"), 0x02dea1), + (hex!("0122222222333333334444444455000002130000000000004b70"), 0x02df41), + (hex!("0122222222333333334444444455000002140000000000002150"), 0x02dfe1), + (hex!("0122222222333333334444444455000002140000000000006c50"), 0x02e081), + (hex!("0122222222333333334444444455000002150000000000002160"), 0x02e121), + (hex!("01222222223333333344444444550000021500000000000043c0"), 0x02e1c1), + (hex!("0122222222333333334444444455000002160000000000002170"), 0x02e261), + (hex!("01222222223333333344444444550000021600000000000055b0"), 0x02e301), + (hex!("0122222222333333334444444455000002160000000000006150"), 0x02e3a1), + (hex!("0122222222333333334444444455000002170000000000002180"), 0x02e441), + (hex!("01222222223333333344444444550000021700000000000053b0"), 0x02e4e1), + (hex!("0122222222333333334444444455000002170000000000007460"), 0x02e581), + (hex!("0122222222333333334444444455000002180000000000002190"), 0x02e621), + (hex!("01222222223333333344444444550000021900000000000021a0"), 0x02e6c1), + (hex!("01222222223333333344444444550000021a00000000000021b0"), 0x02e761), + (hex!("01222222223333333344444444550000021a0000000000007650"), 0x02e801), + (hex!("01222222223333333344444444550000021b00000000000021c0"), 0x02e8a1), + (hex!("01222222223333333344444444550000021b0000000000004b20"), 0x02e941), + (hex!("01222222223333333344444444550000021c00000000000021d0"), 0x02e9e1), + (hex!("01222222223333333344444444550000021c0000000000007610"), 0x02ea81), + (hex!("01222222223333333344444444550000021d00000000000021e0"), 0x02eb21), + (hex!("01222222223333333344444444550000021d0000000000005f40"), 0x02ebc1), + (hex!("01222222223333333344444444550000021e00000000000021f0"), 0x02ec61), + (hex!("01222222223333333344444444550000021e0000000000005a50"), 0x02ed01), + (hex!("01222222223333333344444444550000021e0000000000005ff0"), 0x02eda1), + (hex!("01222222223333333344444444550000021f0000000000002200"), 0x02ee41), + (hex!("01222222223333333344444444550000021f00000000000043a0"), 0x02eee1), + (hex!("01222222223333333344444444550000021f0000000000004cb0"), 0x02ef81), + (hex!("01222222223333333344444444550000021f0000000000004e00"), 0x02f021), + (hex!("0122222222333333334444444455000002200000000000002210"), 0x02f0c1), + (hex!("0122222222333333334444444455000002210000000000002220"), 0x02f161), + 
(hex!("0122222222333333334444444455000002210000000000006290"), 0x02f201), + (hex!("0122222222333333334444444455000002210000000000007230"), 0x02f2a1), + (hex!("0122222222333333334444444455000002220000000000002230"), 0x02f341), + (hex!("0122222222333333334444444455000002220000000000006ea0"), 0x02f3e1), + (hex!("0122222222333333334444444455000002230000000000002240"), 0x02f481), + (hex!("0122222222333333334444444455000002230000000000004710"), 0x02f521), + (hex!("0122222222333333334444444455000002240000000000002250"), 0x02f5c1), + (hex!("0122222222333333334444444455000002250000000000002260"), 0x02f661), + (hex!("0122222222333333334444444455000002260000000000002270"), 0x02f701), + (hex!("0122222222333333334444444455000002260000000000005b40"), 0x02f7a1), + (hex!("0122222222333333334444444455000002260000000000006300"), 0x02f841), + (hex!("0122222222333333334444444455000002270000000000002280"), 0x02f8e1), + (hex!("0122222222333333334444444455000002270000000000005b80"), 0x02f981), + (hex!("0122222222333333334444444455000002280000000000002290"), 0x02fa21), + (hex!("0122222222333333334444444455000002280000000000003ed0"), 0x02fac1), + (hex!("0122222222333333334444444455000002280000000000004550"), 0x02fb61), + (hex!("01222222223333333344444444550000022800000000000077d0"), 0x02fc01), + (hex!("01222222223333333344444444550000022900000000000022a0"), 0x02fca1), + (hex!("0122222222333333334444444455000002290000000000006480"), 0x02fd41), + (hex!("01222222223333333344444444550000022a00000000000022b0"), 0x02fde1), + (hex!("01222222223333333344444444550000022a0000000000005450"), 0x02fe81), + (hex!("01222222223333333344444444550000022b00000000000022c0"), 0x02ff21), + (hex!("01222222223333333344444444550000022b0000000000006dd0"), 0x02ffc1), + (hex!("01222222223333333344444444550000022c00000000000022d0"), 0x030061), + (hex!("01222222223333333344444444550000022c0000000000006890"), 0x030101), + (hex!("01222222223333333344444444550000022d00000000000022e0"), 0x0301a1), + (hex!("01222222223333333344444444550000022e00000000000022f0"), 0x030241), + (hex!("01222222223333333344444444550000022e0000000000004f20"), 0x0302e1), + (hex!("01222222223333333344444444550000022f0000000000002300"), 0x030381), + (hex!("01222222223333333344444444550000022f0000000000005260"), 0x030421), + (hex!("01222222223333333344444444550000022f00000000000053f0"), 0x0304c1), + (hex!("0122222222333333334444444455000002300000000000002310"), 0x030561), + (hex!("01222222223333333344444444550000023000000000000050e0"), 0x030601), + (hex!("0122222222333333334444444455000002310000000000002320"), 0x0306a1), + (hex!("0122222222333333334444444455000002310000000000007800"), 0x030741), + (hex!("0122222222333333334444444455000002320000000000002330"), 0x0307e1), + (hex!("0122222222333333334444444455000002330000000000002340"), 0x030881), + (hex!("0122222222333333334444444455000002330000000000004d70"), 0x030921), + (hex!("0122222222333333334444444455000002330000000000005cf0"), 0x0309c1), + (hex!("0122222222333333334444444455000002340000000000002350"), 0x030a61), + (hex!("0122222222333333334444444455000002350000000000002360"), 0x030b01), + (hex!("0122222222333333334444444455000002350000000000006970"), 0x030ba1), + (hex!("0122222222333333334444444455000002360000000000002370"), 0x030c41), + (hex!("0122222222333333334444444455000002360000000000005270"), 0x030ce1), + (hex!("0122222222333333334444444455000002370000000000002380"), 0x030d81), + (hex!("0122222222333333334444444455000002370000000000005d70"), 0x030e21), + 
(hex!("0122222222333333334444444455000002380000000000002390"), 0x030ec1), + (hex!("01222222223333333344444444550000023800000000000069a0"), 0x030f61), + (hex!("01222222223333333344444444550000023900000000000023a0"), 0x031001), + (hex!("01222222223333333344444444550000023900000000000052e0"), 0x0310a1), + (hex!("0122222222333333334444444455000002390000000000005a10"), 0x031141), + (hex!("0122222222333333334444444455000002390000000000007440"), 0x0311e1), + (hex!("01222222223333333344444444550000023a00000000000023b0"), 0x031281), + (hex!("01222222223333333344444444550000023a0000000000003f00"), 0x031321), + (hex!("01222222223333333344444444550000023a0000000000004430"), 0x0313c1), + (hex!("01222222223333333344444444550000023a0000000000007070"), 0x031461), + (hex!("01222222223333333344444444550000023a00000000000074a0"), 0x031501), + (hex!("01222222223333333344444444550000023b00000000000023c0"), 0x0315a1), + (hex!("01222222223333333344444444550000023b0000000000004730"), 0x031641), + (hex!("01222222223333333344444444550000023b00000000000068b0"), 0x0316e1), + (hex!("01222222223333333344444444550000023c00000000000023d0"), 0x031781), + (hex!("01222222223333333344444444550000023c0000000000004680"), 0x031821), + (hex!("01222222223333333344444444550000023d00000000000023e0"), 0x0318c1), + (hex!("01222222223333333344444444550000023d00000000000059a0"), 0x031961), + (hex!("01222222223333333344444444550000023e00000000000023f0"), 0x031a01), + (hex!("01222222223333333344444444550000023f0000000000002400"), 0x031aa1), + (hex!("0122222222333333334444444455000002400000000000002410"), 0x031b41), + (hex!("0122222222333333334444444455000002400000000000004920"), 0x031be1), + (hex!("01222222223333333344444444550000024000000000000066e0"), 0x031c81), + (hex!("01222222223333333344444444550000024000000000000076f0"), 0x031d21), + (hex!("01222222223333333344444444550000024000000000000078e0"), 0x031dc1), + (hex!("0122222222333333334444444455000002410000000000002420"), 0x031e61), + (hex!("0122222222333333334444444455000002420000000000002430"), 0x031f01), + (hex!("0122222222333333334444444455000002420000000000006590"), 0x031fa1), + (hex!("0122222222333333334444444455000002430000000000002440"), 0x032041), + (hex!("0122222222333333334444444455000002430000000000004d00"), 0x0320e1), + (hex!("0122222222333333334444444455000002440000000000002450"), 0x032181), + (hex!("0122222222333333334444444455000002440000000000005f80"), 0x032221), + (hex!("0122222222333333334444444455000002450000000000002460"), 0x0322c1), + (hex!("0122222222333333334444444455000002450000000000004940"), 0x032361), + (hex!("0122222222333333334444444455000002460000000000002470"), 0x032401), + (hex!("0122222222333333334444444455000002470000000000002480"), 0x0324a1), + (hex!("0122222222333333334444444455000002470000000000004dd0"), 0x032541), + (hex!("0122222222333333334444444455000002470000000000005930"), 0x0325e1), + (hex!("01222222223333333344444444550000024700000000000061b0"), 0x032681), + (hex!("0122222222333333334444444455000002470000000000007740"), 0x032721), + (hex!("0122222222333333334444444455000002480000000000002490"), 0x0327c1), + (hex!("0122222222333333334444444455000002480000000000004890"), 0x032861), + (hex!("01222222223333333344444444550000024900000000000024a0"), 0x032901), + (hex!("01222222223333333344444444550000024a00000000000024b0"), 0x0329a1), + (hex!("01222222223333333344444444550000024b00000000000024c0"), 0x032a41), + (hex!("01222222223333333344444444550000024c00000000000024d0"), 0x032ae1), + 
(hex!("01222222223333333344444444550000024d00000000000024e0"), 0x032b81), + (hex!("01222222223333333344444444550000024d0000000000004070"), 0x032c21), + (hex!("01222222223333333344444444550000024e00000000000024f0"), 0x032cc1), + (hex!("01222222223333333344444444550000024e00000000000066a0"), 0x032d61), + (hex!("01222222223333333344444444550000024e0000000000006ab0"), 0x032e01), + (hex!("01222222223333333344444444550000024f0000000000002500"), 0x032ea1), + (hex!("0122222222333333334444444455000002500000000000002510"), 0x032f41), + (hex!("0122222222333333334444444455000002510000000000002520"), 0x032fe1), + (hex!("0122222222333333334444444455000002510000000000007320"), 0x033081), + (hex!("0122222222333333334444444455000002520000000000002530"), 0x033121), + (hex!("0122222222333333334444444455000002520000000000006410"), 0x0331c1), + (hex!("0122222222333333334444444455000002530000000000002540"), 0x033261), + (hex!("0122222222333333334444444455000002530000000000005110"), 0x033301), + (hex!("0122222222333333334444444455000002540000000000002550"), 0x0333a1), + (hex!("01222222223333333344444444550000025400000000000040c0"), 0x033441), + (hex!("0122222222333333334444444455000002540000000000006a40"), 0x0334e1), + (hex!("0122222222333333334444444455000002550000000000002560"), 0x033581), + (hex!("0122222222333333334444444455000002550000000000005190"), 0x033621), + (hex!("0122222222333333334444444455000002560000000000002570"), 0x0336c1), + (hex!("01222222223333333344444444550000025600000000000061f0"), 0x033761), + (hex!("0122222222333333334444444455000002570000000000002580"), 0x033801), + (hex!("0122222222333333334444444455000002580000000000002590"), 0x0338a1), + (hex!("01222222223333333344444444550000025800000000000043d0"), 0x033941), + (hex!("01222222223333333344444444550000025900000000000025a0"), 0x0339e1), + (hex!("0122222222333333334444444455000002590000000000006bb0"), 0x033a81), + (hex!("01222222223333333344444444550000025a00000000000025b0"), 0x033b21), + (hex!("01222222223333333344444444550000025a0000000000005fb0"), 0x033bc1), + (hex!("01222222223333333344444444550000025a00000000000064c0"), 0x033c61), + (hex!("01222222223333333344444444550000025b00000000000025c0"), 0x033d01), + (hex!("01222222223333333344444444550000025b0000000000005c10"), 0x033da1), + (hex!("01222222223333333344444444550000025c00000000000025d0"), 0x033e41), + (hex!("01222222223333333344444444550000025c0000000000007d00"), 0x033ee1), + (hex!("01222222223333333344444444550000025d00000000000025e0"), 0x033f81), + (hex!("01222222223333333344444444550000025e00000000000025f0"), 0x034021), + (hex!("01222222223333333344444444550000025e00000000000045e0"), 0x0340c1), + (hex!("01222222223333333344444444550000025e0000000000006ee0"), 0x034161), + (hex!("01222222223333333344444444550000025f0000000000002600"), 0x034201), + (hex!("01222222223333333344444444550000025f00000000000050b0"), 0x0342a1), + (hex!("01222222223333333344444444550000025f0000000000007690"), 0x034341), + (hex!("0122222222333333334444444455000002600000000000002610"), 0x0343e1), + (hex!("0122222222333333334444444455000002600000000000007b60"), 0x034481), + (hex!("0122222222333333334444444455000002610000000000002620"), 0x034521), + (hex!("0122222222333333334444444455000002620000000000002630"), 0x0345c1), + (hex!("0122222222333333334444444455000002630000000000002640"), 0x034661), + (hex!("0122222222333333334444444455000002640000000000002650"), 0x034701), + (hex!("0122222222333333334444444455000002650000000000002660"), 0x0347a1), + 
(hex!("0122222222333333334444444455000002650000000000006180"), 0x034841), + (hex!("0122222222333333334444444455000002660000000000002670"), 0x0348e1), + (hex!("0122222222333333334444444455000002660000000000005430"), 0x034981), + (hex!("0122222222333333334444444455000002660000000000007a60"), 0x034a21), + (hex!("0122222222333333334444444455000002670000000000002680"), 0x034ac1), + (hex!("01222222223333333344444444550000026700000000000077f0"), 0x034b61), + (hex!("0122222222333333334444444455000002680000000000002690"), 0x034c01), + (hex!("01222222223333333344444444550000026900000000000026a0"), 0x034ca1), + (hex!("01222222223333333344444444550000026a00000000000026b0"), 0x034d41), + (hex!("01222222223333333344444444550000026a0000000000007530"), 0x034de1), + (hex!("01222222223333333344444444550000026b00000000000026c0"), 0x034e81), + (hex!("01222222223333333344444444550000026b00000000000058b0"), 0x034f21), + (hex!("01222222223333333344444444550000026b00000000000066b0"), 0x034fc1), + (hex!("01222222223333333344444444550000026b0000000000006b10"), 0x035061), + (hex!("01222222223333333344444444550000026c00000000000026d0"), 0x035101), + (hex!("01222222223333333344444444550000026d00000000000026e0"), 0x0351a1), + (hex!("01222222223333333344444444550000026d0000000000004210"), 0x035241), + (hex!("01222222223333333344444444550000026d0000000000005490"), 0x0352e1), + (hex!("01222222223333333344444444550000026d0000000000005e60"), 0x035381), + (hex!("01222222223333333344444444550000026d00000000000068e0"), 0x035421), + (hex!("01222222223333333344444444550000026d0000000000007020"), 0x0354c1), + (hex!("01222222223333333344444444550000026d0000000000007300"), 0x035561), + (hex!("01222222223333333344444444550000026e00000000000026f0"), 0x035601), + (hex!("01222222223333333344444444550000026f0000000000002700"), 0x0356a1), + (hex!("01222222223333333344444444550000026f0000000000004910"), 0x035741), + (hex!("0122222222333333334444444455000002700000000000002710"), 0x0357e1), + (hex!("0122222222333333334444444455000002710000000000002720"), 0x035881), + (hex!("01222222223333333344444444550000027100000000000050c0"), 0x035921), + (hex!("0122222222333333334444444455000002720000000000002730"), 0x0359c1), + (hex!("0122222222333333334444444455000002730000000000002740"), 0x035a61), + (hex!("0122222222333333334444444455000002740000000000002750"), 0x035b01), + (hex!("0122222222333333334444444455000002740000000000007490"), 0x035ba1), + (hex!("0122222222333333334444444455000002750000000000002760"), 0x035c41), + (hex!("0122222222333333334444444455000002760000000000002770"), 0x035ce1), + (hex!("0122222222333333334444444455000002760000000000004790"), 0x035d81), + (hex!("0122222222333333334444444455000002770000000000002780"), 0x035e21), + (hex!("01222222223333333344444444550000027700000000000050a0"), 0x035ec1), + (hex!("0122222222333333334444444455000002780000000000002790"), 0x035f61), + (hex!("0122222222333333334444444455000002780000000000004330"), 0x036001), + (hex!("0122222222333333334444444455000002780000000000006b00"), 0x0360a1), + (hex!("01222222223333333344444444550000027900000000000027a0"), 0x036141), + (hex!("01222222223333333344444444550000027a00000000000027b0"), 0x0361e1), + (hex!("01222222223333333344444444550000027b00000000000027c0"), 0x036281), + (hex!("01222222223333333344444444550000027b0000000000004930"), 0x036321), + (hex!("01222222223333333344444444550000027b0000000000006250"), 0x0363c1), + (hex!("01222222223333333344444444550000027c00000000000027d0"), 0x036461), + 
(hex!("01222222223333333344444444550000027d00000000000027e0"), 0x036501), + (hex!("01222222223333333344444444550000027d0000000000005ce0"), 0x0365a1), + (hex!("01222222223333333344444444550000027d0000000000005fe0"), 0x036641), + (hex!("01222222223333333344444444550000027e00000000000027f0"), 0x0366e1), + (hex!("01222222223333333344444444550000027f0000000000002800"), 0x036781), + (hex!("01222222223333333344444444550000027f0000000000003e90"), 0x036821), + (hex!("01222222223333333344444444550000027f0000000000007910"), 0x0368c1), + (hex!("0122222222333333334444444455000002800000000000002810"), 0x036961), + (hex!("0122222222333333334444444455000002800000000000004990"), 0x036a01), + (hex!("0122222222333333334444444455000002800000000000006160"), 0x036aa1), + (hex!("0122222222333333334444444455000002800000000000006740"), 0x036b41), + (hex!("0122222222333333334444444455000002810000000000002820"), 0x036be1), + (hex!("0122222222333333334444444455000002820000000000002830"), 0x036c81), + (hex!("0122222222333333334444444455000002820000000000005170"), 0x036d21), + (hex!("0122222222333333334444444455000002830000000000002840"), 0x036dc1), + (hex!("0122222222333333334444444455000002840000000000002850"), 0x036e61), + (hex!("0122222222333333334444444455000002840000000000004810"), 0x036f01), + (hex!("0122222222333333334444444455000002840000000000006aa0"), 0x036fa1), + (hex!("0122222222333333334444444455000002850000000000002860"), 0x037041), + (hex!("0122222222333333334444444455000002860000000000002870"), 0x0370e1), + (hex!("0122222222333333334444444455000002860000000000005080"), 0x037181), + (hex!("0122222222333333334444444455000002870000000000002880"), 0x037221), + (hex!("0122222222333333334444444455000002870000000000004e60"), 0x0372c1), + (hex!("0122222222333333334444444455000002880000000000002890"), 0x037361), + (hex!("0122222222333333334444444455000002880000000000005060"), 0x037401), + (hex!("0122222222333333334444444455000002880000000000006f20"), 0x0374a1), + (hex!("01222222223333333344444444550000028900000000000028a0"), 0x037541), + (hex!("01222222223333333344444444550000028900000000000047e0"), 0x0375e1), + (hex!("01222222223333333344444444550000028a00000000000028b0"), 0x037681), + (hex!("01222222223333333344444444550000028a0000000000005ab0"), 0x037721), + (hex!("01222222223333333344444444550000028a0000000000007130"), 0x0377c1), + (hex!("01222222223333333344444444550000028a0000000000007660"), 0x037861), + (hex!("01222222223333333344444444550000028b00000000000028c0"), 0x037901), + (hex!("01222222223333333344444444550000028b00000000000054e0"), 0x0379a1), + (hex!("01222222223333333344444444550000028c00000000000028d0"), 0x037a41), + (hex!("01222222223333333344444444550000028c00000000000046f0"), 0x037ae1), + (hex!("01222222223333333344444444550000028c00000000000061a0"), 0x037b81), + (hex!("01222222223333333344444444550000028d00000000000028e0"), 0x037c21), + (hex!("01222222223333333344444444550000028e00000000000028f0"), 0x037cc1), + (hex!("01222222223333333344444444550000028e0000000000004130"), 0x037d61), + (hex!("01222222223333333344444444550000028f0000000000002900"), 0x037e01), + (hex!("01222222223333333344444444550000028f0000000000007510"), 0x037ea1), + (hex!("0122222222333333334444444455000002900000000000002910"), 0x037f41), + (hex!("0122222222333333334444444455000002900000000000004a40"), 0x037fe1), + (hex!("0122222222333333334444444455000002910000000000002920"), 0x038081), + (hex!("0122222222333333334444444455000002920000000000002930"), 0x038121), + 
(hex!("0122222222333333334444444455000002920000000000004e90"), 0x0381c1), + (hex!("0122222222333333334444444455000002930000000000002940"), 0x038261), + (hex!("0122222222333333334444444455000002930000000000006880"), 0x038301), + (hex!("0122222222333333334444444455000002940000000000002950"), 0x0383a1), + (hex!("0122222222333333334444444455000002940000000000007bc0"), 0x038441), + (hex!("0122222222333333334444444455000002950000000000002960"), 0x0384e1), + (hex!("0122222222333333334444444455000002960000000000002970"), 0x038581), + (hex!("01222222223333333344444444550000029600000000000059d0"), 0x038621), + (hex!("0122222222333333334444444455000002970000000000002980"), 0x0386c1), + (hex!("0122222222333333334444444455000002970000000000004a50"), 0x038761), + (hex!("0122222222333333334444444455000002970000000000005f20"), 0x038801), + (hex!("01222222223333333344444444550000029700000000000068d0"), 0x0388a1), + (hex!("0122222222333333334444444455000002980000000000002990"), 0x038941), + (hex!("0122222222333333334444444455000002980000000000004370"), 0x0389e1), + (hex!("0122222222333333334444444455000002980000000000004420"), 0x038a81), + (hex!("01222222223333333344444444550000029900000000000029a0"), 0x038b21), + (hex!("01222222223333333344444444550000029a00000000000029b0"), 0x038bc1), + (hex!("01222222223333333344444444550000029a0000000000006010"), 0x038c61), + (hex!("01222222223333333344444444550000029a0000000000006980"), 0x038d01), + (hex!("01222222223333333344444444550000029b00000000000029c0"), 0x038da1), + (hex!("01222222223333333344444444550000029c00000000000029d0"), 0x038e41), + (hex!("01222222223333333344444444550000029c0000000000007480"), 0x038ee1), + (hex!("01222222223333333344444444550000029d00000000000029e0"), 0x038f81), + (hex!("01222222223333333344444444550000029d0000000000005030"), 0x039021), + (hex!("01222222223333333344444444550000029d0000000000007780"), 0x0390c1), + (hex!("01222222223333333344444444550000029d0000000000007a50"), 0x039161), + (hex!("01222222223333333344444444550000029e00000000000029f0"), 0x039201), + (hex!("01222222223333333344444444550000029e00000000000074b0"), 0x0392a1), + (hex!("01222222223333333344444444550000029f0000000000002a00"), 0x039341), + (hex!("0122222222333333334444444455000002a00000000000002a10"), 0x0393e1), + (hex!("0122222222333333334444444455000002a10000000000002a20"), 0x039481), + (hex!("0122222222333333334444444455000002a20000000000002a30"), 0x039521), + (hex!("0122222222333333334444444455000002a20000000000004c50"), 0x0395c1), + (hex!("0122222222333333334444444455000002a20000000000006f10"), 0x039661), + (hex!("0122222222333333334444444455000002a30000000000002a40"), 0x039701), + (hex!("0122222222333333334444444455000002a40000000000002a50"), 0x0397a1), + (hex!("0122222222333333334444444455000002a40000000000005d60"), 0x039841), + (hex!("0122222222333333334444444455000002a50000000000002a60"), 0x0398e1), + (hex!("0122222222333333334444444455000002a50000000000005440"), 0x039981), + (hex!("0122222222333333334444444455000002a50000000000005890"), 0x039a21), + (hex!("0122222222333333334444444455000002a60000000000002a70"), 0x039ac1), + (hex!("0122222222333333334444444455000002a70000000000002a80"), 0x039b61), + (hex!("0122222222333333334444444455000002a700000000000054a0"), 0x039c01), + (hex!("0122222222333333334444444455000002a70000000000007280"), 0x039ca1), + (hex!("0122222222333333334444444455000002a80000000000002a90"), 0x039d41), + (hex!("0122222222333333334444444455000002a90000000000002aa0"), 0x039de1), + 
(hex!("0122222222333333334444444455000002aa0000000000002ab0"), 0x039e81), + (hex!("0122222222333333334444444455000002ab0000000000002ac0"), 0x039f21), + (hex!("0122222222333333334444444455000002ab0000000000006c90"), 0x039fc1), + (hex!("0122222222333333334444444455000002ac0000000000002ad0"), 0x03a061), + (hex!("0122222222333333334444444455000002ac0000000000006db0"), 0x03a101), + (hex!("0122222222333333334444444455000002ad0000000000002ae0"), 0x03a1a1), + (hex!("0122222222333333334444444455000002ad00000000000065e0"), 0x03a241), + (hex!("0122222222333333334444444455000002ad0000000000007b40"), 0x03a2e1), + (hex!("0122222222333333334444444455000002ae0000000000002af0"), 0x03a381), + (hex!("0122222222333333334444444455000002ae0000000000004d20"), 0x03a421), + (hex!("0122222222333333334444444455000002ae0000000000006f30"), 0x03a4c1), + (hex!("0122222222333333334444444455000002af0000000000002b00"), 0x03a561), + (hex!("0122222222333333334444444455000002b00000000000002b10"), 0x03a601), + (hex!("0122222222333333334444444455000002b00000000000004560"), 0x03a6a1), + (hex!("0122222222333333334444444455000002b00000000000005800"), 0x03a741), + (hex!("0122222222333333334444444455000002b00000000000005a60"), 0x03a7e1), + (hex!("0122222222333333334444444455000002b10000000000002b20"), 0x03a881), + (hex!("0122222222333333334444444455000002b10000000000007b30"), 0x03a921), + (hex!("0122222222333333334444444455000002b20000000000002b30"), 0x03a9c1), + (hex!("0122222222333333334444444455000002b20000000000004440"), 0x03aa61), + (hex!("0122222222333333334444444455000002b20000000000004f80"), 0x03ab01), + (hex!("0122222222333333334444444455000002b20000000000005020"), 0x03aba1), + (hex!("0122222222333333334444444455000002b30000000000002b40"), 0x03ac41), + (hex!("0122222222333333334444444455000002b40000000000002b50"), 0x03ace1), + (hex!("0122222222333333334444444455000002b50000000000002b60"), 0x03ad81), + (hex!("0122222222333333334444444455000002b500000000000059e0"), 0x03ae21), + (hex!("0122222222333333334444444455000002b60000000000002b70"), 0x03aec1), + (hex!("0122222222333333334444444455000002b70000000000002b80"), 0x03af61), + (hex!("0122222222333333334444444455000002b80000000000002b90"), 0x03b001), + (hex!("0122222222333333334444444455000002b80000000000004590"), 0x03b0a1), + (hex!("0122222222333333334444444455000002b800000000000047d0"), 0x03b141), + (hex!("0122222222333333334444444455000002b80000000000006030"), 0x03b1e1), + (hex!("0122222222333333334444444455000002b80000000000006a20"), 0x03b281), + (hex!("0122222222333333334444444455000002b80000000000006a90"), 0x03b321), + (hex!("0122222222333333334444444455000002b90000000000002ba0"), 0x03b3c1), + (hex!("0122222222333333334444444455000002ba0000000000002bb0"), 0x03b461), + (hex!("0122222222333333334444444455000002ba0000000000006e80"), 0x03b501), + (hex!("0122222222333333334444444455000002bb0000000000002bc0"), 0x03b5a1), + (hex!("0122222222333333334444444455000002bc0000000000002bd0"), 0x03b641), + (hex!("0122222222333333334444444455000002bc0000000000004b30"), 0x03b6e1), + (hex!("0122222222333333334444444455000002bd0000000000002be0"), 0x03b781), + (hex!("0122222222333333334444444455000002bd0000000000005e10"), 0x03b821), + (hex!("0122222222333333334444444455000002be0000000000002bf0"), 0x03b8c1), + (hex!("0122222222333333334444444455000002bf0000000000002c00"), 0x03b961), + (hex!("0122222222333333334444444455000002c00000000000002c10"), 0x03ba01), + (hex!("0122222222333333334444444455000002c10000000000002c20"), 0x03baa1), + 
(hex!("0122222222333333334444444455000002c10000000000003ef0"), 0x03bb41), + (hex!("0122222222333333334444444455000002c20000000000002c30"), 0x03bbe1), + (hex!("0122222222333333334444444455000002c200000000000056e0"), 0x03bc81), + (hex!("0122222222333333334444444455000002c30000000000002c40"), 0x03bd21), + (hex!("0122222222333333334444444455000002c30000000000004b60"), 0x03bdc1), + (hex!("0122222222333333334444444455000002c40000000000002c50"), 0x03be61), + (hex!("0122222222333333334444444455000002c400000000000045f0"), 0x03bf01), + (hex!("0122222222333333334444444455000002c40000000000005290"), 0x03bfa1), + (hex!("0122222222333333334444444455000002c50000000000002c60"), 0x03c041), + (hex!("0122222222333333334444444455000002c60000000000002c70"), 0x03c0e1), + (hex!("0122222222333333334444444455000002c60000000000006ae0"), 0x03c181), + (hex!("0122222222333333334444444455000002c70000000000002c80"), 0x03c221), + (hex!("0122222222333333334444444455000002c70000000000005680"), 0x03c2c1), + (hex!("0122222222333333334444444455000002c70000000000006e10"), 0x03c361), + (hex!("0122222222333333334444444455000002c80000000000002c90"), 0x03c401), + (hex!("0122222222333333334444444455000002c90000000000002ca0"), 0x03c4a1), + (hex!("0122222222333333334444444455000002ca0000000000002cb0"), 0x03c541), + (hex!("0122222222333333334444444455000002cb0000000000002cc0"), 0x03c5e1), + (hex!("0122222222333333334444444455000002cc0000000000002cd0"), 0x03c681), + (hex!("0122222222333333334444444455000002cc0000000000005b50"), 0x03c721), + (hex!("0122222222333333334444444455000002cd0000000000002ce0"), 0x03c7c1), + (hex!("0122222222333333334444444455000002ce0000000000002cf0"), 0x03c861), + (hex!("0122222222333333334444444455000002ce00000000000043f0"), 0x03c901), + (hex!("0122222222333333334444444455000002ce0000000000006420"), 0x03c9a1), + (hex!("0122222222333333334444444455000002cf0000000000002d00"), 0x03ca41), + (hex!("0122222222333333334444444455000002d00000000000002d10"), 0x03cae1), + (hex!("0122222222333333334444444455000002d10000000000002d20"), 0x03cb81), + (hex!("0122222222333333334444444455000002d10000000000005370"), 0x03cc21), + (hex!("0122222222333333334444444455000002d20000000000002d30"), 0x03ccc1), + (hex!("0122222222333333334444444455000002d20000000000005ef0"), 0x03cd61), + (hex!("0122222222333333334444444455000002d20000000000006570"), 0x03ce01), + (hex!("0122222222333333334444444455000002d30000000000002d40"), 0x03cea1), + (hex!("0122222222333333334444444455000002d30000000000007360"), 0x03cf41), + (hex!("0122222222333333334444444455000002d40000000000002d50"), 0x03cfe1), + (hex!("0122222222333333334444444455000002d400000000000079a0"), 0x03d081), + (hex!("0122222222333333334444444455000002d50000000000002d60"), 0x03d121), + (hex!("0122222222333333334444444455000002d50000000000004250"), 0x03d1c1), + (hex!("0122222222333333334444444455000002d50000000000006050"), 0x03d261), + (hex!("0122222222333333334444444455000002d60000000000002d70"), 0x03d301), + (hex!("0122222222333333334444444455000002d60000000000007080"), 0x03d3a1), + (hex!("0122222222333333334444444455000002d70000000000002d80"), 0x03d441), + (hex!("0122222222333333334444444455000002d80000000000002d90"), 0x03d4e1), + (hex!("0122222222333333334444444455000002d80000000000007110"), 0x03d581), + (hex!("0122222222333333334444444455000002d800000000000073c0"), 0x03d621), + (hex!("0122222222333333334444444455000002d800000000000075a0"), 0x03d6c1), + (hex!("0122222222333333334444444455000002d90000000000002da0"), 0x03d761), + 
(hex!("0122222222333333334444444455000002d90000000000004860"), 0x03d801), + (hex!("0122222222333333334444444455000002d90000000000006b60"), 0x03d8a1), + (hex!("0122222222333333334444444455000002da0000000000002db0"), 0x03d941), + (hex!("0122222222333333334444444455000002da0000000000006630"), 0x03d9e1), + (hex!("0122222222333333334444444455000002db0000000000002dc0"), 0x03da81), + (hex!("0122222222333333334444444455000002dc0000000000002dd0"), 0x03db21), + (hex!("0122222222333333334444444455000002dc0000000000004830"), 0x03dbc1), + (hex!("0122222222333333334444444455000002dd0000000000002de0"), 0x03dc61), + (hex!("0122222222333333334444444455000002de0000000000002df0"), 0x03dd01), + (hex!("0122222222333333334444444455000002de0000000000004f00"), 0x03dda1), + (hex!("0122222222333333334444444455000002df0000000000002e00"), 0x03de41), + (hex!("0122222222333333334444444455000002e00000000000002e10"), 0x03dee1), + (hex!("0122222222333333334444444455000002e10000000000002e20"), 0x03df81), + (hex!("0122222222333333334444444455000002e10000000000006e90"), 0x03e021), + (hex!("0122222222333333334444444455000002e20000000000002e30"), 0x03e0c1), + (hex!("0122222222333333334444444455000002e200000000000053e0"), 0x03e161), + (hex!("0122222222333333334444444455000002e30000000000002e40"), 0x03e201), + (hex!("0122222222333333334444444455000002e30000000000006020"), 0x03e2a1), + (hex!("0122222222333333334444444455000002e30000000000006540"), 0x03e341), + (hex!("0122222222333333334444444455000002e40000000000002e50"), 0x03e3e1), + (hex!("0122222222333333334444444455000002e50000000000002e60"), 0x03e481), + (hex!("0122222222333333334444444455000002e50000000000005180"), 0x03e521), + (hex!("0122222222333333334444444455000002e50000000000007bf0"), 0x03e5c1), + (hex!("0122222222333333334444444455000002e60000000000002e70"), 0x03e661), + (hex!("0122222222333333334444444455000002e60000000000005350"), 0x03e701), + (hex!("0122222222333333334444444455000002e60000000000007960"), 0x03e7a1), + (hex!("0122222222333333334444444455000002e70000000000002e80"), 0x03e841), + (hex!("0122222222333333334444444455000002e80000000000002e90"), 0x03e8e1), + (hex!("0122222222333333334444444455000002e90000000000002ea0"), 0x03e981), + (hex!("0122222222333333334444444455000002ea0000000000002eb0"), 0x03ea21), + (hex!("0122222222333333334444444455000002eb0000000000002ec0"), 0x03eac1), + (hex!("0122222222333333334444444455000002ec0000000000002ed0"), 0x03eb61), + (hex!("0122222222333333334444444455000002ec0000000000006c10"), 0x03ec01), + (hex!("0122222222333333334444444455000002ed0000000000002ee0"), 0x03eca1), + (hex!("0122222222333333334444444455000002ed0000000000005590"), 0x03ed41), + (hex!("0122222222333333334444444455000002ed0000000000005cd0"), 0x03ede1), + (hex!("0122222222333333334444444455000002ed0000000000006910"), 0x03ee81), + (hex!("0122222222333333334444444455000002ee0000000000002ef0"), 0x03ef21), + (hex!("0122222222333333334444444455000002ef0000000000002f00"), 0x03efc1), + (hex!("0122222222333333334444444455000002ef0000000000004ed0"), 0x03f061), + (hex!("0122222222333333334444444455000002f00000000000002f10"), 0x03f101), + (hex!("0122222222333333334444444455000002f00000000000004cf0"), 0x03f1a1), + (hex!("0122222222333333334444444455000002f00000000000005d10"), 0x03f241), + (hex!("0122222222333333334444444455000002f00000000000006860"), 0x03f2e1), + (hex!("0122222222333333334444444455000002f00000000000006b50"), 0x03f381), + (hex!("0122222222333333334444444455000002f00000000000007100"), 0x03f421), + 
(hex!("0122222222333333334444444455000002f00000000000007aa0"), 0x03f4c1), + (hex!("0122222222333333334444444455000002f10000000000002f20"), 0x03f561), + (hex!("0122222222333333334444444455000002f20000000000002f30"), 0x03f601), + (hex!("0122222222333333334444444455000002f200000000000044b0"), 0x03f6a1), + (hex!("0122222222333333334444444455000002f30000000000002f40"), 0x03f741), + (hex!("0122222222333333334444444455000002f300000000000075b0"), 0x03f7e1), + (hex!("0122222222333333334444444455000002f40000000000002f50"), 0x03f881), + (hex!("0122222222333333334444444455000002f400000000000060f0"), 0x03f921), + (hex!("0122222222333333334444444455000002f50000000000002f60"), 0x03f9c1), + (hex!("0122222222333333334444444455000002f50000000000007210"), 0x03fa61), + (hex!("0122222222333333334444444455000002f60000000000002f70"), 0x03fb01), + (hex!("0122222222333333334444444455000002f60000000000006610"), 0x03fba1), + (hex!("0122222222333333334444444455000002f70000000000002f80"), 0x03fc41), + (hex!("0122222222333333334444444455000002f70000000000007560"), 0x03fce1), + (hex!("0122222222333333334444444455000002f80000000000002f90"), 0x03fd81), + (hex!("0122222222333333334444444455000002f80000000000006320"), 0x03fe21), + (hex!("0122222222333333334444444455000002f90000000000002fa0"), 0x03fec1), + (hex!("0122222222333333334444444455000002f90000000000006e50"), 0x03ff61), + (hex!("0122222222333333334444444455000002fa0000000000002fb0"), 0x040001), + (hex!("0122222222333333334444444455000002fb0000000000002fc0"), 0x0400a1), + (hex!("0122222222333333334444444455000002fb0000000000004780"), 0x040141), + (hex!("0122222222333333334444444455000002fc0000000000002fd0"), 0x0401e1), + (hex!("0122222222333333334444444455000002fd0000000000002fe0"), 0x040281), + (hex!("0122222222333333334444444455000002fd0000000000005600"), 0x040321), + (hex!("0122222222333333334444444455000002fd0000000000006c00"), 0x0403c1), + (hex!("0122222222333333334444444455000002fe0000000000002ff0"), 0x040461), + (hex!("0122222222333333334444444455000002ff0000000000003000"), 0x040501), + (hex!("0122222222333333334444444455000003000000000000003010"), 0x0405a1), + (hex!("0122222222333333334444444455000003000000000000004080"), 0x040641), + (hex!("0122222222333333334444444455000003010000000000003020"), 0x0406e1), + (hex!("0122222222333333334444444455000003010000000000006340"), 0x040781), + (hex!("0122222222333333334444444455000003020000000000003030"), 0x040821), + (hex!("0122222222333333334444444455000003020000000000005b00"), 0x0408c1), + (hex!("0122222222333333334444444455000003020000000000007b20"), 0x040961), + (hex!("0122222222333333334444444455000003030000000000003040"), 0x040a01), + (hex!("01222222223333333344444444550000030300000000000056b0"), 0x040aa1), + (hex!("0122222222333333334444444455000003030000000000006280"), 0x040b41), + (hex!("0122222222333333334444444455000003030000000000007ad0"), 0x040be1), + (hex!("0122222222333333334444444455000003040000000000003050"), 0x040c81), + (hex!("0122222222333333334444444455000003040000000000005c50"), 0x040d21), + (hex!("0122222222333333334444444455000003050000000000003060"), 0x040dc1), + (hex!("01222222223333333344444444550000030500000000000072e0"), 0x040e61), + (hex!("0122222222333333334444444455000003060000000000003070"), 0x040f01), + (hex!("0122222222333333334444444455000003060000000000004360"), 0x040fa1), + (hex!("0122222222333333334444444455000003060000000000004380"), 0x041041), + (hex!("0122222222333333334444444455000003060000000000004820"), 0x0410e1), + 
(hex!("0122222222333333334444444455000003060000000000006d10"), 0x041181), + (hex!("0122222222333333334444444455000003070000000000003080"), 0x041221), + (hex!("0122222222333333334444444455000003070000000000004450"), 0x0412c1), + (hex!("0122222222333333334444444455000003080000000000003090"), 0x041361), + (hex!("0122222222333333334444444455000003080000000000005ad0"), 0x041401), + (hex!("01222222223333333344444444550000030900000000000030a0"), 0x0414a1), + (hex!("01222222223333333344444444550000030a00000000000030b0"), 0x041541), + (hex!("01222222223333333344444444550000030a0000000000007760"), 0x0415e1), + (hex!("01222222223333333344444444550000030b00000000000030c0"), 0x041681), + (hex!("01222222223333333344444444550000030b0000000000007a80"), 0x041721), + (hex!("01222222223333333344444444550000030c00000000000030d0"), 0x0417c1), + (hex!("01222222223333333344444444550000030d00000000000030e0"), 0x041861), + (hex!("01222222223333333344444444550000030d0000000000003eb0"), 0x041901), + (hex!("01222222223333333344444444550000030e00000000000030f0"), 0x0419a1), + (hex!("01222222223333333344444444550000030f0000000000003100"), 0x041a41), + (hex!("01222222223333333344444444550000030f0000000000004690"), 0x041ae1), + (hex!("01222222223333333344444444550000030f0000000000006900"), 0x041b81), + (hex!("0122222222333333334444444455000003100000000000003110"), 0x041c21), + (hex!("01222222223333333344444444550000031000000000000058a0"), 0x041cc1), + (hex!("0122222222333333334444444455000003110000000000003120"), 0x041d61), + (hex!("0122222222333333334444444455000003110000000000004200"), 0x041e01), + (hex!("0122222222333333334444444455000003120000000000003130"), 0x041ea1), + (hex!("0122222222333333334444444455000003130000000000003140"), 0x041f41), + (hex!("0122222222333333334444444455000003130000000000004d50"), 0x041fe1), + (hex!("0122222222333333334444444455000003130000000000005400"), 0x042081), + (hex!("0122222222333333334444444455000003130000000000005520"), 0x042121), + (hex!("0122222222333333334444444455000003140000000000003150"), 0x0421c1), + (hex!("0122222222333333334444444455000003140000000000006450"), 0x042261), + (hex!("0122222222333333334444444455000003150000000000003160"), 0x042301), + (hex!("01222222223333333344444444550000031500000000000062d0"), 0x0423a1), + (hex!("0122222222333333334444444455000003160000000000003170"), 0x042441), + (hex!("0122222222333333334444444455000003160000000000004c40"), 0x0424e1), + (hex!("0122222222333333334444444455000003160000000000007c80"), 0x042581), + (hex!("0122222222333333334444444455000003170000000000003180"), 0x042621), + (hex!("0122222222333333334444444455000003170000000000004400"), 0x0426c1), + (hex!("0122222222333333334444444455000003170000000000005090"), 0x042761), + (hex!("0122222222333333334444444455000003170000000000006cb0"), 0x042801), + (hex!("0122222222333333334444444455000003180000000000003190"), 0x0428a1), + (hex!("0122222222333333334444444455000003180000000000006560"), 0x042941), + (hex!("01222222223333333344444444550000031900000000000031a0"), 0x0429e1), + (hex!("01222222223333333344444444550000031900000000000052d0"), 0x042a81), + (hex!("01222222223333333344444444550000031900000000000057e0"), 0x042b21), + (hex!("01222222223333333344444444550000031a00000000000031b0"), 0x042bc1), + (hex!("01222222223333333344444444550000031a00000000000071e0"), 0x042c61), + (hex!("01222222223333333344444444550000031b00000000000031c0"), 0x042d01), + (hex!("01222222223333333344444444550000031c00000000000031d0"), 0x042da1), + 
(hex!("01222222223333333344444444550000031c0000000000004480"), 0x042e41), + (hex!("01222222223333333344444444550000031c0000000000005790"), 0x042ee1), + (hex!("01222222223333333344444444550000031c0000000000007be0"), 0x042f81), + (hex!("01222222223333333344444444550000031d00000000000031e0"), 0x043021), + (hex!("01222222223333333344444444550000031d0000000000005560"), 0x0430c1), + (hex!("01222222223333333344444444550000031e00000000000031f0"), 0x043161), + (hex!("01222222223333333344444444550000031f0000000000003200"), 0x043201), + (hex!("01222222223333333344444444550000031f0000000000004190"), 0x0432a1), + (hex!("0122222222333333334444444455000003200000000000003210"), 0x043341), + (hex!("0122222222333333334444444455000003210000000000003220"), 0x0433e1), + (hex!("0122222222333333334444444455000003220000000000003230"), 0x043481), + (hex!("0122222222333333334444444455000003230000000000003240"), 0x043521), + (hex!("01222222223333333344444444550000032300000000000069d0"), 0x0435c1), + (hex!("0122222222333333334444444455000003240000000000003250"), 0x043661), + (hex!("0122222222333333334444444455000003250000000000003260"), 0x043701), + (hex!("01222222223333333344444444550000032500000000000042b0"), 0x0437a1), + (hex!("01222222223333333344444444550000032500000000000064e0"), 0x043841), + (hex!("0122222222333333334444444455000003260000000000003270"), 0x0438e1), + (hex!("0122222222333333334444444455000003270000000000003280"), 0x043981), + (hex!("0122222222333333334444444455000003270000000000005b20"), 0x043a21), + (hex!("0122222222333333334444444455000003270000000000006330"), 0x043ac1), + (hex!("0122222222333333334444444455000003270000000000006810"), 0x043b61), + (hex!("0122222222333333334444444455000003280000000000003290"), 0x043c01), + (hex!("01222222223333333344444444550000032900000000000032a0"), 0x043ca1), + (hex!("01222222223333333344444444550000032900000000000056f0"), 0x043d41), + (hex!("0122222222333333334444444455000003290000000000005e20"), 0x043de1), + (hex!("0122222222333333334444444455000003290000000000005e70"), 0x043e81), + (hex!("01222222223333333344444444550000032a00000000000032b0"), 0x043f21), + (hex!("01222222223333333344444444550000032b00000000000032c0"), 0x043fc1), + (hex!("01222222223333333344444444550000032b0000000000005500"), 0x044061), + (hex!("01222222223333333344444444550000032b0000000000005a20"), 0x044101), + (hex!("01222222223333333344444444550000032c00000000000032d0"), 0x0441a1), + (hex!("01222222223333333344444444550000032c0000000000004060"), 0x044241), + (hex!("01222222223333333344444444550000032c0000000000004760"), 0x0442e1), + (hex!("01222222223333333344444444550000032d00000000000032e0"), 0x044381), + (hex!("01222222223333333344444444550000032d00000000000068a0"), 0x044421), + (hex!("01222222223333333344444444550000032e00000000000032f0"), 0x0444c1), + (hex!("01222222223333333344444444550000032f0000000000003300"), 0x044561), + (hex!("0122222222333333334444444455000003300000000000003310"), 0x044601), + (hex!("0122222222333333334444444455000003300000000000006e40"), 0x0446a1), + (hex!("0122222222333333334444444455000003310000000000003320"), 0x044741), + (hex!("0122222222333333334444444455000003310000000000004620"), 0x0447e1), + (hex!("0122222222333333334444444455000003320000000000003330"), 0x044881), + (hex!("0122222222333333334444444455000003330000000000003340"), 0x044921), + (hex!("0122222222333333334444444455000003330000000000004b80"), 0x0449c1), + (hex!("0122222222333333334444444455000003340000000000003350"), 0x044a61), + 
(hex!("0122222222333333334444444455000003350000000000003360"), 0x044b01), + (hex!("0122222222333333334444444455000003360000000000003370"), 0x044ba1), + (hex!("0122222222333333334444444455000003370000000000003380"), 0x044c41), + (hex!("0122222222333333334444444455000003380000000000003390"), 0x044ce1), + (hex!("01222222223333333344444444550000033900000000000033a0"), 0x044d81), + (hex!("0122222222333333334444444455000003390000000000006b90"), 0x044e21), + (hex!("01222222223333333344444444550000033a00000000000033b0"), 0x044ec1), + (hex!("01222222223333333344444444550000033a0000000000007420"), 0x044f61), + (hex!("01222222223333333344444444550000033b00000000000033c0"), 0x045001), + (hex!("01222222223333333344444444550000033b0000000000007620"), 0x0450a1), + (hex!("01222222223333333344444444550000033c00000000000033d0"), 0x045141), + (hex!("01222222223333333344444444550000033c0000000000006b30"), 0x0451e1), + (hex!("01222222223333333344444444550000033d00000000000033e0"), 0x045281), + (hex!("01222222223333333344444444550000033e00000000000033f0"), 0x045321), + (hex!("01222222223333333344444444550000033e00000000000048b0"), 0x0453c1), + (hex!("01222222223333333344444444550000033e0000000000004e70"), 0x045461), + (hex!("01222222223333333344444444550000033f0000000000003400"), 0x045501), + (hex!("01222222223333333344444444550000033f0000000000006380"), 0x0455a1), + (hex!("0122222222333333334444444455000003400000000000003410"), 0x045641), + (hex!("0122222222333333334444444455000003410000000000003420"), 0x0456e1), + (hex!("0122222222333333334444444455000003410000000000006090"), 0x045781), + (hex!("0122222222333333334444444455000003420000000000003430"), 0x045821), + (hex!("01222222223333333344444444550000034200000000000073d0"), 0x0458c1), + (hex!("0122222222333333334444444455000003430000000000003440"), 0x045961), + (hex!("0122222222333333334444444455000003430000000000006370"), 0x045a01), + (hex!("01222222223333333344444444550000034300000000000075c0"), 0x045aa1), + (hex!("0122222222333333334444444455000003440000000000003450"), 0x045b41), + (hex!("0122222222333333334444444455000003450000000000003460"), 0x045be1), + (hex!("0122222222333333334444444455000003460000000000003470"), 0x045c81), + (hex!("01222222223333333344444444550000034600000000000055f0"), 0x045d21), + (hex!("0122222222333333334444444455000003470000000000003480"), 0x045dc1), + (hex!("0122222222333333334444444455000003470000000000003fe0"), 0x045e61), + (hex!("0122222222333333334444444455000003480000000000003490"), 0x045f01), + (hex!("0122222222333333334444444455000003480000000000007990"), 0x045fa1), + (hex!("01222222223333333344444444550000034900000000000034a0"), 0x046041), + (hex!("0122222222333333334444444455000003490000000000004410"), 0x0460e1), + (hex!("01222222223333333344444444550000034a00000000000034b0"), 0x046181), + (hex!("01222222223333333344444444550000034a00000000000062a0"), 0x046221), + (hex!("01222222223333333344444444550000034a0000000000007260"), 0x0462c1), + (hex!("01222222223333333344444444550000034b00000000000034c0"), 0x046361), + (hex!("01222222223333333344444444550000034b0000000000005760"), 0x046401), + (hex!("01222222223333333344444444550000034b0000000000006200"), 0x0464a1), + (hex!("01222222223333333344444444550000034c00000000000034d0"), 0x046541), + (hex!("01222222223333333344444444550000034d00000000000034e0"), 0x0465e1), + (hex!("01222222223333333344444444550000034e00000000000034f0"), 0x046681), + (hex!("01222222223333333344444444550000034e0000000000007790"), 0x046721), + 
(hex!("01222222223333333344444444550000034f0000000000003500"), 0x0467c1), + (hex!("0122222222333333334444444455000003500000000000003510"), 0x046861), + (hex!("0122222222333333334444444455000003510000000000003520"), 0x046901), + (hex!("0122222222333333334444444455000003520000000000003530"), 0x0469a1), + (hex!("01222222223333333344444444550000035200000000000056a0"), 0x046a41), + (hex!("0122222222333333334444444455000003530000000000003540"), 0x046ae1), + (hex!("0122222222333333334444444455000003540000000000003550"), 0x046b81), + (hex!("01222222223333333344444444550000035400000000000047b0"), 0x046c21), + (hex!("0122222222333333334444444455000003550000000000003560"), 0x046cc1), + (hex!("0122222222333333334444444455000003550000000000004500"), 0x046d61), + (hex!("0122222222333333334444444455000003560000000000003570"), 0x046e01), + (hex!("0122222222333333334444444455000003560000000000004fc0"), 0x046ea1), + (hex!("0122222222333333334444444455000003560000000000007160"), 0x046f41), + (hex!("0122222222333333334444444455000003560000000000007400"), 0x046fe1), + (hex!("0122222222333333334444444455000003570000000000003580"), 0x047081), + (hex!("0122222222333333334444444455000003580000000000003590"), 0x047121), + (hex!("0122222222333333334444444455000003580000000000005a80"), 0x0471c1), + (hex!("01222222223333333344444444550000035900000000000035a0"), 0x047261), + (hex!("01222222223333333344444444550000035900000000000073b0"), 0x047301), + (hex!("01222222223333333344444444550000035a00000000000035b0"), 0x0473a1), + (hex!("01222222223333333344444444550000035a0000000000004c20"), 0x047441), + (hex!("01222222223333333344444444550000035b00000000000035c0"), 0x0474e1), + (hex!("01222222223333333344444444550000035b0000000000005120"), 0x047581), + (hex!("01222222223333333344444444550000035c00000000000035d0"), 0x047621), + (hex!("01222222223333333344444444550000035c0000000000004300"), 0x0476c1), + (hex!("01222222223333333344444444550000035c0000000000005a40"), 0x047761), + (hex!("01222222223333333344444444550000035c0000000000006620"), 0x047801), + (hex!("01222222223333333344444444550000035c0000000000006ed0"), 0x0478a1), + (hex!("01222222223333333344444444550000035d00000000000035e0"), 0x047941), + (hex!("01222222223333333344444444550000035d0000000000005df0"), 0x0479e1), + (hex!("01222222223333333344444444550000035e00000000000035f0"), 0x047a81), + (hex!("01222222223333333344444444550000035f0000000000003600"), 0x047b21), + (hex!("01222222223333333344444444550000035f00000000000058d0"), 0x047bc1), + (hex!("0122222222333333334444444455000003600000000000003610"), 0x047c61), + (hex!("0122222222333333334444444455000003600000000000007b90"), 0x047d01), + (hex!("0122222222333333334444444455000003610000000000003620"), 0x047da1), + (hex!("0122222222333333334444444455000003610000000000006ad0"), 0x047e41), + (hex!("0122222222333333334444444455000003620000000000003630"), 0x047ee1), + (hex!("01222222223333333344444444550000036200000000000063a0"), 0x047f81), + (hex!("0122222222333333334444444455000003630000000000003640"), 0x048021), + (hex!("0122222222333333334444444455000003630000000000007250"), 0x0480c1), + (hex!("0122222222333333334444444455000003640000000000003650"), 0x048161), + (hex!("0122222222333333334444444455000003640000000000005510"), 0x048201), + (hex!("0122222222333333334444444455000003640000000000007850"), 0x0482a1), + (hex!("0122222222333333334444444455000003650000000000003660"), 0x048341), + (hex!("0122222222333333334444444455000003660000000000003670"), 0x0483e1), + 
(hex!("0122222222333333334444444455000003660000000000004650"), 0x048481), + (hex!("01222222223333333344444444550000036600000000000050d0"), 0x048521), + (hex!("0122222222333333334444444455000003660000000000006eb0"), 0x0485c1), + (hex!("0122222222333333334444444455000003670000000000003680"), 0x048661), + (hex!("01222222223333333344444444550000036700000000000071f0"), 0x048701), + (hex!("0122222222333333334444444455000003680000000000003690"), 0x0487a1), + (hex!("01222222223333333344444444550000036900000000000036a0"), 0x048841), + (hex!("0122222222333333334444444455000003690000000000005c70"), 0x0488e1), + (hex!("01222222223333333344444444550000036a00000000000036b0"), 0x048981), + (hex!("01222222223333333344444444550000036a00000000000071b0"), 0x048a21), + (hex!("01222222223333333344444444550000036b00000000000036c0"), 0x048ac1), + (hex!("01222222223333333344444444550000036b0000000000004670"), 0x048b61), + (hex!("01222222223333333344444444550000036c00000000000036d0"), 0x048c01), + (hex!("01222222223333333344444444550000036c0000000000004750"), 0x048ca1), + (hex!("01222222223333333344444444550000036c0000000000006fa0"), 0x048d41), + (hex!("01222222223333333344444444550000036d00000000000036e0"), 0x048de1), + (hex!("01222222223333333344444444550000036d0000000000003f70"), 0x048e81), + (hex!("01222222223333333344444444550000036d0000000000004b90"), 0x048f21), + (hex!("01222222223333333344444444550000036d00000000000057a0"), 0x048fc1), + (hex!("01222222223333333344444444550000036e00000000000036f0"), 0x049061), + (hex!("01222222223333333344444444550000036e00000000000075d0"), 0x049101), + (hex!("01222222223333333344444444550000036f0000000000003700"), 0x0491a1), + (hex!("0122222222333333334444444455000003700000000000003710"), 0x049241), + (hex!("0122222222333333334444444455000003700000000000005aa0"), 0x0492e1), + (hex!("0122222222333333334444444455000003710000000000003720"), 0x049381), + (hex!("0122222222333333334444444455000003710000000000005130"), 0x049421), + (hex!("0122222222333333334444444455000003710000000000006fc0"), 0x0494c1), + (hex!("0122222222333333334444444455000003710000000000007b00"), 0x049561), + (hex!("0122222222333333334444444455000003720000000000003730"), 0x049601), + (hex!("01222222223333333344444444550000037200000000000054d0"), 0x0496a1), + (hex!("0122222222333333334444444455000003730000000000003740"), 0x049741), + (hex!("0122222222333333334444444455000003730000000000004220"), 0x0497e1), + (hex!("0122222222333333334444444455000003740000000000003750"), 0x049881), + (hex!("0122222222333333334444444455000003740000000000004720"), 0x049921), + (hex!("0122222222333333334444444455000003750000000000003760"), 0x0499c1), + (hex!("0122222222333333334444444455000003750000000000004110"), 0x049a61), + (hex!("0122222222333333334444444455000003760000000000003770"), 0x049b01), + (hex!("0122222222333333334444444455000003770000000000003780"), 0x049ba1), + (hex!("0122222222333333334444444455000003780000000000003790"), 0x049c41), + (hex!("0122222222333333334444444455000003780000000000004b40"), 0x049ce1), + (hex!("0122222222333333334444444455000003780000000000005660"), 0x049d81), + (hex!("0122222222333333334444444455000003780000000000005ea0"), 0x049e21), + (hex!("01222222223333333344444444550000037900000000000037a0"), 0x049ec1), + (hex!("01222222223333333344444444550000037a00000000000037b0"), 0x049f61), + (hex!("01222222223333333344444444550000037b00000000000037c0"), 0x04a001), + (hex!("01222222223333333344444444550000037c00000000000037d0"), 0x04a0a1), + 
(hex!("01222222223333333344444444550000037c0000000000004340"), 0x04a141), + (hex!("01222222223333333344444444550000037c0000000000005230"), 0x04a1e1), + (hex!("01222222223333333344444444550000037d00000000000037e0"), 0x04a281), + (hex!("01222222223333333344444444550000037d00000000000051e0"), 0x04a321), + (hex!("01222222223333333344444444550000037e00000000000037f0"), 0x04a3c1), + (hex!("01222222223333333344444444550000037e0000000000004090"), 0x04a461), + (hex!("01222222223333333344444444550000037e0000000000005c20"), 0x04a501), + (hex!("01222222223333333344444444550000037f0000000000003800"), 0x04a5a1), + (hex!("0122222222333333334444444455000003800000000000003810"), 0x04a641), + (hex!("0122222222333333334444444455000003800000000000007630"), 0x04a6e1), + (hex!("0122222222333333334444444455000003810000000000003820"), 0x04a781), + (hex!("0122222222333333334444444455000003820000000000003830"), 0x04a821), + (hex!("0122222222333333334444444455000003820000000000004170"), 0x04a8c1), + (hex!("0122222222333333334444444455000003830000000000003840"), 0x04a961), + (hex!("0122222222333333334444444455000003840000000000003850"), 0x04aa01), + (hex!("0122222222333333334444444455000003850000000000003860"), 0x04aaa1), + (hex!("0122222222333333334444444455000003850000000000004180"), 0x04ab41), + (hex!("0122222222333333334444444455000003850000000000005c90"), 0x04abe1), + (hex!("0122222222333333334444444455000003850000000000005da0"), 0x04ac81), + (hex!("0122222222333333334444444455000003850000000000006ff0"), 0x04ad21), + (hex!("0122222222333333334444444455000003860000000000003870"), 0x04adc1), + (hex!("01222222223333333344444444550000038600000000000065c0"), 0x04ae61), + (hex!("0122222222333333334444444455000003870000000000003880"), 0x04af01), + (hex!("0122222222333333334444444455000003870000000000007cc0"), 0x04afa1), + (hex!("0122222222333333334444444455000003880000000000003890"), 0x04b041), + (hex!("01222222223333333344444444550000038900000000000038a0"), 0x04b0e1), + (hex!("01222222223333333344444444550000038a00000000000038b0"), 0x04b181), + (hex!("01222222223333333344444444550000038a00000000000073e0"), 0x04b221), + (hex!("01222222223333333344444444550000038b00000000000038c0"), 0x04b2c1), + (hex!("01222222223333333344444444550000038c00000000000038d0"), 0x04b361), + (hex!("01222222223333333344444444550000038d00000000000038e0"), 0x04b401), + (hex!("01222222223333333344444444550000038d00000000000069f0"), 0x04b4a1), + (hex!("01222222223333333344444444550000038d0000000000007680"), 0x04b541), + (hex!("01222222223333333344444444550000038e00000000000038f0"), 0x04b5e1), + (hex!("01222222223333333344444444550000038f0000000000003900"), 0x04b681), + (hex!("01222222223333333344444444550000038f00000000000045b0"), 0x04b721), + (hex!("01222222223333333344444444550000038f0000000000007180"), 0x04b7c1), + (hex!("0122222222333333334444444455000003900000000000003910"), 0x04b861), + (hex!("0122222222333333334444444455000003910000000000003920"), 0x04b901), + (hex!("0122222222333333334444444455000003910000000000004a20"), 0x04b9a1), + (hex!("0122222222333333334444444455000003920000000000003930"), 0x04ba41), + (hex!("01222222223333333344444444550000039200000000000059b0"), 0x04bae1), + (hex!("0122222222333333334444444455000003930000000000003940"), 0x04bb81), + (hex!("0122222222333333334444444455000003930000000000006cc0"), 0x04bc21), + (hex!("0122222222333333334444444455000003940000000000003950"), 0x04bcc1), + (hex!("01222222223333333344444444550000039400000000000056c0"), 0x04bd61), + 
(hex!("0122222222333333334444444455000003950000000000003960"), 0x04be01), + (hex!("0122222222333333334444444455000003950000000000004cc0"), 0x04bea1), + (hex!("0122222222333333334444444455000003950000000000007720"), 0x04bf41), + (hex!("0122222222333333334444444455000003960000000000003970"), 0x04bfe1), + (hex!("0122222222333333334444444455000003960000000000004da0"), 0x04c081), + (hex!("0122222222333333334444444455000003960000000000004df0"), 0x04c121), + (hex!("0122222222333333334444444455000003960000000000004f30"), 0x04c1c1), + (hex!("01222222223333333344444444550000039600000000000050f0"), 0x04c261), + (hex!("0122222222333333334444444455000003960000000000007940"), 0x04c301), + (hex!("0122222222333333334444444455000003970000000000003980"), 0x04c3a1), + (hex!("0122222222333333334444444455000003970000000000005850"), 0x04c441), + (hex!("0122222222333333334444444455000003970000000000007bd0"), 0x04c4e1), + (hex!("0122222222333333334444444455000003980000000000003990"), 0x04c581), + (hex!("0122222222333333334444444455000003980000000000004c00"), 0x04c621), + (hex!("0122222222333333334444444455000003980000000000005580"), 0x04c6c1), + (hex!("01222222223333333344444444550000039900000000000039a0"), 0x04c761), + (hex!("0122222222333333334444444455000003990000000000005820"), 0x04c801), + (hex!("01222222223333333344444444550000039a00000000000039b0"), 0x04c8a1), + (hex!("01222222223333333344444444550000039b00000000000039c0"), 0x04c941), + (hex!("01222222223333333344444444550000039b0000000000004c10"), 0x04c9e1), + (hex!("01222222223333333344444444550000039b0000000000006460"), 0x04ca81), + (hex!("01222222223333333344444444550000039c00000000000039d0"), 0x04cb21), + (hex!("01222222223333333344444444550000039d00000000000039e0"), 0x04cbc1), + (hex!("01222222223333333344444444550000039d00000000000044c0"), 0x04cc61), + (hex!("01222222223333333344444444550000039d00000000000049e0"), 0x04cd01), + (hex!("01222222223333333344444444550000039e00000000000039f0"), 0x04cda1), + (hex!("01222222223333333344444444550000039f0000000000003a00"), 0x04ce41), + (hex!("0122222222333333334444444455000003a00000000000003a10"), 0x04cee1), + (hex!("0122222222333333334444444455000003a10000000000003a20"), 0x04cf81), + (hex!("0122222222333333334444444455000003a10000000000006a80"), 0x04d021), + (hex!("0122222222333333334444444455000003a20000000000003a30"), 0x04d0c1), + (hex!("0122222222333333334444444455000003a200000000000062b0"), 0x04d161), + (hex!("0122222222333333334444444455000003a30000000000003a40"), 0x04d201), + (hex!("0122222222333333334444444455000003a30000000000006ce0"), 0x04d2a1), + (hex!("0122222222333333334444444455000003a40000000000003a50"), 0x04d341), + (hex!("0122222222333333334444444455000003a50000000000003a60"), 0x04d3e1), + (hex!("0122222222333333334444444455000003a60000000000003a70"), 0x04d481), + (hex!("0122222222333333334444444455000003a60000000000007750"), 0x04d521), + (hex!("0122222222333333334444444455000003a70000000000003a80"), 0x04d5c1), + (hex!("0122222222333333334444444455000003a70000000000005b10"), 0x04d661), + (hex!("0122222222333333334444444455000003a80000000000003a90"), 0x04d701), + (hex!("0122222222333333334444444455000003a80000000000006c20"), 0x04d7a1), + (hex!("0122222222333333334444444455000003a90000000000003aa0"), 0x04d841), + (hex!("0122222222333333334444444455000003a90000000000005b70"), 0x04d8e1), + (hex!("0122222222333333334444444455000003a900000000000070e0"), 0x04d981), + (hex!("0122222222333333334444444455000003aa0000000000003ab0"), 0x04da21), + 
(hex!("0122222222333333334444444455000003aa00000000000049f0"), 0x04dac1), + (hex!("0122222222333333334444444455000003aa0000000000004d60"), 0x04db61), + (hex!("0122222222333333334444444455000003ab0000000000003ac0"), 0x04dc01), + (hex!("0122222222333333334444444455000003ac0000000000003ad0"), 0x04dca1), + (hex!("0122222222333333334444444455000003ac0000000000004580"), 0x04dd41), + (hex!("0122222222333333334444444455000003ad0000000000003ae0"), 0x04dde1), + (hex!("0122222222333333334444444455000003ae0000000000003af0"), 0x04de81), + (hex!("0122222222333333334444444455000003af0000000000003b00"), 0x04df21), + (hex!("0122222222333333334444444455000003b00000000000003b10"), 0x04dfc1), + (hex!("0122222222333333334444444455000003b10000000000003b20"), 0x04e061), + (hex!("0122222222333333334444444455000003b10000000000003fd0"), 0x04e101), + (hex!("0122222222333333334444444455000003b20000000000003b30"), 0x04e1a1), + (hex!("0122222222333333334444444455000003b30000000000003b40"), 0x04e241), + (hex!("0122222222333333334444444455000003b40000000000003b50"), 0x04e2e1), + (hex!("0122222222333333334444444455000003b40000000000007450"), 0x04e381), + (hex!("0122222222333333334444444455000003b50000000000003b60"), 0x04e421), + (hex!("0122222222333333334444444455000003b60000000000003b70"), 0x04e4c1), + (hex!("0122222222333333334444444455000003b70000000000003b80"), 0x04e561), + (hex!("0122222222333333334444444455000003b70000000000006d50"), 0x04e601), + (hex!("0122222222333333334444444455000003b80000000000003b90"), 0x04e6a1), + (hex!("0122222222333333334444444455000003b800000000000057c0"), 0x04e741), + (hex!("0122222222333333334444444455000003b800000000000078a0"), 0x04e7e1), + (hex!("0122222222333333334444444455000003b90000000000003ba0"), 0x04e881), + (hex!("0122222222333333334444444455000003b90000000000006750"), 0x04e921), + (hex!("0122222222333333334444444455000003ba0000000000003bb0"), 0x04e9c1), + (hex!("0122222222333333334444444455000003ba0000000000007a10"), 0x04ea61), + (hex!("0122222222333333334444444455000003ba0000000000007a20"), 0x04eb01), + (hex!("0122222222333333334444444455000003bb0000000000003bc0"), 0x04eba1), + (hex!("0122222222333333334444444455000003bb0000000000005bc0"), 0x04ec41), + (hex!("0122222222333333334444444455000003bc0000000000003bd0"), 0x04ece1), + (hex!("0122222222333333334444444455000003bc0000000000005e80"), 0x04ed81), + (hex!("0122222222333333334444444455000003bc0000000000007ab0"), 0x04ee21), + (hex!("0122222222333333334444444455000003bd0000000000003be0"), 0x04eec1), + (hex!("0122222222333333334444444455000003bd00000000000049b0"), 0x04ef61), + (hex!("0122222222333333334444444455000003be0000000000003bf0"), 0x04f001), + (hex!("0122222222333333334444444455000003be0000000000005780"), 0x04f0a1), + (hex!("0122222222333333334444444455000003be0000000000007930"), 0x04f141), + (hex!("0122222222333333334444444455000003bf0000000000003c00"), 0x04f1e1), + (hex!("0122222222333333334444444455000003bf0000000000005de0"), 0x04f281), + (hex!("0122222222333333334444444455000003bf00000000000060b0"), 0x04f321), + (hex!("0122222222333333334444444455000003bf00000000000060c0"), 0x04f3c1), + (hex!("0122222222333333334444444455000003bf0000000000006a50"), 0x04f461), + (hex!("0122222222333333334444444455000003c00000000000003c10"), 0x04f501), + (hex!("0122222222333333334444444455000003c00000000000004030"), 0x04f5a1), + (hex!("0122222222333333334444444455000003c10000000000003c20"), 0x04f641), + (hex!("0122222222333333334444444455000003c20000000000003c30"), 0x04f6e1), + 
(hex!("0122222222333333334444444455000003c200000000000040b0"), 0x04f781), + (hex!("0122222222333333334444444455000003c30000000000003c40"), 0x04f821), + (hex!("0122222222333333334444444455000003c40000000000003c50"), 0x04f8c1), + (hex!("0122222222333333334444444455000003c40000000000005ba0"), 0x04f961), + (hex!("0122222222333333334444444455000003c50000000000003c60"), 0x04fa01), + (hex!("0122222222333333334444444455000003c60000000000003c70"), 0x04faa1), + (hex!("0122222222333333334444444455000003c70000000000003c80"), 0x04fb41), + (hex!("0122222222333333334444444455000003c70000000000004270"), 0x04fbe1), + (hex!("0122222222333333334444444455000003c80000000000003c90"), 0x04fc81), + (hex!("0122222222333333334444444455000003c80000000000006e70"), 0x04fd21), + (hex!("0122222222333333334444444455000003c90000000000003ca0"), 0x04fdc1), + (hex!("0122222222333333334444444455000003ca0000000000003cb0"), 0x04fe61), + (hex!("0122222222333333334444444455000003ca0000000000006e20"), 0x04ff01), + (hex!("0122222222333333334444444455000003ca0000000000007c20"), 0x04ffa1), + (hex!("0122222222333333334444444455000003cb0000000000003cc0"), 0x050041), + (hex!("0122222222333333334444444455000003cc0000000000003cd0"), 0x0500e1), + (hex!("0122222222333333334444444455000003cc0000000000006120"), 0x050181), + (hex!("0122222222333333334444444455000003cc0000000000007950"), 0x050221), + (hex!("0122222222333333334444444455000003cd0000000000003ce0"), 0x0502c1), + (hex!("0122222222333333334444444455000003ce0000000000003cf0"), 0x050361), + (hex!("0122222222333333334444444455000003cf0000000000003d00"), 0x050401), + (hex!("0122222222333333334444444455000003d00000000000003d10"), 0x0504a1), + (hex!("0122222222333333334444444455000003d10000000000003d20"), 0x050541), + (hex!("0122222222333333334444444455000003d10000000000005e50"), 0x0505e1), + (hex!("0122222222333333334444444455000003d10000000000007880"), 0x050681), + (hex!("0122222222333333334444444455000003d20000000000003d30"), 0x050721), + (hex!("0122222222333333334444444455000003d20000000000005d00"), 0x0507c1), + (hex!("0122222222333333334444444455000003d30000000000003d40"), 0x050861), + (hex!("0122222222333333334444444455000003d30000000000005d40"), 0x050901), + (hex!("0122222222333333334444444455000003d300000000000063f0"), 0x0509a1), + (hex!("0122222222333333334444444455000003d40000000000003d50"), 0x050a41), + (hex!("0122222222333333334444444455000003d40000000000005700"), 0x050ae1), + (hex!("0122222222333333334444444455000003d400000000000078f0"), 0x050b81), + (hex!("0122222222333333334444444455000003d50000000000003d60"), 0x050c21), + (hex!("0122222222333333334444444455000003d60000000000003d70"), 0x050cc1), + (hex!("0122222222333333334444444455000003d70000000000003d80"), 0x050d61), + (hex!("0122222222333333334444444455000003d80000000000003d90"), 0x050e01), + (hex!("0122222222333333334444444455000003d80000000000006690"), 0x050ea1), + (hex!("0122222222333333334444444455000003d90000000000003da0"), 0x050f41), + (hex!("0122222222333333334444444455000003d900000000000076d0"), 0x050fe1), + (hex!("0122222222333333334444444455000003da0000000000003db0"), 0x051081), + (hex!("0122222222333333334444444455000003db0000000000003dc0"), 0x051121), + (hex!("0122222222333333334444444455000003db0000000000004a30"), 0x0511c1), + (hex!("0122222222333333334444444455000003db0000000000005390"), 0x051261), + (hex!("0122222222333333334444444455000003dc0000000000003dd0"), 0x051301), + (hex!("0122222222333333334444444455000003dc0000000000006d60"), 0x0513a1), + 
(hex!("0122222222333333334444444455000003dd0000000000003de0"), 0x051441), + (hex!("0122222222333333334444444455000003de0000000000003df0"), 0x0514e1), + (hex!("0122222222333333334444444455000003df0000000000003e00"), 0x051581), + (hex!("0122222222333333334444444455000003df0000000000005240"), 0x051621), + (hex!("0122222222333333334444444455000003df0000000000005610"), 0x0516c1), + (hex!("0122222222333333334444444455000003e00000000000003e10"), 0x051761), + (hex!("0122222222333333334444444455000003e00000000000006500"), 0x051801), + (hex!("0122222222333333334444444455000003e10000000000003e20"), 0x0518a1), + (hex!("0122222222333333334444444455000003e10000000000006a10"), 0x051941), + (hex!("0122222222333333334444444455000003e10000000000007c10"), 0x0519e1), + (hex!("0122222222333333334444444455000003e20000000000003e30"), 0x051a81), + (hex!("0122222222333333334444444455000003e20000000000006310"), 0x051b21), + (hex!("0122222222333333334444444455000003e30000000000003e40"), 0x051bc1), + (hex!("0122222222333333334444444455000003e40000000000003e50"), 0x051c61), + (hex!("0122222222333333334444444455000003e40000000000006780"), 0x051d01), + (hex!("0122222222333333334444444455000003e40000000000007ce0"), 0x051da1), + (hex!("0122222222333333334444444455000003e50000000000003e60"), 0x051e41), + (hex!("0122222222333333334444444455000003e60000000000003e70"), 0x051ee1), + (hex!("0122222222333333334444444455000003e60000000000005040"), 0x051f81), + (hex!("0122222222333333334444444455000003e60000000000005bf0"), 0x052021), + (hex!("0122222222333333334444444455000003e70000000000003e80"), 0x0520c1), + (hex!("0122222222333333334444444455000003e70000000000003f50"), 0x052161), +]; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs new file mode 100644 index 0000000000..0774fa42a6 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -0,0 +1,442 @@ +//! Implementation of append-only file data structure +//! used to keep in-memory layers spilled on disk. + +use crate::config::PageServerConf; +use crate::page_cache; +use crate::page_cache::PAGE_SZ; +use crate::page_cache::{ReadBufResult, WriteBufResult}; +use crate::tenant::blob_io::BlobWriter; +use crate::tenant::block_io::BlockReader; +use crate::virtual_file::VirtualFile; +use once_cell::sync::Lazy; +use std::cmp::min; +use std::collections::HashMap; +use std::fs::OpenOptions; +use std::io::{self, ErrorKind}; +use std::ops::DerefMut; +use std::path::PathBuf; +use std::sync::{Arc, RwLock}; +use tracing::*; +use utils::id::{TenantId, TimelineId}; + +use std::os::unix::fs::FileExt; + +/// +/// This is the global cache of file descriptors (File objects). 
+///
+static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
+    RwLock::new(EphemeralFiles {
+        next_file_id: 1,
+        files: HashMap::new(),
+    })
+});
+
+pub struct EphemeralFiles {
+    next_file_id: u64,
+
+    files: HashMap<u64, Arc<VirtualFile>>,
+}
+
+pub struct EphemeralFile {
+    file_id: u64,
+    _tenant_id: TenantId,
+    _timeline_id: TimelineId,
+    file: Arc<VirtualFile>,
+
+    pub size: u64,
+}
+
+impl EphemeralFile {
+    pub fn create(
+        conf: &PageServerConf,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<EphemeralFile, io::Error> {
+        let mut l = EPHEMERAL_FILES.write().unwrap();
+        let file_id = l.next_file_id;
+        l.next_file_id += 1;
+
+        let filename = conf
+            .timeline_path(&timeline_id, &tenant_id)
+            .join(PathBuf::from(format!("ephemeral-{}", file_id)));
+
+        let file = VirtualFile::open_with_options(
+            &filename,
+            OpenOptions::new().read(true).write(true).create(true),
+        )?;
+        let file_rc = Arc::new(file);
+        l.files.insert(file_id, file_rc.clone());
+
+        Ok(EphemeralFile {
+            file_id,
+            _tenant_id: tenant_id,
+            _timeline_id: timeline_id,
+            file: file_rc,
+            size: 0,
+        })
+    }
+
+    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
+        let mut off = 0;
+        while off < PAGE_SZ {
+            let n = self
+                .file
+                .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
+
+            if n == 0 {
+                // Reached EOF. Fill the rest of the buffer with zeros.
+                const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
+
+                buf[off..].copy_from_slice(&ZERO_BUF[off..]);
+                break;
+            }
+
+            off += n as usize;
+        }
+        Ok(())
+    }
+
+    fn get_buf_for_write(
+        &self,
+        blkno: u32,
+    ) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
+        // Look up the right page
+        let cache = page_cache::get();
+        let mut write_guard = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => guard,
+            WriteBufResult::NotFound(mut guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                self.fill_buffer(guard.deref_mut(), blkno)?;
+                guard.mark_valid();
+
+                // And then fall through to modify it.
+                guard
+            }
+        };
+        write_guard.mark_dirty();
+
+        Ok(write_guard)
+    }
+}
+
+/// Does the given filename look like an ephemeral file?
+pub fn is_ephemeral_file(filename: &str) -> bool {
+    if let Some(rest) = filename.strip_prefix("ephemeral-") {
+        rest.parse::<u64>().is_ok()
+    } else {
+        false
+    }
+}
+
+impl FileExt for EphemeralFile {
+    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, dstbuf.len());
+
+        let read_guard;
+        let mut write_guard;
+
+        let cache = page_cache::get();
+        let buf = match cache
+            .read_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+        {
+            ReadBufResult::Found(guard) => {
+                read_guard = guard;
+                read_guard.as_ref()
+            }
+            ReadBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to read the requested slice from the
+                // buffer.
+                write_guard.as_ref()
+            }
+        };
+
+        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
+        Ok(len)
+    }
+
+    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, srcbuf.len());
+
+        let mut write_guard;
+        let cache = page_cache::get();
+        let buf = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => {
+                write_guard = guard;
+                write_guard.deref_mut()
+            }
+            WriteBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to modify it.
+                write_guard.deref_mut()
+            }
+        };
+
+        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
+        write_guard.mark_dirty();
+        Ok(len)
+    }
+}
+
+impl BlobWriter for EphemeralFile {
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+        let pos = self.size;
+
+        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
+        let mut off = (pos % PAGE_SZ as u64) as usize;
+
+        let mut buf = self.get_buf_for_write(blknum)?;
+
+        // Write the length field: blobs shorter than 0x80 bytes get a 1-byte length,
+        // longer blobs get a 4-byte big-endian length with the high bit set.
+        if srcbuf.len() < 0x80 {
+            buf[off] = srcbuf.len() as u8;
+            off += 1;
+        } else {
+            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
+            len_buf[0] |= 0x80;
+            let thislen = PAGE_SZ - off;
+            if thislen < 4 {
+                // it needs to be split across pages
+                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
+                blknum += 1;
+                buf = self.get_buf_for_write(blknum)?;
+                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
+                off = 4 - thislen;
+            } else {
+                buf[off..off + 4].copy_from_slice(&len_buf);
+                off += 4;
+            }
+        }
+
+        // Write the payload
+        let mut buf_remain = srcbuf;
+        while !buf_remain.is_empty() {
+            let mut page_remain = PAGE_SZ - off;
+            if page_remain == 0 {
+                blknum += 1;
+                buf = self.get_buf_for_write(blknum)?;
+                off = 0;
+                page_remain = PAGE_SZ;
+            }
+            let this_blk_len = min(page_remain, buf_remain.len());
+            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
+            off += this_blk_len;
+            buf_remain = &buf_remain[this_blk_len..];
+        }
+        drop(buf);
+
+        // Advance the file size by the length header plus the payload.
+        if srcbuf.len() < 0x80 {
+            self.size += 1;
+        } else {
+            self.size += 4;
+        }
+        self.size += srcbuf.len() as u64;
+
+        Ok(pos)
+    }
+}
+
+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_ephemeral(self.file_id);
+
+        // remove entry from the hash map
+        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
+
+        // unlink the file
+        let res = std::fs::remove_file(&self.file.path);
+        if let Err(e) = res {
+            warn!(
+                "could not remove ephemeral file '{}': {}",
+                self.file.path.display(),
+                e
+            );
+        }
+    }
+}
+
+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
+    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
+        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
+            Ok(_) => Ok(()),
+            Err(e) => Err(io::Error::new(
+                ErrorKind::Other,
+                format!(
+                    "failed to write back to ephemeral file at {} error: {}",
+                    file.path.display(),
+                    e
+                ),
+            )),
+        }
+    } else {
+        Err(io::Error::new(
+            ErrorKind::Other,
+            "could not write back page, not found in ephemeral files hash",
+        ))
+    }
+}
+
+impl BlockReader for EphemeralFile {
EphemeralFile { + type BlockLease = page_cache::PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + // Look up the right page + let cache = page_cache::get(); + loop { + match cache + .read_ephemeral_buf(self.file_id, blknum) + .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))? + { + ReadBufResult::Found(guard) => return Ok(guard), + ReadBufResult::NotFound(mut write_guard) => { + // Read the page from disk into the buffer + self.fill_buffer(write_guard.deref_mut(), blknum)?; + write_guard.mark_valid(); + + // Swap for read lock + continue; + } + }; + } + } +} + +fn to_io_error(e: anyhow::Error, context: &str) -> io::Error { + io::Error::new(ErrorKind::Other, format!("{context}: {e:#}")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::blob_io::{BlobCursor, BlobWriter}; + use crate::tenant::block_io::BlockCursor; + use rand::{seq::SliceRandom, thread_rng, RngCore}; + use std::fs; + use std::str::FromStr; + + fn harness( + test_name: &str, + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); + + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + + Ok((conf, tenant_id, timeline_id)) + } + + // Helper function to slurp contents of a file, starting at the current position, + // into a string + fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result { + let mut buf = Vec::new(); + buf.resize(len, 0u8); + + efile.read_exact_at(&mut buf, offset)?; + + Ok(String::from_utf8_lossy(&buf) + .trim_end_matches('\0') + .to_string()) + } + + #[test] + fn test_ephemeral_files() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; + + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; + + file_a.write_all_at(b"foo", 0)?; + assert_eq!("foo", read_string(&file_a, 0, 20)?); + + file_a.write_all_at(b"bar", 3)?; + assert_eq!("foobar", read_string(&file_a, 0, 20)?); + + // Open a lot of files, enough to cause some page evictions. + let mut efiles = Vec::new(); + for fileno in 0..100 { + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; + efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; + assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); + efiles.push((fileno, efile)); + } + + // Check that all the files can still be read from. Use them in random order for + // good measure. 
+ efiles.as_mut_slice().shuffle(&mut thread_rng()); + for (fileno, efile) in efiles.iter_mut() { + assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?); + } + + Ok(()) + } + + #[test] + fn test_ephemeral_blobs() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; + + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + + let pos_foo = file.write_blob(b"foo")?; + assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + let pos_bar = file.write_blob(b"bar")?; + assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice()); + + let mut blobs = Vec::new(); + for i in 0..10000 { + let data = Vec::from(format!("blob{}", i).as_bytes()); + let pos = file.write_blob(&data)?; + blobs.push((pos, data)); + } + // also test with a large blobs + for i in 0..100 { + let data = format!("blob{}", i).as_bytes().repeat(100); + let pos = file.write_blob(&data)?; + blobs.push((pos, data)); + } + + let mut cursor = BlockCursor::new(&file); + for (pos, expected) in blobs { + let actual = cursor.read_blob(pos)?; + assert_eq!(actual, expected); + } + drop(cursor); + + // Test a large blob that spans multiple pages + let mut large_data = Vec::new(); + large_data.resize(20000, 0); + thread_rng().fill_bytes(&mut large_data); + let pos_large = file.write_blob(&large_data)?; + let result = file.block_cursor().read_blob(pos_large)?; + assert_eq!(result, large_data); + + Ok(()) + } +} diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/filename.rs new file mode 100644 index 0000000000..0ebf2d479b --- /dev/null +++ b/pageserver/src/tenant/filename.rs @@ -0,0 +1,187 @@ +//! +//! Helper functions for dealing with filenames of the image and delta layer files. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use std::cmp::Ordering; +use std::fmt; +use std::ops::Range; +use std::path::PathBuf; + +use utils::lsn::Lsn; + +// Note: Timeline::load_layer_map() relies on this sort order +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct DeltaFileName { + pub key_range: Range, + pub lsn_range: Range, +} + +impl PartialOrd for DeltaFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for DeltaFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.start.cmp(&other.lsn_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.end.cmp(&other.lsn_range.end); + + cmp + } +} + +/// Represents the filename of a DeltaLayer +/// +/// -__- +/// +impl DeltaFileName { + /// + /// Parse a string as a delta file name. Returns None if the filename does not + /// match the expected pattern. 
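+    /// The name is expected to follow the
+    /// `<key start>-<key end>__<LSN start>-<LSN end>` layout produced by the
+    /// `Display` impl in this file. For illustration only (the LSN hex values
+    /// here are made up):
+    ///
+    /// ```text
+    /// 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568-00000000346CD569
+    /// ```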
+ /// + pub fn parse_str(fname: &str) -> Option { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + let mut lsn_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_start_str = lsn_parts.next()?; + let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { + return None; + } + + let key_start = Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; + + let start_lsn = Lsn::from_hex(lsn_start_str).ok()?; + let end_lsn = Lsn::from_hex(lsn_end_str).ok()?; + + if start_lsn >= end_lsn { + return None; + // or panic? + } + + if key_start >= key_end { + return None; + // or panic? + } + + Some(DeltaFileName { + key_range: key_start..key_end, + lsn_range: start_lsn..end_lsn, + }) + } +} + +impl fmt::Display for DeltaFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}-{}__{:016X}-{:016X}", + self.key_range.start, + self.key_range.end, + u64::from(self.lsn_range.start), + u64::from(self.lsn_range.end), + ) + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct ImageFileName { + pub key_range: Range, + pub lsn: Lsn, +} + +impl PartialOrd for ImageFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ImageFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn.cmp(&other.lsn); + + cmp + } +} + +/// +/// Represents the filename of an ImageLayer +/// +/// -__ +impl ImageFileName { + /// + /// Parse a string as an image file name. Returns None if the filename does not + /// match the expected pattern. + /// + pub fn parse_str(fname: &str) -> Option { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_str = parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() { + return None; + } + + let key_start = Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; + + let lsn = Lsn::from_hex(lsn_str).ok()?; + + Some(ImageFileName { + key_range: key_start..key_end, + lsn, + }) + } +} + +impl fmt::Display for ImageFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}-{}__{:016X}", + self.key_range.start, + self.key_range.end, + u64::from(self.lsn), + ) + } +} + +/// Helper enum to hold a PageServerConf, or a path +/// +/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the +/// global config, and paths to layer files are constructed using the tenant/timeline +/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer +/// struct for a file on disk, without having a page server running, so that we have no +/// config. In that case, we use the Path variant to hold the full path to the file on +/// disk. +pub enum PathOrConf { + Path(PathBuf), + Conf(&'static PageServerConf), +} diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/image_layer.rs new file mode 100644 index 0000000000..8409d34bc9 --- /dev/null +++ b/pageserver/src/tenant/image_layer.rs @@ -0,0 +1,633 @@ +//! 
An ImageLayer represents an image or a snapshot of a key-range at +//! one particular LSN. It contains an image of all key-value pairs +//! in its key-range. Any key that falls into the image layer's range +//! but does not exist in the layer, does not exist. +//! +//! An image layer is stored in a file on disk. The file is stored in +//! timelines/ directory. Currently, there are no +//! subdirectories, and each image layer file is named like this: +//! +//! -__ +//! +//! For example: +//! +//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568 +//! +//! Every image layer file consists of three parts: "summary", +//! "index", and "values". The summary is a fixed size header at the +//! beginning of the file, and it contains basic information about the +//! layer, and offsets to the other parts. The "index" is a B-tree, +//! mapping from Key to an offset in the "values" part. The +//! actual page images are stored in the "values" part. +use crate::config::PageServerConf; +use crate::page_cache::PAGE_SZ; +use crate::repository::{Key, Value, KEY_SIZE}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; +use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; +use crate::tenant::filename::{ImageFileName, PathOrConf}; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::virtual_file::VirtualFile; +use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; +use anyhow::{bail, ensure, Context, Result}; +use bytes::Bytes; +use hex; +use rand::{distributions::Alphanumeric, Rng}; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::io::Write; +use std::io::{Seek, SeekFrom}; +use std::ops::Range; +use std::path::{Path, PathBuf}; +use std::sync::{RwLock, RwLockReadGuard}; +use tracing::*; + +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +/// +/// Header stored in the beginning of the file +/// +/// After this comes the 'values' part, starting on block 1. After that, +/// the 'index' starts at the block indicated by 'index_start_blk' +/// +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +struct Summary { + /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. + magic: u16, + format_version: u16, + + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn: Lsn, + + /// Block number where the 'index' part of the file begins. + index_start_blk: u32, + /// Block within the 'index', where the B-tree root page is stored + index_root_blk: u32, + // the 'values' part starts after the summary header, on block 1. +} + +impl From<&ImageLayer> for Summary { + fn from(layer: &ImageLayer) -> Self { + Self { + magic: IMAGE_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, + tenant_id: layer.tenant_id, + timeline_id: layer.timeline_id, + key_range: layer.key_range.clone(), + lsn: layer.lsn, + + index_start_blk: 0, + index_root_blk: 0, + } + } +} + +/// +/// ImageLayer is the in-memory data structure associated with an on-disk image +/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a +/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'. +/// Otherwise the struct is just a placeholder for a file that exists on disk, +/// and it needs to be loaded before using it in queries. 
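+///
+/// A rough usage sketch (illustrative only; `conf`, `fname`, `key`, `lsn` and
+/// `reconstruct_state` are assumed to be provided by the caller):
+///
+/// ```ignore
+/// let layer = ImageLayer::new(conf, timeline_id, tenant_id, &fname);
+/// // The index is read from disk lazily, on the first query.
+/// let res = layer.get_value_reconstruct_data(key, lsn..Lsn(lsn.0 + 1), &mut reconstruct_state)?;
+/// ```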
+/// +pub struct ImageLayer { + path_or_conf: PathOrConf, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + + // This entry contains an image of all pages as of this LSN + pub lsn: Lsn, + + inner: RwLock, +} + +pub struct ImageLayerInner { + /// If false, the 'index' has not been loaded into memory yet. + loaded: bool, + + // values copied from summary + index_start_blk: u32, + index_root_blk: u32, + + /// Reader object for reading blocks from the file. (None if not loaded yet) + file: Option>, +} + +impl Layer for ImageLayer { + fn filename(&self) -> PathBuf { + PathBuf::from(self.layer_name().to_string()) + } + + fn local_path(&self) -> Option { + Some(self.path()) + } + + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + // End-bound is exclusive + self.lsn..(self.lsn + 1) + } + + /// Look up given page in the file + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + assert!(self.key_range.contains(&key)); + assert!(lsn_range.start >= self.lsn); + assert!(lsn_range.end >= self.lsn); + + let inner = self.load()?; + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + if let Some(offset) = tree_reader.get(&keybuf)? { + let blob = file.block_cursor().read_blob(offset).with_context(|| { + format!( + "failed to read value from data file {} at offset {}", + self.filename().display(), + offset + ) + })?; + let value = Bytes::from(blob); + + reconstruct_state.img = Some((self.lsn, value)); + Ok(ValueReconstructResult::Complete) + } else { + Ok(ValueReconstructResult::Missing) + } + } + + fn iter(&self) -> Box>> { + todo!(); + } + + fn delete(&self) -> Result<()> { + // delete underlying file + fs::remove_file(self.path())?; + Ok(()) + } + + fn is_incremental(&self) -> bool { + false + } + + fn is_in_memory(&self) -> bool { + false + } + + /// debugging function to print out the contents of the layer + fn dump(&self, verbose: bool) -> Result<()> { + println!( + "----- image layer for ten {} tli {} key {}-{} at {} ----", + self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn + ); + + if !verbose { + return Ok(()); + } + + let inner = self.load()?; + let file = inner.file.as_ref().unwrap(); + let tree_reader = + DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); + + tree_reader.dump()?; + + tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { + println!("key: {} offset {}", hex::encode(key), value); + true + })?; + + Ok(()) + } +} + +impl ImageLayer { + fn path_for( + path_or_conf: &PathOrConf, + timeline_id: TimelineId, + tenant_id: TenantId, + fname: &ImageFileName, + ) -> PathBuf { + match path_or_conf { + PathOrConf::Path(path) => path.to_path_buf(), + PathOrConf::Conf(conf) => conf + .timeline_path(&timeline_id, &tenant_id) + .join(fname.to_string()), + } + } + + fn temp_path_for( + conf: &PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + fname: &ImageFileName, + ) -> PathBuf { + let rand_string: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(8) + .map(char::from) + .collect(); 
+ + conf.timeline_path(&timeline_id, &tenant_id) + .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}")) + } + + /// + /// Open the underlying file and read the metadata into memory, if it's + /// not loaded already. + /// + fn load(&self) -> Result> { + loop { + // Quick exit if already loaded + let inner = self.inner.read().unwrap(); + if inner.loaded { + return Ok(inner); + } + + // Need to open the file and load the metadata. Upgrade our lock to + // a write lock. (Or rather, release and re-lock in write mode.) + drop(inner); + let mut inner = self.inner.write().unwrap(); + if !inner.loaded { + self.load_inner(&mut inner).with_context(|| { + format!("Failed to load image layer {}", self.path().display()) + })? + } else { + // Another thread loaded it while we were not holding the lock. + } + + // We now have the file open and loaded. There's no function to do + // that in the std library RwLock, so we have to release and re-lock + // in read mode. (To be precise, the lock guard was moved in the + // above call to `load_inner`, so it's already been released). And + // while we do that, another thread could unload again, so we have + // to re-check and retry if that happens. + drop(inner); + } + } + + fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> { + let path = self.path(); + + // Open the file if it's not open already. + if inner.file.is_none() { + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + inner.file = Some(FileBlockReader::new(file)); + } + let file = inner.file.as_mut().unwrap(); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + + match &self.path_or_conf { + PathOrConf::Conf(_) => { + let mut expected_summary = Summary::from(self); + expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; + + if actual_summary != expected_summary { + bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary); + } + } + PathOrConf::Path(path) => { + let actual_filename = Path::new(path.file_name().unwrap()); + let expected_filename = self.filename(); + + if actual_filename != expected_filename { + println!( + "warning: filename does not match what is expected from in-file summary" + ); + println!("actual: {:?}", actual_filename); + println!("expected: {:?}", expected_filename); + } + } + } + + inner.index_start_blk = actual_summary.index_start_blk; + inner.index_root_blk = actual_summary.index_root_blk; + inner.loaded = true; + Ok(()) + } + + /// Create an ImageLayer struct representing an existing file on disk + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + filename: &ImageFileName, + ) -> ImageLayer { + ImageLayer { + path_or_conf: PathOrConf::Conf(conf), + timeline_id, + tenant_id, + key_range: filename.key_range.clone(), + lsn: filename.lsn, + inner: RwLock::new(ImageLayerInner { + loaded: false, + file: None, + index_start_blk: 0, + index_root_blk: 0, + }), + } + } + + /// Create an ImageLayer struct representing an existing file on disk. + /// + /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
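+    ///
+    /// A hedged sketch of how such a tool might call it (any handle that
+    /// implements `FileExt` will do; error handling omitted):
+    ///
+    /// ```ignore
+    /// let file = std::fs::File::open(&path)?;
+    /// let layer = ImageLayer::new_for_path(&path, file)?;
+    /// layer.dump(true)?;
+    /// ```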
+ pub fn new_for_path(path: &Path, file: F) -> Result + where + F: std::os::unix::prelude::FileExt, + { + let mut summary_buf = Vec::new(); + summary_buf.resize(PAGE_SZ, 0); + file.read_exact_at(&mut summary_buf, 0)?; + let summary = Summary::des_prefix(&summary_buf)?; + + Ok(ImageLayer { + path_or_conf: PathOrConf::Path(path.to_path_buf()), + timeline_id: summary.timeline_id, + tenant_id: summary.tenant_id, + key_range: summary.key_range, + lsn: summary.lsn, + inner: RwLock::new(ImageLayerInner { + file: None, + loaded: false, + index_start_blk: 0, + index_root_blk: 0, + }), + }) + } + + fn layer_name(&self) -> ImageFileName { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn, + } + } + + /// Path to the layer file in pageserver workdir. + pub fn path(&self) -> PathBuf { + Self::path_for( + &self.path_or_conf, + self.timeline_id, + self.tenant_id, + &self.layer_name(), + ) + } +} + +/// A builder object for constructing a new image layer. +/// +/// Usage: +/// +/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) +/// +/// 2. Write the contents by calling `put_page_image` for every key-value +/// pair in the key range. +/// +/// 3. Call `finish`. +/// +struct ImageLayerWriterInner { + conf: &'static PageServerConf, + path: PathBuf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_range: Range, + lsn: Lsn, + + blob_writer: WriteBlobWriter, + tree: DiskBtreeBuilder, +} + +impl ImageLayerWriterInner { + /// + /// Start building a new image layer. + /// + fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_range: &Range, + lsn: Lsn, + ) -> anyhow::Result { + // Create the file initially with a temporary filename. + // We'll atomically rename it to the final name when we're done. + let path = ImageLayer::temp_path_for( + conf, + timeline_id, + tenant_id, + &ImageFileName { + key_range: key_range.clone(), + lsn, + }, + ); + info!("new image layer {}", path.display()); + let mut file = VirtualFile::open_with_options( + &path, + std::fs::OpenOptions::new().write(true).create_new(true), + )?; + // make room for the header block + file.seek(SeekFrom::Start(PAGE_SZ as u64))?; + let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); + + // Initialize the b-tree index builder + let block_buf = BlockBuf::new(); + let tree_builder = DiskBtreeBuilder::new(block_buf); + + let writer = Self { + conf, + path, + timeline_id, + tenant_id, + key_range: key_range.clone(), + lsn, + tree: tree_builder, + blob_writer, + }; + + Ok(writer) + } + + /// + /// Write next value to the file. + /// + /// The page versions must be appended in blknum order. + /// + fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + ensure!(self.key_range.contains(&key)); + let off = self.blob_writer.write_blob(img)?; + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + self.tree.append(&keybuf, off)?; + + Ok(()) + } + + /// + /// Finish writing the image layer. 
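+    ///
+    /// The resulting file layout matches the description in `Summary`:
+    /// block 0 holds the summary, the 'values' occupy blocks 1 up to
+    /// `index_start_blk`, and the serialized b-tree index follows from there.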
+ /// + fn finish(self) -> anyhow::Result { + let index_start_blk = + ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + + let mut file = self.blob_writer.into_inner(); + + // Write out the index + file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?; + let (index_root_blk, block_buf) = self.tree.finish()?; + for buf in block_buf.blocks { + file.write_all(buf.as_ref())?; + } + + // Fill in the summary on blk 0 + let summary = Summary { + magic: IMAGE_FILE_MAGIC, + format_version: STORAGE_FORMAT_VERSION, + tenant_id: self.tenant_id, + timeline_id: self.timeline_id, + key_range: self.key_range.clone(), + lsn: self.lsn, + index_start_blk, + index_root_blk, + }; + file.seek(SeekFrom::Start(0))?; + Summary::ser_into(&summary, &mut file)?; + + // Note: Because we open the file in write-only mode, we cannot + // reuse the same VirtualFile for reading later. That's why we don't + // set inner.file here. The first read will have to re-open it. + let layer = ImageLayer { + path_or_conf: PathOrConf::Conf(self.conf), + timeline_id: self.timeline_id, + tenant_id: self.tenant_id, + key_range: self.key_range.clone(), + lsn: self.lsn, + inner: RwLock::new(ImageLayerInner { + loaded: false, + file: None, + index_start_blk, + index_root_blk, + }), + }; + + // fsync the file + file.sync_all()?; + + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? + let final_path = ImageLayer::path_for( + &PathOrConf::Conf(self.conf), + self.timeline_id, + self.tenant_id, + &ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn, + }, + ); + std::fs::rename(self.path, &final_path)?; + + trace!("created image layer {}", layer.path().display()); + + Ok(layer) + } +} + +/// A builder object for constructing a new image layer. +/// +/// Usage: +/// +/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...) +/// +/// 2. Write the contents by calling `put_page_image` for every key-value +/// pair in the key range. +/// +/// 3. Call `finish`. +/// +/// # Note +/// +/// As described in https://github.com/neondatabase/neon/issues/2650, it's +/// possible for the writer to drop before `finish` is actually called. So this +/// could lead to odd temporary files in the directory, exhausting file system. +/// This structure wraps `ImageLayerWriterInner` and also contains `Drop` +/// implementation that cleans up the temporary file in failure. It's not +/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves +/// out some fields, making it impossible to implement `Drop`. +/// +#[must_use] +pub struct ImageLayerWriter { + inner: Option, +} + +impl ImageLayerWriter { + /// + /// Start building a new image layer. + /// + pub fn new( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + key_range: &Range, + lsn: Lsn, + ) -> anyhow::Result { + Ok(Self { + inner: Some(ImageLayerWriterInner::new( + conf, + timeline_id, + tenant_id, + key_range, + lsn, + )?), + }) + } + + /// + /// Write next value to the file. + /// + /// The page versions must be appended in blknum order. + /// + pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img) + } + + /// + /// Finish writing the image layer. 
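+    ///
+    /// Sketch of the whole writer lifecycle (names are from this module;
+    /// `conf`, `key_range`, `lsn` and the `images` iterator are assumed to be
+    /// supplied by the caller):
+    ///
+    /// ```ignore
+    /// let mut writer = ImageLayerWriter::new(conf, timeline_id, tenant_id, &key_range, lsn)?;
+    /// for (key, img) in images {
+    ///     writer.put_image(key, &img)?;
+    /// }
+    /// let image_layer = writer.finish()?;
+    /// ```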
+ /// + pub fn finish(mut self) -> anyhow::Result { + self.inner.take().unwrap().finish() + } +} + +impl Drop for ImageLayerWriter { + fn drop(&mut self) { + if let Some(inner) = self.inner.take() { + inner.blob_writer.into_inner().remove(); + } + } +} diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/inmemory_layer.rs new file mode 100644 index 0000000000..9aa33a72ca --- /dev/null +++ b/pageserver/src/tenant/inmemory_layer.rs @@ -0,0 +1,369 @@ +//! An in-memory layer stores recently received key-value pairs. +//! +//! The "in-memory" part of the name is a bit misleading: the actual page versions are +//! held in an ephemeral file, not in memory. The metadata for each page version, i.e. +//! its position in the file, is kept in memory, though. +//! +use crate::config::PageServerConf; +use crate::repository::{Key, Value}; +use crate::tenant::blob_io::{BlobCursor, BlobWriter}; +use crate::tenant::block_io::BlockReader; +use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; +use crate::tenant::ephemeral_file::EphemeralFile; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::walrecord; +use anyhow::{bail, ensure, Result}; +use std::cell::RefCell; +use std::collections::HashMap; +use tracing::*; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, + lsn::Lsn, + vec_map::VecMap, +}; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use std::fmt::Write as _; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::RwLock; + +thread_local! { + /// A buffer for serializing object during [`InMemoryLayer::put_value`]. + /// This buffer is reused for each serialization to avoid additional malloc calls. + static SER_BUFFER: RefCell> = RefCell::new(Vec::new()); +} + +pub struct InMemoryLayer { + conf: &'static PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, + + /// + /// This layer contains all the changes from 'start_lsn'. The + /// start is inclusive. + /// + start_lsn: Lsn, + + /// The above fields never change. The parts that do change are in 'inner', + /// and protected by mutex. + inner: RwLock, +} + +pub struct InMemoryLayerInner { + /// Frozen layers have an exclusive end LSN. + /// Writes are only allowed when this is None + end_lsn: Option, + + /// + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The value is an offset into the + /// ephemeral file where the page version is stored. + /// + index: HashMap>, + + /// The values are stored in a serialized format in this file. + /// Each serialized Value is preceded by a 'u32' length field. + /// PerSeg::page_versions map stores offsets into this file. + file: EphemeralFile, +} + +impl InMemoryLayerInner { + fn assert_writeable(&self) { + assert!(self.end_lsn.is_none()); + } +} + +impl Layer for InMemoryLayer { + // An in-memory layer can be spilled to disk into ephemeral file, + // This function is used only for debugging, so we don't need to be very precise. + // Construct a filename as if it was a delta layer. 
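+    // For example, a frozen layer might render as
+    // "inmem-0000000001696070-0000000001697068" (hex start/end LSNs; the values
+    // here are made up). An open layer has no end LSN yet, so u64::MAX is used
+    // as the upper bound.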
+ fn filename(&self) -> PathBuf { + let inner = self.inner.read().unwrap(); + + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + + PathBuf::from(format!( + "inmem-{:016X}-{:016X}", + self.start_lsn.0, end_lsn.0 + )) + } + + fn local_path(&self) -> Option { + None + } + + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + + fn get_key_range(&self) -> Range { + Key::MIN..Key::MAX + } + + fn get_lsn_range(&self) -> Range { + let inner = self.inner.read().unwrap(); + + let end_lsn = if let Some(end_lsn) = inner.end_lsn { + end_lsn + } else { + Lsn(u64::MAX) + }; + self.start_lsn..end_lsn + } + + /// Look up given value in the layer. + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + ensure!(lsn_range.start >= self.start_lsn); + let mut need_image = true; + + let inner = self.inner.read().unwrap(); + + let mut reader = inner.file.block_cursor(); + + // Scan the page versions backwards, starting from `lsn`. + if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + let buf = reader.read_blob(*pos)?; + let value = Value::des(&buf)?; + match value { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + return Ok(ValueReconstructResult::Complete); + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } + } + } + } + + // release lock on 'inner' + + // If an older page image is needed to reconstruct the page, let the + // caller know. + if need_image { + Ok(ValueReconstructResult::Continue) + } else { + Ok(ValueReconstructResult::Complete) + } + } + + fn iter(&self) -> Box>> { + todo!(); + } + + /// Nothing to do here. When you drop the last reference to the layer, it will + /// be deallocated. + fn delete(&self) -> Result<()> { + bail!("can't delete an InMemoryLayer") + } + + fn is_incremental(&self) -> bool { + // in-memory layer is always considered incremental. 
+ true + } + + fn is_in_memory(&self) -> bool { + true + } + + /// debugging function to print out the contents of the layer + fn dump(&self, verbose: bool) -> Result<()> { + let inner = self.inner.read().unwrap(); + + let end_str = inner + .end_lsn + .as_ref() + .map(Lsn::to_string) + .unwrap_or_default(); + + println!( + "----- in-memory layer for tli {} LSNs {}-{} ----", + self.timeline_id, self.start_lsn, end_str, + ); + + if !verbose { + return Ok(()); + } + + let mut cursor = inner.file.block_cursor(); + let mut buf = Vec::new(); + for (key, vec_map) in inner.index.iter() { + for (lsn, pos) in vec_map.as_slice() { + let mut desc = String::new(); + cursor.read_blob_into_buf(*pos, &mut buf)?; + let val = Value::des(&buf); + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec).unwrap(); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } + } + println!(" key {} at {}: {}", key, lsn, desc); + } + } + + Ok(()) + } +} + +impl InMemoryLayer { + /// + /// Get layer size on the disk + /// + pub fn size(&self) -> Result { + let inner = self.inner.read().unwrap(); + Ok(inner.file.size) + } + + /// + /// Create a new, empty, in-memory layer + /// + pub fn create( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + start_lsn: Lsn, + ) -> Result { + trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); + + let file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + + Ok(InMemoryLayer { + conf, + timeline_id, + tenant_id, + start_lsn, + inner: RwLock::new(InMemoryLayerInner { + end_lsn: None, + index: HashMap::new(), + file, + }), + }) + } + + // Write operations + + /// Common subroutine of the public put_wal_record() and put_page_image() functions. + /// Adds the page version to the in-memory tree + pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); + let mut inner = self.inner.write().unwrap(); + inner.assert_writeable(); + + let off = { + SER_BUFFER.with(|x| -> Result<_> { + let mut buf = x.borrow_mut(); + buf.clear(); + val.ser_into(&mut (*buf))?; + let off = inner.file.write_blob(&buf)?; + Ok(off) + })? + }; + + let vec_map = inner.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + if old.is_some() { + // We already had an entry for this LSN. That's odd.. + warn!("Key {} at {} already exists", key, lsn); + } + + Ok(()) + } + + pub fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + // TODO: Currently, we just leak the storage for any deleted keys + + Ok(()) + } + + /// Make the layer non-writeable. Only call once. + /// Records the end_lsn for non-dropped layers. + /// `end_lsn` is exclusive + pub fn freeze(&self, end_lsn: Lsn) { + let mut inner = self.inner.write().unwrap(); + + assert!(self.start_lsn < end_lsn); + inner.end_lsn = Some(end_lsn); + + for vec_map in inner.index.values() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } + } + } + + /// Write this frozen in-memory layer to disk. + /// + /// Returns a new delta layer with all the same data as this in-memory layer + pub fn write_to_disk(&self) -> Result { + // Grab the lock in read-mode. 
We hold it over the I/O, but because this + // layer is not writeable anymore, no one should be trying to acquire the + // write lock on it, so we shouldn't block anyone. There's one exception + // though: another thread might have grabbed a reference to this layer + // in `get_layer_for_write' just before the checkpointer called + // `freeze`, and then `write_to_disk` on it. When the thread gets the + // lock, it will see that it's not writeable anymore and retry, but it + // would have to wait until we release it. That race condition is very + // rare though, so we just accept the potential latency hit for now. + let inner = self.inner.read().unwrap(); + + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + Key::MIN, + self.start_lsn..inner.end_lsn.unwrap(), + )?; + + let mut buf = Vec::new(); + + let mut cursor = inner.file.block_cursor(); + + let mut keys: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + keys.sort_by_key(|k| k.0); + + for (key, vec_map) in keys.iter() { + let key = **key; + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf)?; + let will_init = Value::des(&buf)?.will_init(); + delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; + } + } + + let delta_layer = delta_layer_writer.finish(Key::MAX)?; + Ok(delta_layer) + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs new file mode 100644 index 0000000000..9d914c1839 --- /dev/null +++ b/pageserver/src/tenant/layer_map.rs @@ -0,0 +1,588 @@ +//! +//! The layer map tracks what layers exist in a timeline. +//! +//! When the timeline is first accessed, the server lists of all layer files +//! in the timelines/ directory, and populates this map with +//! ImageLayer and DeltaLayer structs corresponding to each file. When the first +//! new WAL record is received, we create an InMemoryLayer to hold the incoming +//! records. Now and then, in the checkpoint() function, the in-memory layer is +//! are frozen, and it is split up into new image and delta layers and the +//! corresponding files are written to disk. +//! + +use crate::metrics::NUM_ONDISK_LAYERS; +use crate::repository::Key; +use crate::tenant::inmemory_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; +use crate::tenant::storage_layer::{range_eq, range_overlaps}; +use amplify_num::i256; +use anyhow::Result; +use num_traits::identities::{One, Zero}; +use num_traits::{Bounded, Num, Signed}; +use rstar::{RTree, RTreeObject, AABB}; +use std::cmp::Ordering; +use std::collections::VecDeque; +use std::ops::Range; +use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; +use std::sync::Arc; +use tracing::*; +use utils::lsn::Lsn; + +/// +/// LayerMap tracks what layers exist on a timeline. +/// +#[derive(Default)] +pub struct LayerMap { + // + // 'open_layer' holds the current InMemoryLayer that is accepting new + // records. If it is None, 'next_open_layer_at' will be set instead, indicating + // where the start LSN of the next InMemoryLayer that is to be created. + // + pub open_layer: Option>, + pub next_open_layer_at: Option, + + /// + /// Frozen layers, if any. Frozen layers are in-memory layers that + /// are no longer added to, but haven't been written out to disk + /// yet. They contain WAL older than the current 'open_layer' or + /// 'next_open_layer_at', but newer than any historic layer. 
+ /// The frozen layers are in order from oldest to newest, so that + /// the newest one is in the 'back' of the VecDeque, and the oldest + /// in the 'front'. + /// + pub frozen_layers: VecDeque>, + + /// All the historic layers are kept here + historic_layers: RTree, + + /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. + /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. + l0_delta_layers: Vec>, +} + +struct LayerRTreeObject { + layer: Arc, + + envelope: AABB<[IntKey; 2]>, +} + +// Representation of Key as numeric type. +// We can not use native implementation of i128, because rstar::RTree +// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). +// Overflow will cause panic in debug mode and incorrect area calculation in release mode, +// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). +// By using i256 as the type, even though all the actual values would fit in i128, we can be +// sure that multiplication doesn't overflow. +// + +#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] +struct IntKey(i256); + +impl Copy for IntKey {} + +impl IntKey { + fn from(i: i128) -> Self { + IntKey(i256::from(i)) + } +} + +impl Bounded for IntKey { + fn min_value() -> Self { + IntKey(i256::MIN) + } + fn max_value() -> Self { + IntKey(i256::MAX) + } +} + +impl Signed for IntKey { + fn is_positive(&self) -> bool { + self.0 > i256::ZERO + } + fn is_negative(&self) -> bool { + self.0 < i256::ZERO + } + fn signum(&self) -> Self { + match self.0.cmp(&i256::ZERO) { + Ordering::Greater => IntKey(i256::ONE), + Ordering::Less => IntKey(-i256::ONE), + Ordering::Equal => IntKey(i256::ZERO), + } + } + fn abs(&self) -> Self { + IntKey(self.0.abs()) + } + fn abs_sub(&self, other: &Self) -> Self { + if self.0 <= other.0 { + IntKey(i256::ZERO) + } else { + IntKey(self.0 - other.0) + } + } +} + +impl Neg for IntKey { + type Output = Self; + fn neg(self) -> Self::Output { + IntKey(-self.0) + } +} + +impl Rem for IntKey { + type Output = Self; + fn rem(self, rhs: Self) -> Self::Output { + IntKey(self.0 % rhs.0) + } +} + +impl Div for IntKey { + type Output = Self; + fn div(self, rhs: Self) -> Self::Output { + IntKey(self.0 / rhs.0) + } +} + +impl Add for IntKey { + type Output = Self; + fn add(self, rhs: Self) -> Self::Output { + IntKey(self.0 + rhs.0) + } +} + +impl Sub for IntKey { + type Output = Self; + fn sub(self, rhs: Self) -> Self::Output { + IntKey(self.0 - rhs.0) + } +} + +impl Mul for IntKey { + type Output = Self; + fn mul(self, rhs: Self) -> Self::Output { + IntKey(self.0 * rhs.0) + } +} + +impl One for IntKey { + fn one() -> Self { + IntKey(i256::ONE) + } +} + +impl Zero for IntKey { + fn zero() -> Self { + IntKey(i256::ZERO) + } + fn is_zero(&self) -> bool { + self.0 == i256::ZERO + } +} + +impl Num for IntKey { + type FromStrRadixErr = ::FromStrRadixErr; + fn from_str_radix(str: &str, radix: u32) -> Result { + Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) + } +} + +impl PartialEq for LayerRTreeObject { + fn eq(&self, other: &Self) -> bool { + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. 
+ #[allow(clippy::vtable_address_comparisons)] + Arc::ptr_eq(&self.layer, &other.layer) + } +} + +impl RTreeObject for LayerRTreeObject { + type Envelope = AABB<[IntKey; 2]>; + fn envelope(&self) -> Self::Envelope { + self.envelope + } +} + +impl LayerRTreeObject { + fn new(layer: Arc) -> Self { + let key_range = layer.get_key_range(); + let lsn_range = layer.get_lsn_range(); + + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive + ); + LayerRTreeObject { layer, envelope } + } +} + +/// Return value of LayerMap::search +pub struct SearchResult { + pub layer: Arc, + pub lsn_floor: Lsn, +} + +impl LayerMap { + /// + /// Find the latest layer that covers the given 'key', with lsn < + /// 'end_lsn'. + /// + /// Returns the layer, if any, and an 'lsn_floor' value that + /// indicates which portion of the layer the caller should + /// check. 'lsn_floor' is normally the start-LSN of the layer, but + /// can be greater if there is an overlapping layer that might + /// contain the version, even if it's missing from the returned + /// layer. + /// + pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { + // linear search + // Find the latest image layer that covers the given key + let mut latest_img: Option> = None; + let mut latest_img_lsn: Option = None; + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0i128)], + [ + IntKey::from(key.to_i128()), + IntKey::from(end_lsn.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + if l.is_incremental() { + continue; + } + assert!(l.get_key_range().contains(&key)); + let img_lsn = l.get_lsn_range().start; + assert!(img_lsn < end_lsn); + if Lsn(img_lsn.0 + 1) == end_lsn { + // found exact match + return Ok(Some(SearchResult { + layer: Arc::clone(l), + lsn_floor: img_lsn, + })); + } + if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { + latest_img = Some(Arc::clone(l)); + latest_img_lsn = Some(img_lsn); + } + } + + // Search the delta layers + let mut latest_delta: Option> = None; + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + if !l.is_incremental() { + continue; + } + assert!(l.get_key_range().contains(&key)); + if l.get_lsn_range().start >= end_lsn { + info!( + "Candidate delta layer {}..{} is too new for lsn {}", + l.get_lsn_range().start, + l.get_lsn_range().end, + end_lsn + ); + } + assert!(l.get_lsn_range().start < end_lsn); + if l.get_lsn_range().end >= end_lsn { + // this layer contains the requested point in the key/lsn space. + // No need to search any further + trace!( + "found layer {} for request on {key} at {end_lsn}", + l.filename().display(), + ); + latest_delta.replace(Arc::clone(l)); + break; + } + // this layer's end LSN is smaller than the requested point. If there's + // nothing newer, this is what we need to return. Remember this. 
+ if let Some(old_candidate) = &latest_delta { + if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + latest_delta.replace(Arc::clone(l)); + } + } else { + latest_delta.replace(Arc::clone(l)); + } + } + if let Some(l) = latest_delta { + trace!( + "found (old) layer {} for request on {key} at {end_lsn}", + l.filename().display(), + ); + let lsn_floor = std::cmp::max( + Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), + l.get_lsn_range().start, + ); + Ok(Some(SearchResult { + lsn_floor, + layer: l, + })) + } else if let Some(l) = latest_img { + trace!("found img layer and no deltas for request on {key} at {end_lsn}"); + Ok(Some(SearchResult { + lsn_floor: latest_img_lsn.unwrap(), + layer: l, + })) + } else { + trace!("no layer found for request on {key} at {end_lsn}"); + Ok(None) + } + } + + /// + /// Insert an on-disk layer + /// + pub fn insert_historic(&mut self, layer: Arc) { + if layer.get_key_range() == (Key::MIN..Key::MAX) { + self.l0_delta_layers.push(layer.clone()); + } + self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); + } + + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. + /// + pub fn remove_historic(&mut self, layer: Arc) { + if layer.get_key_range() == (Key::MIN..Key::MAX) { + let len_before = self.l0_delta_layers.len(); + + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + self.l0_delta_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + assert_eq!(self.l0_delta_layers.len(), len_before - 1); + } + assert!(self + .historic_layers + .remove(&LayerRTreeObject::new(layer)) + .is_some()); + NUM_ONDISK_LAYERS.dec(); + } + + /// Is there a newer image layer for given key- and LSN-range? + /// + /// This is used for garbage collection, to determine if an old layer can + /// be deleted. + pub fn image_layer_exists( + &self, + key_range: &Range, + lsn_range: &Range, + ) -> Result { + let mut range_remain = key_range.clone(); + + loop { + let mut made_progress = false; + let envelope = AABB::from_corners( + [ + IntKey::from(range_remain.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(range_remain.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + if l.is_incremental() { + continue; + } + let img_lsn = l.get_lsn_range().start; + if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { + made_progress = true; + let img_key_end = l.get_key_range().end; + + if img_key_end >= range_remain.end { + return Ok(true); + } + range_remain.start = img_key_end; + } + } + + if !made_progress { + return Ok(false); + } + } + } + + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + self.historic_layers.iter().map(|e| e.layer.clone()) + } + + /// Find the last image layer that covers 'key', ignoring any image layers + /// newer than 'lsn'. 
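+    /// Illustrative only (assumes a populated map): if a key is covered by
+    /// image layers taken at LSNs 0x10 and 0x30, `find_latest_image(key, Lsn(0x20))`
+    /// returns the 0x10 layer, while `find_latest_image(key, Lsn(0x40))`
+    /// returns the 0x30 one.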
+ fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { + let mut candidate_lsn = Lsn(0); + let mut candidate = None; + let envelope = AABB::from_corners( + [IntKey::from(key.to_i128()), IntKey::from(0)], + [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + if l.is_incremental() { + continue; + } + + assert!(l.get_key_range().contains(&key)); + let this_lsn = l.get_lsn_range().start; + assert!(this_lsn <= lsn); + if this_lsn < candidate_lsn { + // our previous candidate was better + continue; + } + candidate_lsn = this_lsn; + candidate = Some(Arc::clone(l)); + } + + candidate + } + + /// + /// Divide the whole given range of keys into sub-ranges based on the latest + /// image layer that covers each range. (This is used when creating new + /// image layers) + /// + // FIXME: clippy complains that the result type is very complex. She's probably + // right... + #[allow(clippy::type_complexity)] + pub fn image_coverage( + &self, + key_range: &Range, + lsn: Lsn, + ) -> Result, Option>)>> { + let mut points = vec![key_range.start]; + let envelope = AABB::from_corners( + [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], + [ + IntKey::from(key_range.end.to_i128()), + IntKey::from(lsn.0 as i128), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + assert!(l.get_lsn_range().start <= lsn); + let range = l.get_key_range(); + if key_range.contains(&range.start) { + points.push(l.get_key_range().start); + } + if key_range.contains(&range.end) { + points.push(l.get_key_range().end); + } + } + points.push(key_range.end); + + points.sort(); + points.dedup(); + + // Ok, we now have a list of "interesting" points in the key space + + // For each range between the points, find the latest image + let mut start = *points.first().unwrap(); + let mut ranges = Vec::new(); + for end in points[1..].iter() { + let img = self.find_latest_image(start, lsn); + + ranges.push((start..*end, img)); + + start = *end; + } + Ok(ranges) + } + + /// Count how many L1 delta layers there are that overlap with the + /// given key and LSN range. + pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { + let mut result = 0; + if lsn_range.start >= lsn_range.end { + return Ok(0); + } + let envelope = AABB::from_corners( + [ + IntKey::from(key_range.start.to_i128()), + IntKey::from(lsn_range.start.0 as i128), + ], + [ + IntKey::from(key_range.end.to_i128() - 1), + IntKey::from(lsn_range.end.0 as i128 - 1), + ], + ); + for e in self + .historic_layers + .locate_in_envelope_intersecting(&envelope) + { + let l = &e.layer; + if !l.is_incremental() { + continue; + } + assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); + assert!(range_overlaps(&l.get_key_range(), key_range)); + + // We ignore level0 delta layers. 
Unless the whole keyspace fits + // into one partition + if !range_eq(key_range, &(Key::MIN..Key::MAX)) + && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) + { + continue; + } + + result += 1; + } + Ok(result) + } + + /// Return all L0 delta layers + pub fn get_level0_deltas(&self) -> Result>> { + Ok(self.l0_delta_layers.clone()) + } + + /// debugging function to print out the contents of the layer map + #[allow(unused)] + pub fn dump(&self, verbose: bool) -> Result<()> { + println!("Begin dump LayerMap"); + + println!("open_layer:"); + if let Some(open_layer) = &self.open_layer { + open_layer.dump(verbose)?; + } + + println!("frozen_layers:"); + for frozen_layer in self.frozen_layers.iter() { + frozen_layer.dump(verbose)?; + } + + println!("historic_layers:"); + for e in self.historic_layers.iter() { + e.layer.dump(verbose)?; + } + println!("End dump LayerMap"); + Ok(()) + } +} diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs new file mode 100644 index 0000000000..3fb9ccd936 --- /dev/null +++ b/pageserver/src/tenant/metadata.rs @@ -0,0 +1,365 @@ +//! Every image of a certain timeline from [`crate::tenant::Tenant`] +//! has a metadata that needs to be stored persistently. +//! +//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of +//! external storage import and export operations. +//! +//! The module contains all structs and related helper methods related to timeline metadata. + +use std::fs::{File, OpenOptions}; +use std::io::Write; + +use anyhow::{bail, ensure, Context}; +use serde::{Deserialize, Serialize}; +use tracing::info_span; +use utils::{ + bin_ser::BeSer, + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use crate::config::PageServerConf; +use crate::virtual_file::VirtualFile; + +/// Use special format number to enable backward compatibility. +const METADATA_FORMAT_VERSION: u16 = 4; + +/// Previous supported format versions. +const METADATA_OLD_FORMAT_VERSION: u16 = 3; + +/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. +/// +/// This is the same assumption that PostgreSQL makes with the control file, +/// see PG_CONTROL_MAX_SAFE_SIZE +const METADATA_MAX_SIZE: usize = 512; + +/// Metadata stored on disk for each timeline +/// +/// The fields correspond to the values we hold in memory, in Timeline. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TimelineMetadata { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV2, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataHeader { + checksum: u32, // CRC of serialized metadata body + size: u16, // size of serialized metadata + format_version: u16, // metadata format version (used for compatibility checks) +} +const METADATA_HDR_SIZE: usize = std::mem::size_of::(); + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV2 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. 
This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, + pg_version: u32, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn, + // This is only set if we know it. We track it in memory when the page + // server is running, but we only track the value corresponding to + // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a + // lot. We only store it in the metadata file when we flush *all* the + // in-memory data so that 'last_record_lsn' is the same as + // 'disk_consistent_lsn'. That's OK, because after page server restart, as + // soon as we reprocess at least one record, we will have a valid + // 'prev_record_lsn' value in memory again. This is only really needed when + // doing a clean shutdown, so that there is no more WAL beyond + // 'disk_consistent_lsn' + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, +} + +impl TimelineMetadata { + pub fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + ancestor_timeline: Option, + ancestor_lsn: Lsn, + latest_gc_cutoff_lsn: Lsn, + initdb_lsn: Lsn, + pg_version: u32, + ) -> Self { + Self { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV2 { + disk_consistent_lsn, + prev_record_lsn, + ancestor_timeline, + ancestor_lsn, + latest_gc_cutoff_lsn, + initdb_lsn, + pg_version, + }, + } + } + + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { + let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + // backward compatible only up to this version + ensure!( + hdr.format_version == METADATA_OLD_FORMAT_VERSION, + "unsupported metadata format version {}", + hdr.format_version + ); + + let metadata_size = hdr.size as usize; + + let body: TimelineMetadataBodyV1 = + TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + + let body = TimelineMetadataBodyV2 { + disk_consistent_lsn: body.disk_consistent_lsn, + prev_record_lsn: body.prev_record_lsn, + ancestor_timeline: body.ancestor_timeline, + ancestor_lsn: body.ancestor_lsn, + latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, + initdb_lsn: body.initdb_lsn, + pg_version: 14, // All timelines created before this version had pg_version 14 + }; + + hdr.format_version = METADATA_FORMAT_VERSION; + + Ok(Self { hdr, body }) + } + + pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result { + ensure!( + metadata_bytes.len() == METADATA_MAX_SIZE, + "metadata bytes size is wrong" + ); + let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; + + let metadata_size = hdr.size as usize; + ensure!( + metadata_size <= METADATA_MAX_SIZE, + "corrupted metadata file" + ); + let calculated_checksum = crc32c::crc32c(&metadata_bytes[METADATA_HDR_SIZE..metadata_size]); + ensure!( + hdr.checksum == calculated_checksum, + "metadata checksum mismatch" + ); + + if hdr.format_version != METADATA_FORMAT_VERSION { + // If metadata has the old format, + // upgrade it and return the result + TimelineMetadata::upgrade_timeline_metadata(metadata_bytes) + } else { + let body = + TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?; + ensure!( + 
body.disk_consistent_lsn.is_aligned(), + "disk_consistent_lsn is not aligned" + ); + Ok(TimelineMetadata { hdr, body }) + } + } + + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + + /// [`Lsn`] that corresponds to the corresponding timeline directory + /// contents, stored locally in the pageserver workdir. + pub fn disk_consistent_lsn(&self) -> Lsn { + self.body.disk_consistent_lsn + } + + pub fn prev_record_lsn(&self) -> Option { + self.body.prev_record_lsn + } + + pub fn ancestor_timeline(&self) -> Option { + self.body.ancestor_timeline + } + + pub fn ancestor_lsn(&self) -> Lsn { + self.body.ancestor_lsn + } + + pub fn latest_gc_cutoff_lsn(&self) -> Lsn { + self.body.latest_gc_cutoff_lsn + } + + pub fn initdb_lsn(&self) -> Lsn { + self.body.initdb_lsn + } + + pub fn pg_version(&self) -> u32 { + self.body.pg_version + } +} + +/// Save timeline metadata to file +pub fn save_metadata( + conf: &'static PageServerConf, + timeline_id: TimelineId, + tenant_id: TenantId, + data: &TimelineMetadata, + first_save: bool, +) -> anyhow::Result<()> { + let _enter = info_span!("saving metadata").entered(); + let path = conf.metadata_path(timeline_id, tenant_id); + // use OpenOptions to ensure file presence is consistent with first_save + let mut file = VirtualFile::open_with_options( + &path, + OpenOptions::new().write(true).create_new(first_save), + )?; + + let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?; + + if file.write(&metadata_bytes)? != metadata_bytes.len() { + bail!("Could not write all the metadata bytes in a single call"); + } + file.sync_all()?; + + // fsync the parent directory to ensure the directory entry is durable + if first_save { + let timeline_dir = File::open( + &path + .parent() + .expect("Metadata should always have a parent dir"), + )?; + timeline_dir.sync_all()?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::TIMELINE_ID; + + #[test] + fn metadata_serializes_correctly() { + let original_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + // Any version will do here, so use the default + crate::DEFAULT_PG_VERSION, + ); + + let metadata_bytes = original_metadata + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + assert_eq!( + deserialized_metadata.body, original_metadata.body, + "Metadata that was serialized to bytes and deserialized back should not change" + ); + } + + // Generate old version metadata and read it with current code. 
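+ // (For orientation: both versions share the on-disk framing that to_bytes()/from_bytes()
+ // above implement: a fixed-size TimelineMetadataHeader (checksum: u32 CRC32C of the
+ // serialized body, size: u16 covering header plus body, format_version: u16), followed by
+ // the serialized body and zero padding up to METADATA_MAX_SIZE = 512 bytes, so a single
+ // write can be assumed atomic. Only the body layout and the format_version differ
+ // between V1 and V2.)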
+ // Ensure that it is upgraded correctly + #[test] + fn test_metadata_upgrade() { + #[derive(Debug, Clone, PartialEq, Eq)] + struct TimelineMetadataV1 { + hdr: TimelineMetadataHeader, + body: TimelineMetadataBodyV1, + } + + let metadata_v1 = TimelineMetadataV1 { + hdr: TimelineMetadataHeader { + checksum: 0, + size: 0, + format_version: METADATA_OLD_FORMAT_VERSION, + }, + body: TimelineMetadataBodyV1 { + disk_consistent_lsn: Lsn(0x200), + prev_record_lsn: Some(Lsn(0x100)), + ancestor_timeline: Some(TIMELINE_ID), + ancestor_lsn: Lsn(0), + latest_gc_cutoff_lsn: Lsn(0), + initdb_lsn: Lsn(0), + }, + }; + + impl TimelineMetadataV1 { + pub fn to_bytes(&self) -> anyhow::Result> { + let body_bytes = self.body.ser()?; + let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); + let hdr = TimelineMetadataHeader { + size: metadata_size as u16, + format_version: METADATA_OLD_FORMAT_VERSION, + checksum: crc32c::crc32c(&body_bytes), + }; + let hdr_bytes = hdr.ser()?; + let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE]; + metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes); + metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes); + Ok(metadata_bytes) + } + } + + let metadata_bytes = metadata_v1 + .to_bytes() + .expect("Should serialize correct metadata to bytes"); + + // This should deserialize to the latest version format + let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes) + .expect("Should deserialize its own bytes"); + + let expected_metadata = TimelineMetadata::new( + Lsn(0x200), + Some(Lsn(0x100)), + Some(TIMELINE_ID), + Lsn(0), + Lsn(0), + Lsn(0), + 14, // All timelines created before this version had pg_version 14 + ); + + assert_eq!( + deserialized_metadata.body, expected_metadata.body, + "Metadata of the old version {} should be upgraded to the latest version {}", + METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + ); + } +} diff --git a/pageserver/src/layered_repository/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs similarity index 100% rename from pageserver/src/layered_repository/par_fsync.rs rename to pageserver/src/tenant/par_fsync.rs diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs new file mode 100644 index 0000000000..86e685fd4c --- /dev/null +++ b/pageserver/src/tenant/size.rs @@ -0,0 +1,475 @@ +use std::cmp; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use anyhow::Context; +use tokio::sync::Semaphore; + +use super::Tenant; +use utils::id::TimelineId; +use utils::lsn::Lsn; + +use tracing::*; + +/// Inputs to the actual tenant sizing model +/// +/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to +/// be a transferrable format between execution environments and developer. +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct ModelInputs { + updates: Vec, + retention_period: u64, + #[serde_as(as = "HashMap")] + timeline_inputs: HashMap, +} + +/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as +/// part of [`ModelInputs`] from the HTTP api, explaining the inputs. 
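+/// In the serialized form each timeline appears keyed by its id, with the LSNs rendered
+/// as strings, e.g. `{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0",
+/// "horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"}`,
+/// as in the `verify_size_for_multiple_branches` fixture below.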
+#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + last_record: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + latest_gc_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + horizon_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + pitr_cutoff: Lsn, + #[serde_as(as = "serde_with::DisplayFromStr")] + next_gc_cutoff: Lsn, +} + +/// Gathers the inputs for the tenant sizing model. +/// +/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which +/// is updated on-demand, during the start of this calculation and separate from the +/// [`Timeline::latest_gc_cutoff`]. +/// +/// For timelines in general: +/// +/// ```ignore +/// 0-----|---------|----|------------| · · · · · |·> lsn +/// initdb_lsn branchpoints* next_gc_cutoff latest +/// ``` +/// +/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the +/// tenant size will be zero. +pub(super) async fn gather_inputs( + tenant: &Tenant, + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, +) -> anyhow::Result { + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); + + let timelines = tenant + .refresh_gc_info() + .context("Failed to refresh gc_info before gathering inputs")?; + + if timelines.is_empty() { + // All timelines are below tenant's gc_horizon; alternative would be to use + // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly + // missing GcInfo::retain_lsns or having obsolete values for cutoff's. + return Ok(ModelInputs { + updates: vec![], + retention_period: 0, + timeline_inputs: HashMap::new(), + }); + } + + // record the used/inserted cache keys here, to remove extras not to start leaking + // after initial run the cache should be quite stable, but live timelines will eventually + // require new lsns to be inspected. + let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + + let mut updates = Vec::new(); + + // record the per timline values used to determine `retention_period` + let mut timeline_inputs = HashMap::with_capacity(timelines.len()); + + // used to determine the `retention_period` for the size model + let mut max_cutoff_distance = None; + + // this will probably conflict with on-demand downloaded layers, or at least force them all + // to be downloaded + for timeline in timelines { + let last_record_lsn = timeline.get_last_record_lsn(); + + let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + // the minimum where we should find the next_gc_cutoff for our calculations. + // + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. 
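+ // (Illustration, borrowing LSNs from the test fixture below: a branch created at
+ // 0/176FA40 whose next_gc_cutoff has advanced to 0/1819818 gets that cutoff recorded
+ // as an extra GcCutOff point, because it lies past max(ancestor_lsn, initdb_lsn);
+ // a cutoff at or below that minimum is skipped.)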
+ let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); + + let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { + Some((next_gc_cutoff, LsnKind::GcCutOff)) + } else { + None + }; + + // this assumes there are no other lsns than the branchpoints + let lsns = gc_info + .retain_lsns + .iter() + .inspect(|&&lsn| { + trace!( + timeline_id=%timeline.timeline_id, + "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", + lsn < timeline.get_ancestor_lsn() + ) + }) + .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) + .copied() + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .chain(maybe_cutoff) + .collect::>(); + + ( + lsns, + gc_info.horizon_cutoff, + gc_info.pitr_cutoff, + next_gc_cutoff, + ) + }; + + // update this to have a retention_period later for the tenant_size_model + // tenant_size_model compares this to the last segments start_lsn + if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { + match max_cutoff_distance.as_mut() { + Some(max) => { + *max = std::cmp::max(*max, cutoff_distance); + } + _ => { + max_cutoff_distance = Some(cutoff_distance); + } + } + } + + // all timelines branch from something, because it might be impossible to pinpoint + // which is the tenant_size_model's "default" branch. + updates.push(Update { + lsn: timeline.get_ancestor_lsn(), + command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), + timeline_id: timeline.timeline_id, + }); + + for (lsn, _kind) in &interesting_lsns { + if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { + updates.push(Update { + lsn: *lsn, + timeline_id: timeline.timeline_id, + command: Command::Update(*size), + }); + + needed_cache.insert((timeline.timeline_id, *lsn)); + } else { + let timeline = Arc::clone(&timeline); + let parallel_size_calcs = Arc::clone(limit); + joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + } + } + + timeline_inputs.insert( + timeline.timeline_id, + TimelineInputs { + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff, + pitr_cutoff, + next_gc_cutoff, + }, + ); + } + + let mut have_any_error = false; + + while let Some(res) = joinset.join_next().await { + // each of these come with Result, JoinError> + // because of spawn + spawn_blocking + let res = res.and_then(|inner| inner); + match res { + Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => { + debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); + + logical_size_cache.insert((timeline.timeline_id, lsn), size); + needed_cache.insert((timeline.timeline_id, lsn)); + + updates.push(Update { + lsn, + timeline_id: timeline.timeline_id, + command: Command::Update(size), + }); + } + Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => { + warn!( + timeline_id=%timeline.timeline_id, + "failed to calculate logical size at {lsn}: {error:#}" + ); + have_any_error = true; + } + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures, nor should be"); + } + Err(join_error) => { + // cannot really do anything, as this panic is likely a bug + error!("logical size query panicked: {join_error:#}"); + have_any_error = true; + } + } + } + + // prune any keys not needed anymore; we record every used key and added key. 
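+ // (A key survives only if this round either reused its cached value or freshly
+ // computed one; entries for e.g. branchpoints that have since been garbage collected
+ // away, or for timelines no longer returned by refresh_gc_info, drop out here.)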
+ logical_size_cache.retain(|key, _| needed_cache.contains(key)); + + if have_any_error { + // we cannot complete this round, because we are missing data. + // we have however cached all we were able to request calculation on. + anyhow::bail!("failed to calculate some logical_sizes"); + } + + // the data gathered to updates is per lsn, regardless of the branch, so we can use it to + // our advantage, not requiring a sorted container or graph walk. + // + // for branch points, which come as multiple updates at the same LSN, the Command::Update + // is needed before a branch is made out of that branch Command::BranchFrom. this is + // handled by the variant order in `Command`. + updates.sort_unstable(); + + let retention_period = match max_cutoff_distance { + Some(max) => max.0, + None => { + anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + } + }; + + Ok(ModelInputs { + updates, + retention_period, + timeline_inputs, + }) +} + +impl ModelInputs { + pub fn calculate(&self) -> anyhow::Result { + // Option is used for "naming" the branches because it is assumed to be + // impossible to always determine the a one main branch. + let mut storage = tenant_size_model::Storage::>::new(None); + + // tracking these not to require modifying the current implementation of the size model, + // which works in relative LSNs and sizes. + let mut last_state: HashMap = HashMap::new(); + + for update in &self.updates { + let Update { + lsn, + command: op, + timeline_id, + } = update; + match op { + Command::Update(sz) => { + let latest = last_state.get_mut(timeline_id).ok_or_else(|| { + anyhow::anyhow!( + "ordering-mismatch: there must had been a previous state for {timeline_id}" + ) + })?; + + let lsn_bytes = { + let Lsn(now) = lsn; + let Lsn(prev) = latest.0; + debug_assert!(prev <= *now, "self.updates should had been sorted"); + now - prev + }; + + let size_diff = + i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| { + format!("size difference i64 overflow for {timeline_id}") + })?; + + storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff); + *latest = (*lsn, *sz); + } + Command::BranchFrom(parent) => { + storage.branch(parent, Some(*timeline_id)); + + let size = parent + .as_ref() + .and_then(|id| last_state.get(id)) + .map(|x| x.1) + .unwrap_or(0); + last_state.insert(*timeline_id, (*lsn, size)); + } + } + } + + Ok(storage.calculate(self.retention_period).total_children()) + } +} + +/// Single size model update. +/// +/// Sizing model works with relative increments over latest branch state. +/// Updates are absolute, so additional state needs to be tracked when applying. 
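+/// For example (hypothetical numbers): an `Update(12_500)` at `Lsn(0x180)` following an
+/// `Update(10_000)` at `Lsn(0x100)` on the same timeline is applied by
+/// `ModelInputs::calculate` above as a segment of `0x80` LSN-bytes with a size delta of
+/// `+2_500`.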
+#[serde_with::serde_as] +#[derive( + Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize, +)] +struct Update { + #[serde_as(as = "serde_with::DisplayFromStr")] + lsn: utils::lsn::Lsn, + command: Command, + #[serde_as(as = "serde_with::DisplayFromStr")] + timeline_id: TimelineId, +} + +#[serde_with::serde_as] +#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +enum Command { + Update(u64), + BranchFrom(#[serde_as(as = "Option")] Option), +} + +impl std::fmt::Debug for Command { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3 + // linebreaks + match self { + Self::Update(arg0) => write!(f, "Update({arg0})"), + Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"), + } + } +} + +#[derive(Debug, Clone, Copy)] +enum LsnKind { + BranchPoint, + GcCutOff, +} + +/// Newtype around the tuple that carries the timeline at lsn logical size calculation. +struct TimelineAtLsnSizeResult( + Arc, + utils::lsn::Lsn, + anyhow::Result, +); + +#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] +async fn calculate_logical_size( + limit: Arc, + timeline: Arc, + lsn: utils::lsn::Lsn, +) -> Result { + let permit = tokio::sync::Semaphore::acquire_owned(limit) + .await + .expect("global semaphore should not had been closed"); + + tokio::task::spawn_blocking(move || { + let _permit = permit; + let size_res = timeline.calculate_logical_size(lsn); + TimelineAtLsnSizeResult(timeline, lsn, size_res) + }) + .await +} + +#[test] +fn updates_sort() { + use std::str::FromStr; + use utils::id::TimelineId; + use utils::lsn::Lsn; + + let ids = [ + TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), + TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), + TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), + ]; + + // try through all permutations + let ids = [ + [&ids[0], &ids[1], &ids[2]], + [&ids[0], &ids[2], &ids[1]], + [&ids[1], &ids[0], &ids[2]], + [&ids[1], &ids[2], &ids[0]], + [&ids[2], &ids[0], &ids[1]], + [&ids[2], &ids[1], &ids[0]], + ]; + + for ids in ids { + // apply a fixture which uses a permutation of ids + let commands = [ + Update { + lsn: Lsn(0), + command: Command::BranchFrom(None), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/67E7618").unwrap(), + command: Command::Update(43696128), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/67E7618").unwrap(), + command: Command::BranchFrom(Some(*ids[0])), + timeline_id: *ids[1], + }, + Update { + lsn: Lsn::from_str("0/76BE4F0").unwrap(), + command: Command::Update(41844736), + timeline_id: *ids[1], + }, + Update { + lsn: Lsn::from_str("0/10E49380").unwrap(), + command: Command::Update(42164224), + timeline_id: *ids[0], + }, + Update { + lsn: Lsn::from_str("0/10E49380").unwrap(), + command: Command::BranchFrom(Some(*ids[0])), + timeline_id: *ids[2], + }, + Update { + lsn: Lsn::from_str("0/11D74910").unwrap(), + command: Command::Update(42172416), + timeline_id: *ids[2], + }, + Update { + lsn: Lsn::from_str("0/12051E98").unwrap(), + command: Command::Update(42196992), + timeline_id: *ids[0], + }, + ]; + + let mut sorted = commands; + + // these must sort in the same order, regardless of how the ids sort + // which is why the timeline_id is the last field + sorted.sort_unstable(); + + assert_eq!(commands, sorted, "{:#?} vs. 
{:#?}", commands, sorted); + } +} + +#[test] +fn verify_size_for_multiple_branches() { + // this is generated from integration test test_tenant_size_with_multiple_branches, but this way + // it has the stable lsn's + let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#; + + let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); + + assert_eq!(inputs.calculate().unwrap(), 36_409_872); +} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs new file mode 100644 index 0000000000..8dafcab124 --- /dev/null +++ b/pageserver/src/tenant/storage_layer.rs @@ -0,0 +1,153 @@ +//! +//! Common traits and structs for layers +//! + +use crate::repository::{Key, Value}; +use crate::walrecord::NeonWalRecord; +use anyhow::Result; +use bytes::Bytes; +use std::ops::Range; +use std::path::PathBuf; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +pub fn range_overlaps(a: &Range, b: &Range) -> bool +where + T: PartialOrd, +{ + if a.start < b.start { + a.end > b.start + } else { + b.end > a.start + } +} + +pub fn range_eq(a: &Range, b: &Range) -> bool +where + T: PartialEq, +{ + a.start == b.start && a.end == b.end +} + +/// Struct used to communicate across calls to 'get_value_reconstruct_data'. +/// +/// Before first call, you can fill in 'page_img' if you have an older cached +/// version of the page available. That can save work in +/// 'get_value_reconstruct_data', as it can stop searching for page versions +/// when all the WAL records going back to the cached image have been collected. +/// +/// When get_value_reconstruct_data returns Complete, 'img' is set to an image +/// of the page, or the oldest WAL record in 'records' is a will_init-type +/// record that initializes the page without requiring a previous image. +/// +/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have +/// been collected, but there are more records outside the current layer. Pass +/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' +/// call, to collect more records. 
+/// +#[derive(Debug)] +pub struct ValueReconstructState { + pub records: Vec<(Lsn, NeonWalRecord)>, + pub img: Option<(Lsn, Bytes)>, +} + +/// Return value from Layer::get_page_reconstruct_data +#[derive(Clone, Copy, Debug)] +pub enum ValueReconstructResult { + /// Got all the data needed to reconstruct the requested page + Complete, + /// This layer didn't contain all the required data, the caller should look up + /// the predecessor layer at the returned LSN and collect more data from there. + Continue, + + /// This layer didn't contain data needed to reconstruct the page version at + /// the returned LSN. This is usually considered an error, but might be OK + /// in some circumstances. + Missing, +} + +/// A Layer contains all data in a "rectangle" consisting of a range of keys and +/// range of LSNs. +/// +/// There are two kinds of layers, in-memory and on-disk layers. In-memory +/// layers are used to ingest incoming WAL, and provide fast access to the +/// recent page versions. On-disk layers are stored as files on disk, and are +/// immutable. This trait presents the common functionality of in-memory and +/// on-disk layers. +/// +/// Furthermore, there are two kinds of on-disk layers: delta and image layers. +/// A delta layer contains all modifications within a range of LSNs and keys. +/// An image layer is a snapshot of all the data in a key-range, at a single +/// LSN +/// +pub trait Layer: Send + Sync { + fn get_tenant_id(&self) -> TenantId; + + /// Identify the timeline this layer belongs to + fn get_timeline_id(&self) -> TimelineId; + + /// Range of keys that this layer covers + fn get_key_range(&self) -> Range; + + /// Inclusive start bound of the LSN range that this layer holds + /// Exclusive end bound of the LSN range that this layer holds. + /// + /// - For an open in-memory layer, this is MAX_LSN. + /// - For a frozen in-memory layer or a delta layer, this is a valid end bound. + /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 + fn get_lsn_range(&self) -> Range; + + /// Filename used to store this layer on disk. (Even in-memory layers + /// implement this, to print a handy unique identifier for the layer for + /// log messages, even though they're never not on disk.) + fn filename(&self) -> PathBuf; + + /// If a layer has a corresponding file on a local filesystem, return its absolute path. + fn local_path(&self) -> Option; + + /// + /// Return data needed to reconstruct given page at LSN. + /// + /// It is up to the caller to collect more data from previous layer and + /// perform WAL redo, if necessary. + /// + /// See PageReconstructResult for possible return values. The collected data + /// is appended to reconstruct_data; the caller should pass an empty struct + /// on first call, or a struct with a cached older image of the page if one + /// is available. If this returns PageReconstructResult::Continue, look up + /// the predecessor layer and call again with the same 'reconstruct_data' to + /// collect more data. + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_data: &mut ValueReconstructState, + ) -> Result; + + /// Does this layer only contain some data for the key-range (incremental), + /// or does it contain a version of every page? This is important to know + /// for garbage collecting old layers: an incremental layer depends on + /// the previous non-incremental layer. + fn is_incremental(&self) -> bool; + + /// Returns true for layers that are represented in memory. 
+ fn is_in_memory(&self) -> bool; + + /// Iterate through all keys and values stored in the layer + fn iter(&self) -> Box> + '_>; + + /// Iterate through all keys stored in the layer. Returns key, lsn and value size + /// It is used only for compaction and so is currently implemented only for DeltaLayer + fn key_iter(&self) -> Box + '_> { + panic!("Not implemented") + } + + /// Permanently remove this layer from disk. + fn delete(&self) -> Result<()>; + + /// Dump summary of the contents of the layer to stdout + fn dump(&self, verbose: bool) -> Result<()>; +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs new file mode 100644 index 0000000000..0b2f7876db --- /dev/null +++ b/pageserver/src/tenant/timeline.rs @@ -0,0 +1,2456 @@ +//! + +use anyhow::{anyhow, bail, ensure, Context}; +use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; +use once_cell::sync::OnceCell; +use pageserver_api::models::TimelineState; +use tokio::sync::watch; +use tokio::task::spawn_blocking; +use tracing::*; + +use std::cmp::{max, min, Ordering}; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::ops::{Deref, Range}; +use std::path::PathBuf; +use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock}; +use std::time::{Duration, Instant, SystemTime}; + +use crate::tenant::{ + delta_layer::{DeltaLayer, DeltaLayerWriter}, + ephemeral_file::is_ephemeral_file, + filename::{DeltaFileName, ImageFileName}, + image_layer::{ImageLayer, ImageLayerWriter}, + inmemory_layer::InMemoryLayer, + layer_map::{LayerMap, SearchResult}, + metadata::{save_metadata, TimelineMetadata}, + par_fsync, + storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, +}; + +use crate::config::{PageServerConf, METADATA_FILE_NAME}; +use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::metrics::TimelineMetrics; +use crate::pgdatadir_mapping::BlockNumber; +use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::tenant_config::TenantConfOpt; +use pageserver_api::reltag::RelTag; + +use postgres_ffi::to_pg_timestamp; +use utils::{ + id::{TenantId, TimelineId}, + lsn::{AtomicLsn, Lsn, RecordLsn}, + seqwait::SeqWait, + simple_rcu::{Rcu, RcuReadGuard}, +}; + +use crate::repository::GcResult; +use crate::repository::{Key, Value}; +use crate::task_mgr; +use crate::task_mgr::TaskKind; +use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; +use crate::walredo::WalRedoManager; +use crate::CheckpointConfig; +use crate::ZERO_PAGE; +use crate::{ + page_cache, + storage_sync::{self, index::LayerFileMetadata}, +}; + +pub struct Timeline { + conf: &'static PageServerConf, + tenant_conf: Arc>, + + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + + pub pg_version: u32, + + pub layers: RwLock, + + last_freeze_at: AtomicLsn, + // Atomic would be more appropriate here. + last_freeze_ts: RwLock, + + // WAL redo manager + walredo_mgr: Arc, + + // What page versions do we hold in the repository? If we get a + // request > last_record_lsn, we need to wait until we receive all + // the WAL up to the request. The SeqWait provides functions for + // that. TODO: If we get a request for an old LSN, such that the + // versions have already been garbage collected away, we should + // throw an error, but we don't track that currently. 
+ // + // last_record_lsn.load().last points to the end of last processed WAL record. + // + // We also remember the starting point of the previous record in + // 'last_record_lsn.load().prev'. It's used to set the xl_prev pointer of the + // first WAL record when the node is started up. But here, we just + // keep track of it. + last_record_lsn: SeqWait, + + // All WAL records have been processed and stored durably on files on + // local disk, up to this LSN. On crash and restart, we need to re-process + // the WAL starting from this point. + // + // Some later WAL records might have been processed and also flushed to disk + // already, so don't be surprised to see some, but there's no guarantee on + // them yet. + disk_consistent_lsn: AtomicLsn, + + // Parent timeline that this timeline was branched from, and the LSN + // of the branch point. + ancestor_timeline: Option>, + ancestor_lsn: Lsn, + + // Metrics + metrics: TimelineMetrics, + + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. + upload_layers: AtomicBool, + + /// Ensures layers aren't frozen by checkpointer between + /// [`Timeline::get_layer_for_write`] and layer reads. + /// Locked automatically by [`TimelineWriter`] and checkpointer. + /// Must always be acquired before the layer map/individual layer lock + /// to avoid deadlock. + write_lock: Mutex<()>, + + /// Used to avoid multiple `flush_loop` tasks running + flush_loop_started: Mutex, + + /// layer_flush_start_tx can be used to wake up the layer-flushing task. + /// The value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + layer_flush_start_tx: tokio::sync::watch::Sender, + /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel + layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>, + + /// Layer removal lock. + /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. + /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], + /// and [`Tenant::delete_timeline`]. + layer_removal_cs: Mutex<()>, + + // Needed to ensure that we can't create a branch at a point that was already garbage collected + pub latest_gc_cutoff_lsn: Rcu, + + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + pub gc_info: RwLock, + + // It may change across major versions so for simplicity + // keep it after running initdb for a timeline. + // It is needed in checks when we want to error on some operations + // when they are requested for pre-initdb lsn. + // It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn", + // though let's keep them both for better error visibility. + pub initdb_lsn: Lsn, + + /// When did we last calculate the partitioning? + partitioning: Mutex<(KeyPartitioning, Lsn)>, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: LogicalSize, + initial_size_computation_started: AtomicBool, + + /// Information about the last processed message by the WAL receiver, + /// or None if WAL receiver has not received anything for this timeline + /// yet. 
+ pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
+
+ /// Relation size cache
+ pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+
+ state: watch::Sender<TimelineState>,
+}
+
+/// Internal structure to hold all data needed for logical size calculation.
+/// Calculation consists of two parts:
+/// 1. Initial size calculation. That might take a long time, because it requires
+/// reading all layers containing relation sizes up to the `initial_part_end`.
+/// 2. Collecting an incremental part and adding that to the initial size.
+/// Increments are appended as the walreceiver writes new timeline data,
+/// which results in an increase or decrease of the logical size.
+struct LogicalSize {
+ /// Size, potentially slow to compute, derived from all layers located locally on this node's FS.
+ /// Might require reading multiple layers, and even ancestor's layers, to collect the size.
+ ///
+ /// NOTE: initial size is not a constant and will change between restarts.
+ initial_logical_size: OnceCell<u64>,
+ /// Latest LSN whose size has not been calculated yet; can be absent for freshly created timelines.
+ initial_part_end: Option<Lsn>,
+ /// All other size changes after startup, combined together.
+ ///
+ /// Size shouldn't ever be negative, but this is signed for two reasons:
+ ///
+ /// 1. If we initialized the "baseline" size lazily, while we already
+ /// process incoming WAL, the incoming WAL records could decrement the
+ /// variable and temporarily make it negative. (This is just future-proofing;
+ /// the initialization is currently not done lazily.)
+ ///
+ /// 2. If there is a bug and we e.g. forget to increment it in some cases
+ /// when size grows, but remember to decrement it when it shrinks again, the
+ /// variable could go negative. In that case, it seems better to at least
+ /// try to keep tracking it, rather than clamp or overflow it. Note that
+ /// get_current_logical_size() will clamp the returned value to zero if it's
+ /// negative, and log an error. Could set it permanently to zero or some
+ /// special value to indicate "broken" instead, but this will do for now.
+ ///
+ /// Note that we also expose a copy of this value as a prometheus metric,
+ /// see `current_logical_size_gauge`. Use `update_current_logical_size`
+ /// to modify this; it will also keep the prometheus metric in sync.
+ size_added_after_initial: AtomicI64,
+}
+
+/// Normalized current size that the data in the pageserver occupies.
+#[derive(Debug, Clone, Copy)]
+enum CurrentLogicalSize {
+ /// The size is not yet fully calculated: an intermediate result constructed from
+ /// walreceiver increments and normalized, since logical data could delete some objects
+ /// (making the increment negative) while the total logical size cannot be below 0.
+ Approximate(u64),
+ /// Fully calculated logical size; only future walreceiver increments change it, and those
+ /// changes are available for observation without any calculation.
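+ /// (For instance, an initial size of 25_763_840 bytes plus a running increment of
+ /// +65_536 is reported as `Exact(25_829_376)`; while the initial part is still being
+ /// computed, only the increment is visible, as `Approximate(65_536)`.)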
+ Exact(u64), +} + +impl CurrentLogicalSize { + fn size(&self) -> u64 { + *match self { + Self::Approximate(size) => size, + Self::Exact(size) => size, + } + } +} + +impl LogicalSize { + fn empty_initial() -> Self { + Self { + initial_logical_size: OnceCell::with_value(0), + initial_part_end: None, + size_added_after_initial: AtomicI64::new(0), + } + } + + fn deferred_initial(compute_to: Lsn) -> Self { + Self { + initial_logical_size: OnceCell::new(), + initial_part_end: Some(compute_to), + size_added_after_initial: AtomicI64::new(0), + } + } + + fn current_size(&self) -> anyhow::Result { + let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire); + // ^^^ keep this type explicit so that the casts in this function break if + // we change the type. + match self.initial_logical_size.get() { + Some(initial_size) => { + let absolute_size_increment = u64::try_from( + size_increment + .checked_abs() + .with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?, + ).expect("casting nonnegative i64 to u64 should not fail"); + + if size_increment < 0 { + initial_size.checked_sub(absolute_size_increment) + } else { + initial_size.checked_add(absolute_size_increment) + }.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")) + .map(CurrentLogicalSize::Exact) + } + None => { + let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0); + Ok(CurrentLogicalSize::Approximate(non_negative_size_increment)) + } + } + } + + fn increment_size(&self, delta: i64) { + self.size_added_after_initial + .fetch_add(delta, AtomicOrdering::SeqCst); + } + + /// Returns the initialized (already calculated) value, if any. + fn initialized_size(&self) -> Option { + self.initial_logical_size.get().copied() + } +} + +pub struct WalReceiverInfo { + pub wal_source_connstr: String, + pub last_received_msg_lsn: Lsn, + pub last_received_msg_ts: u128, +} + +/// +/// Information about how much history needs to be retained, needed by +/// Garbage Collection. +/// +pub struct GcInfo { + /// Specific LSNs that are needed. + /// + /// Currently, this includes all points where child branches have + /// been forked off from. In the future, could also include + /// explicit user-defined snapshot points. + pub retain_lsns: Vec, + + /// In addition to 'retain_lsns', keep everything newer than this + /// point. + /// + /// This is calculated by subtracting 'gc_horizon' setting from + /// last-record LSN + /// + /// FIXME: is this inclusive or exclusive? + pub horizon_cutoff: Lsn, + + /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this + /// point. + /// + /// This is calculated by finding a number such that a record is needed for PITR + /// if only if its LSN is larger than 'pitr_cutoff'. + pub pitr_cutoff: Lsn, +} + +/// Public interface functions +impl Timeline { + /// Get the LSN where this branch was created + pub fn get_ancestor_lsn(&self) -> Lsn { + self.ancestor_lsn + } + + /// Get the ancestor's timeline id + pub fn get_ancestor_timeline_id(&self) -> Option { + self.ancestor_timeline + .as_ref() + .map(|ancestor| ancestor.timeline_id) + } + + /// Lock and get timeline's GC cuttof + pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + self.latest_gc_cutoff_lsn.read() + } + + /// Look up given page version. + /// + /// NOTE: It is considered an error to 'get' a key that doesn't exist. 
The abstraction + /// above this needs to store suitable metadata to track what data exists with + /// what keys, in separate metadata entries. If a non-existent key is requested, + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. + /// + pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { + anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. + let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => { + unreachable!("the returned lsn should never be after the requested lsn") + } + } + Some((cached_lsn, cached_img)) + } + None => None, + }; + + let mut reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; + + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + + self.metrics + .reconstruct_time_histo + .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) + } + + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. + pub fn get_last_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().last + } + + pub fn get_prev_record_lsn(&self) -> Lsn { + self.last_record_lsn.load().prev + } + + /// Atomically get both last and prev. + pub fn get_last_record_rlsn(&self) -> RecordLsn { + self.last_record_lsn.load() + } + + pub fn get_disk_consistent_lsn(&self) -> Lsn { + self.disk_consistent_lsn.load() + } + + /// Get the physical size of the timeline at the latest LSN + pub fn get_physical_size(&self) -> u64 { + self.metrics.current_physical_size_gauge.get() + } + + /// Get the physical size of the timeline at the latest LSN non incrementally + pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; + + for direntry in fs::read_dir(timeline_path)? { + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if ImageFileName::parse_str(&fname).is_some() + || DeltaFileName::parse_str(&fname).is_some() + { + total_physical_size += direntry.metadata()?.len(); + } + } + + Ok(total_physical_size) + } + + /// + /// Wait until WAL has been received and processed up to this LSN. + /// + /// You should call this before any of the other get_* or list_* functions. Calling + /// those functions with an LSN that has been processed yet is an error. + /// + pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + + // This should never be called from the WAL receiver, because that could lead + // to a deadlock. 
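+ // (The WAL receiver is what advances `last_record_lsn` in the first place, so
+ // blocking it here on the SeqWait could never make progress.)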
+ anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + "wait_lsn cannot be called in WAL receiver" + ); + + let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + + self.last_record_lsn.wait_for_timeout(lsn, self.conf.wait_lsn_timeout).await + .with_context(|| + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}", + lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn() + ) + )?; + + Ok(()) + } + + /// Check that it is valid to request operations with that lsn. + pub fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RcuReadGuard, + ) -> anyhow::Result<()> { + ensure!( + lsn >= **latest_gc_cutoff_lsn, + "LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)", + lsn, + **latest_gc_cutoff_lsn, + ); + Ok(()) + } + + /// Flush to disk all data that was written with the put_* functions + /// + /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't + /// know anything about them here in the repository. + #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] + pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { + match cconf { + CheckpointConfig::Flush => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers_and_wait().await + } + CheckpointConfig::Forced => { + self.freeze_inmem_layer(false); + self.flush_frozen_layers_and_wait().await?; + self.compact() + } + } + } + + pub fn compact(&self) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + + // Last record Lsn could be zero in case the timelie was just created + if !last_record_lsn.is_valid() { + warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}"); + return Ok(()); + } + + // + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. 
+ // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + match self.repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) { + Ok((partitioning, lsn)) => { + // 2. Create new image layers for partitions that have been modified + // "enough". + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + None, + ); + } + + // 3. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size)?; + timer.stop_and_record(); + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. + error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + }; + + Ok(()) + } + + /// Mutate the timeline with a [`TimelineWriter`]. + pub fn writer(&self) -> TimelineWriter<'_> { + TimelineWriter { + tl: self, + _write_guard: self.write_lock.lock().unwrap(), + } + } + + /// Retrieve current logical size of the timeline. + /// + /// The size could be lagging behind the actual number, in case + /// the initial size calculation has not been run (gets triggered on the first size access). + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + let current_size = self.current_logical_size.current_size()?; + debug!("Current size: {current_size:?}"); + + let size = current_size.size(); + if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = + (current_size, self.current_logical_size.initial_part_end) + { + self.try_spawn_size_init_task(init_lsn); + } + + Ok(size) + } + + /// Check if more than 'checkpoint_distance' of WAL has been accumulated in + /// the in-memory layer, and initiate flushing it if so. + /// + /// Also flush after a period of time without new data -- it helps + /// safekeepers to regard pageserver as caught up and suspend activity. + pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + let last_lsn = self.get_last_record_lsn(); + let layers = self.layers.read().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_size = open_layer.size()?; + drop(layers); + let last_freeze_at = self.last_freeze_at.load(); + let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); + let distance = last_lsn.widening_sub(last_freeze_at); + // Checkpointing the open layer can be triggered by layer size or LSN range. + // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and + // we want to stay below that with a big margin. The LSN distance determines how + // much WAL the safekeepers need to store. 
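+ // In short, a flush is forced when any of these hold: the LSN distance since the
+ // last freeze reaches `checkpoint_distance`, the open layer itself has grown past
+ // `checkpoint_distance` bytes, or some WAL has arrived and `checkpoint_timeout`
+ // has elapsed since the last freeze.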
+ if distance >= self.get_checkpoint_distance().into() + || open_layer_size > self.get_checkpoint_distance() + || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) + { + info!( + "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", + distance, + open_layer_size, + last_freeze_ts.elapsed() + ); + + self.freeze_inmem_layer(true); + self.last_freeze_at.store(last_lsn); + *(self.last_freeze_ts.write().unwrap()) = Instant::now(); + + // Wake up the layer flusher + self.flush_frozen_layers(); + } + } + Ok(()) + } + + pub fn set_state(&self, new_state: TimelineState) { + match (self.current_state(), new_state) { + (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { + debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (TimelineState::Broken, _) => { + error!("Ignoring state update {new_state:?} for broken tenant"); + } + (TimelineState::Paused, TimelineState::Active) => { + debug!("Not activating a paused timeline"); + } + (_, new_state) => { + self.state.send_replace(new_state); + } + } + } + + pub fn current_state(&self) -> TimelineState { + *self.state.borrow() + } + + pub fn is_active(&self) -> bool { + self.current_state() == TimelineState::Active + } + + pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + self.state.subscribe() + } +} + +// Private functions +impl Timeline { + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_distance + .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) + } + + fn get_checkpoint_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .checkpoint_timeout + .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) + } + + fn get_compaction_target_size(&self) -> u64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_target_size + .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) + } + + fn get_compaction_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .compaction_threshold + .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) + } + + fn get_image_creation_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .image_creation_threshold + .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) + } + + /// Open a Timeline handle. + /// + /// Loads the metadata for the timeline into memory, but not the layer map. + #[allow(clippy::too_many_arguments)] + pub(super) fn new( + conf: &'static PageServerConf, + tenant_conf: Arc>, + metadata: TimelineMetadata, + ancestor: Option>, + timeline_id: TimelineId, + tenant_id: TenantId, + walredo_mgr: Arc, + upload_layers: bool, + pg_version: u32, + ) -> Self { + let disk_consistent_lsn = metadata.disk_consistent_lsn(); + let (state, _) = watch::channel(TimelineState::Suspended); + + let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); + + let mut result = Timeline { + conf, + tenant_conf, + timeline_id, + tenant_id, + pg_version, + layers: RwLock::new(LayerMap::default()), + + walredo_mgr, + + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. 
+ last_record_lsn: SeqWait::new(RecordLsn { + last: disk_consistent_lsn, + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), + last_freeze_ts: RwLock::new(Instant::now()), + + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), + + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + + upload_layers: AtomicBool::new(upload_layers), + + flush_loop_started: Mutex::new(false), + + layer_flush_start_tx, + layer_flush_done_tx, + + write_lock: Mutex::new(()), + layer_removal_cs: Mutex::new(()), + + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), + + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), + + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. + LogicalSize::empty_initial() + }, + initial_size_computation_started: AtomicBool::new(false), + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, + + last_received_wal: Mutex::new(None), + rel_size_cache: RwLock::new(HashMap::new()), + state, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result + } + + pub(super) fn maybe_spawn_flush_loop(self: &Arc) { + let mut flush_loop_started = self.flush_loop_started.lock().unwrap(); + if *flush_loop_started { + info!( + "skipping attempt to start flush_loop twice {}/{}", + self.tenant_id, self.timeline_id + ); + return; + } + + let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); + let self_clone = Arc::clone(self); + info!("spawning flush loop"); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + *flush_loop_started = true; + } + + pub(super) fn launch_wal_receiver(self: &Arc) { + if !is_etcd_client_initialized() { + if cfg!(test) { + info!("not launching WAL receiver because etcd client hasn't been initialized"); + return; + } else { + panic!("etcd client not initialized"); + } + } + + info!( + "launching WAL receiver for timeline {} of tenant {}", + self.timeline_id, self.tenant_id + ); + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let walreceiver_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + let self_clone = Arc::clone(self); + spawn_connection_manager_task( + self.conf.broker_etcd_prefix.clone(), + self_clone, + walreceiver_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ); + } + + /// + /// Scan the timeline 
directory to populate the layer map. + /// Returns all timeline-related files that were found and loaded. + /// + pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + let mut layers = self.layers.write().unwrap(); + let mut num_layers = 0; + + let timer = self.metrics.load_layer_map_histo.start_timer(); + + // Scan timeline directory and create ImageFileName and DeltaFilename + // structs representing all files on disk + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + // total size of layer files in the current timeline directory + let mut total_physical_size = 0; + + for direntry in fs::read_dir(timeline_path)? { + let direntry = direntry?; + let fname = direntry.file_name(); + let fname = fname.to_string_lossy(); + + if let Some(imgfilename) = ImageFileName::parse_str(&fname) { + // create an ImageLayer struct for each image file. + if imgfilename.lsn > disk_consistent_lsn { + warn!( + "found future image layer {} on timeline {} disk_consistent_lsn is {}", + imgfilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + + trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { + // Create a DeltaLayer struct for each delta file. + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { + warn!( + "found future delta layer {} on timeline {} disk_consistent_lsn is {}", + deltafilename, self.timeline_id, disk_consistent_lsn + ); + + rename_to_backup(direntry.path())?; + continue; + } + + let layer = + DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + + trace!("found layer {}", layer.filename().display()); + total_physical_size += layer.path().metadata()?.len(); + layers.insert_historic(Arc::new(layer)); + num_layers += 1; + } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { + // ignore these + } else if is_ephemeral_file(&fname) { + // Delete any old ephemeral files + trace!("deleting old ephemeral file in timeline dir: {}", fname); + fs::remove_file(direntry.path())?; + } else { + warn!("unrecognized filename in timeline dir: {}", fname); + } + } + + layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); + + info!( + "loaded layer map with {} layers at {}, total physical size: {}", + num_layers, disk_consistent_lsn, total_physical_size + ); + self.metrics + .current_physical_size_gauge + .set(total_physical_size); + + timer.stop_and_record(); + + Ok(()) + } + + pub(super) fn layer_removal_guard(&self) -> anyhow::Result> { + self.layer_removal_cs + .try_lock() + .map_err(|e| anyhow!("cannot lock compaction critical section {e}")) + } + + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + // Atomically check if the timeline size calculation had already started. + // If the flag was not already set, this sets it. + if !self + .initial_size_computation_started + .swap(true, AtomicOrdering::SeqCst) + { + // We need to start the computation task. 
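+ // The spawned task races the blocking size calculation against timeline state
+ // changes: if the timeline stops being Active (or the state sender goes away)
+ // before the calculation completes, the task exits without filling
+ // `initial_logical_size`, and callers keep seeing the Approximate value.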
+ let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + async move { + let mut timeline_state_updates = self_clone.subscribe_for_state_updates(); + let self_calculation = Arc::clone(&self_clone); + tokio::select! { + calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { + let calculated_size = calculation_result + .context("Failed to spawn calculation result task")? + .context("Failed to calculate logical size")?; + match self_clone.current_logical_size.initial_logical_size.set(calculated_size) { + Ok(()) => info!("Successfully calculated initial logical size"), + Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), + } + Ok(()) + }, + new_event = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return Some(new_state), + } + } + Err(_sender_dropped_error) => return None, + } + } + } => { + match new_event { + Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"), + None => info!("Timeline dropped state updates sender, stopping init size calculation"), + } + Ok(()) + }, + } + }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)), + ); + } + } + + /// Calculate the logical size of the database at the latest LSN. + /// + /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. + pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + info!( + "Calculating logical size for timeline {} at {}", + self.timeline_id, up_to_lsn + ); + let timer = if up_to_lsn == self.initdb_lsn { + if let Some(size) = self.current_logical_size.initialized_size() { + if size != 0 { + // non-zero size means that the size has already been calculated by this method + // after startup. if the logical size is for a new timeline without layers the + // size will be zero, and we cannot use that, or this caching strategy until + // pageserver restart. + return Ok(size); + } + } + + self.metrics.init_logical_size_histo.start_timer() + } else { + self.metrics.logical_size_histo.start_timer() + }; + let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + debug!("calculated logical size: {logical_size}"); + timer.stop_and_record(); + Ok(logical_size) + } + + /// Update current logical size, adding `delta' to the old value. + fn update_current_logical_size(&self, delta: i64) { + let logical_size = &self.current_logical_size; + logical_size.increment_size(delta); + + // Also set the value in the prometheus gauge. Note that + // there is a race condition here: if this is is called by two + // threads concurrently, the prometheus gauge might be set to + // one value while current_logical_size is set to the + // other. 
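+        // The inconsistency is transient and does not accumulate: every store below
+        // uses a value read from `current_logical_size` after this call's increment,
+        // and the next update will overwrite the gauge from a fresh read again.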
+ match logical_size.current_size() { + Ok(new_current_size) => self + .metrics + .current_logical_size_gauge + .set(new_current_size.size()), + Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), + } + } + + /// + /// Get a handle to a Layer for reading. + /// + /// The returned Layer might be from an ancestor timeline, if the + /// segment hasn't been updated on this timeline yet. + /// + /// This function takes the current timeline's locked LayerMap as an argument, + /// so callers can avoid potential race conditions. + fn get_reconstruct_data( + &self, + key: Key, + request_lsn: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result<()> { + // Start from the current timeline. + let mut timeline_owned; + let mut timeline = self; + + // For debugging purposes, collect the path of layers that we traversed + // through. It's included in the error message if we fail to find the key. + let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + + let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { + *cached_lsn + } else { + Lsn(0) + }; + + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used + // to check that each iteration make some progress, to break infinite + // looping if something goes wrong. + let mut prev_lsn = Lsn(u64::MAX); + + let mut result = ValueReconstructResult::Continue; + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + 'outer: loop { + // The function should have updated 'state' + //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); + match result { + ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Continue => { + // If we reached an earlier cached page image, we're done. + if cont_lsn == cached_lsn + 1 { + self.metrics.materialized_page_cache_hit_counter.inc_by(1); + return Ok(()); + } + if prev_lsn <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. + return layer_traversal_error(format!( + "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", + key, + Lsn(cont_lsn.0 - 1), + request_lsn, + timeline.ancestor_lsn + ), traversal_path); + } + prev_lsn = cont_lsn; + } + ValueReconstructResult::Missing => { + return layer_traversal_error( + format!( + "could not find data for key {} at LSN {}, for request at LSN {}", + key, cont_lsn, request_lsn + ), + traversal_path, + ); + } + } + + // Recurse into ancestor if needed + if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); + let ancestor = timeline.get_ancestor_timeline()?; + timeline_owned = ancestor; + timeline = &*timeline_owned; + prev_lsn = Lsn(u64::MAX); + continue; + } + + let layers = timeline.layers.read().unwrap(); + + // Check the open and frozen in-memory layers first, in order from newest + // to oldest. + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + // Get all the data needed to reconstruct the page version from this layer. + // But if we have an older cached page image, no need to go past that. 
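+                // `lsn_floor` below is the oldest LSN we still need from this layer:
+                // nothing older than the layer's own start LSN, and nothing at or
+                // below the LSN of the cached page image we already hold.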
+ let lsn_floor = max(cached_lsn + 1, start_lsn); + result = open_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, open_layer.clone())); + continue; + } + } + for frozen_layer in layers.frozen_layers.iter().rev() { + let start_lsn = frozen_layer.get_lsn_range().start; + if cont_lsn > start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + let lsn_floor = max(cached_lsn + 1, start_lsn); + result = frozen_layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, frozen_layer.clone())); + continue 'outer; + } + } + + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + + let lsn_floor = max(cached_lsn + 1, lsn_floor); + result = layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; + cont_lsn = lsn_floor; + traversal_path.push((result, cont_lsn, layer)); + } else if timeline.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1); + } else { + // Nothing found + result = ValueReconstructResult::Missing; + } + } + } + + fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + let cache = page_cache::get(); + + // FIXME: It's pointless to check the cache for things that are not 8kB pages. + // We should look at the key to determine if it's a cacheable object + let (lsn, read_guard) = + cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?; + let img = Bytes::from(read_guard.to_vec()); + Some((lsn, img)) + } + + fn get_ancestor_timeline(&self) -> anyhow::Result> { + let ancestor = self.ancestor_timeline.as_ref().with_context(|| { + format!( + "Ancestor is missing. Timeline id: {} Ancestor id {:?}", + self.timeline_id, + self.get_ancestor_timeline_id(), + ) + })?; + Ok(Arc::clone(ancestor)) + } + + /// + /// Get a handle to the latest layer for appending. + /// + fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + let mut layers = self.layers.write().unwrap(); + + ensure!(lsn.is_aligned()); + + let last_record_lsn = self.get_last_record_lsn(); + ensure!( + lsn > last_record_lsn, + "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})", + lsn, + last_record_lsn, + ); + + // Do we have a layer open for writing already? + let layer; + if let Some(open_layer) = &layers.open_layer { + if open_layer.get_lsn_range().start > lsn { + bail!("unexpected open layer in the future"); + } + + layer = Arc::clone(open_layer); + } else { + // No writeable layer yet. Create one. 
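+            // `next_open_layer_at` was primed either by `load_layer_map` (to
+            // disk_consistent_lsn + 1) or by `freeze_inmem_layer` (to the end LSN of
+            // the layer that was just frozen), so the new open layer starts exactly
+            // where the previous one ended and the LSN coverage has no gaps.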
+ let start_lsn = layers + .next_open_layer_at + .context("No next open layer found")?; + + trace!( + "creating layer for write at {}/{} for record at {}", + self.timeline_id, + start_lsn, + lsn + ); + let new_layer = + InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?; + let layer_rc = Arc::new(new_layer); + + layers.open_layer = Some(Arc::clone(&layer_rc)); + layers.next_open_layer_at = None; + + layer = layer_rc; + } + Ok(layer) + } + + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn)?; + layer.put_value(key, lsn, val)?; + Ok(()) + } + + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { + let layer = self.get_layer_for_write(lsn)?; + layer.put_tombstone(key_range, lsn)?; + + Ok(()) + } + + fn finish_write(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.metrics.last_record_gauge.set(new_lsn.0 as i64); + self.last_record_lsn.advance(new_lsn); + } + + fn freeze_inmem_layer(&self, write_lock_held: bool) { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let _write_guard = if write_lock_held { + None + } else { + Some(self.write_lock.lock().unwrap()) + }; + let mut layers = self.layers.write().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. + // We will replace it with on-disk historics below. + layers.frozen_layers.push_back(open_layer_rc); + layers.open_layer = None; + layers.next_open_layer_at = Some(end_lsn); + self.last_freeze_at.store(end_lsn); + } + drop(layers); + } + + /// Layer flusher task's main loop. + async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + info!("started flush loop"); + loop { + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("shutting down layer flush task"); + break; + }, + _ = layer_flush_start_rx.changed() => {} + } + + trace!("waking up"); + let timer = self.metrics.flush_time_histo.start_timer(); + let flush_counter = *layer_flush_start_rx.borrow(); + let result = loop { + let layer_to_flush = { + let layers = self.layers.read().unwrap(); + layers.frozen_layers.front().cloned() + // drop 'layers' lock to allow concurrent reads and writes + }; + if let Some(layer_to_flush) = layer_to_flush { + if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + error!("could not flush frozen layer: {err:?}"); + break Err(err); + } + continue; + } else { + break Ok(()); + } + }; + // Notify any listeners that we're done + let _ = self + .layer_flush_done_tx + .send_replace((flush_counter, result)); + + timer.stop_and_record(); + } + } + + async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + let mut rx = self.layer_flush_done_tx.subscribe(); + + // Increment the flush cycle counter and wake up the flush task. + // Remember the new value, so that when we listen for the flush + // to finish, we know when the flush that we initiated has + // finished, instead of some other flush that was started earlier. 
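+        // The two watch channels form a simple request/ack protocol:
+        // `layer_flush_start_tx` carries a monotonically increasing request counter,
+        // and the flush loop publishes the counter it last serviced, together with
+        // the result, on `layer_flush_done_tx`. Waiting until the published counter
+        // reaches `my_flush_request` is therefore enough to know our flush finished.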
+ let mut my_flush_request = 0; + + if !&*self.flush_loop_started.lock().unwrap() { + anyhow::bail!("cannot flush frozen layers when flush_loop is not running") + } + + self.layer_flush_start_tx.send_modify(|counter| { + my_flush_request = *counter + 1; + *counter = my_flush_request; + }); + + loop { + { + let (last_result_counter, last_result) = &*rx.borrow(); + if *last_result_counter >= my_flush_request { + if let Err(_err) = last_result { + // We already logged the original error in + // flush_loop. We cannot propagate it to the caller + // here, because it might not be Cloneable + anyhow::bail!( + "Could not flush frozen layer. Request id: {}", + my_flush_request + ); + } else { + return Ok(()); + } + } + } + trace!("waiting for flush to complete"); + rx.changed().await?; + trace!("done") + } + } + + fn flush_frozen_layers(&self) { + self.layer_flush_start_tx.send_modify(|val| *val += 1); + } + + /// Flush one frozen in-memory layer to disk, as a new delta layer. + #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))] + async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + let layer_paths_to_upload = + if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + let (partitioning, _lsn) = + self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. + let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + HashMap::from([(delta_path, metadata)]) + }; + + fail_point!("flush-frozen-before-sync"); + + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. + { + let mut layers = self.layers.write().unwrap(); + let l = layers.frozen_layers.pop_front(); + + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + + // release lock on 'layers' + } + + fail_point!("checkpoint-after-sync"); + + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + + // If we were able to advance 'disk_consistent_lsn', save it the metadata file. + // After crash, we will restart WAL streaming and processing from that point. 
+ if disk_consistent_lsn != old_disk_consistent_lsn { + assert!(disk_consistent_lsn > old_disk_consistent_lsn); + self.update_metadata_file(disk_consistent_lsn, layer_paths_to_upload)?; + // Also update the in-memory copy + self.disk_consistent_lsn.store(disk_consistent_lsn); + } + Ok(()) + } + + /// Update metadata file + fn update_metadata_file( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashMap, + ) -> anyhow::Result<()> { + // We can only save a valid 'prev_record_lsn' value on disk if we + // flushed *all* in-memory changes to disk. We only track + // 'prev_record_lsn' in memory for the latest processed record, so we + // don't remember what the correct value that corresponds to some old + // LSN is. But if we flush everything, then the value corresponding + // current 'last_record_lsn' is correct and we can store it on disk. + let RecordLsn { + last: last_record_lsn, + prev: prev_record_lsn, + } = self.last_record_lsn.load(); + let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn { + Some(prev_record_lsn) + } else { + None + }; + + let ancestor_timeline_id = self + .ancestor_timeline + .as_ref() + .map(|ancestor| ancestor.timeline_id); + + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + ondisk_prev_record_lsn, + ancestor_timeline_id, + self.ancestor_lsn, + *self.latest_gc_cutoff_lsn.read(), + self.initdb_lsn, + self.pg_version, + ); + + fail_point!("checkpoint-before-saving-metadata", |x| bail!( + "{}", + x.unwrap() + )); + + save_metadata( + self.conf, + self.timeline_id, + self.tenant_id, + &metadata, + false, + )?; + + if self.can_upload_layers() { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + layer_paths_to_upload, + Some(metadata), + ); + } + + Ok(()) + } + + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer( + &self, + frozen_layer: &InMemoryLayer, + ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. 
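+        // Fsyncing the new file alone would not be enough: after a crash the file is
+        // only guaranteed to be reachable once its directory entry is durable, which
+        // is why the timeline directory is included in the fsync below.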
+ par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + // update the timeline's physical size + let sz = new_delta_path.metadata()?.len(); + + self.metrics.current_physical_size_gauge.add(sz); + // update metrics + self.metrics.num_persistent_files_created.inc_by(1); + self.metrics.persistent_bytes_written.inc_by(sz); + + Ok((new_delta_path, LayerFileMetadata::new(sz))) + } + + fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 == Lsn(0) + || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + { + let keyspace = self.collect_keyspace(lsn)?; + let partitioning = keyspace.partition(partition_size); + *partitioning_guard = (partitioning, lsn); + return Ok((partitioning_guard.0.clone(), lsn)); + } + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + } + + // Is it time to create a new image layer for the given partition? + fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + let layers = self.layers.read().unwrap(); + + for part_range in &partition.ranges { + let image_coverage = layers.image_coverage(part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. + if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } + } + } + } + + Ok(false) + } + + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> anyhow::Result> { + let timer = self.metrics.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; + + fail_point!("image-layer-writer-fail-before-finish", |_| { + anyhow::bail!("failpoint image-layer-writer-fail-before-finish"); + }); + + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = match self.get(key, lsn) { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. 
That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, keys, as that would lead to actual data + // loss. + if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + image_layers.push(image_layer); + } + } + + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. It would be better to create them all + // and fsync them all in parallel. + let all_paths = image_layers + .iter() + .map(|layer| layer.path()) + .chain(std::iter::once( + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + )) + .collect::>(); + par_fsync::par_fsync(&all_paths)?; + + let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); + + let mut layers = self.layers.write().unwrap(); + for l in image_layers { + let path = l.path(); + let metadata = path.metadata()?; + + layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); + + self.metrics.current_physical_size_gauge.add(metadata.len()); + layers.insert_historic(Arc::new(l)); + } + drop(layers); + timer.stop_and_record(); + + Ok(layer_paths_to_upload) + } + + /// + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + /// + fn compact_level0(&self, target_file_size: u64) -> anyhow::Result<()> { + let layers = self.layers.read().unwrap(); + let mut level0_deltas = layers.get_level0_deltas()?; + drop(layers); + + // Only compact if enough layers have accumulated. + if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { + return Ok(()); + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. 
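+        // For example, once enough L0 layers have accumulated, deltas covering
+        // 100-200, 200-300 and 400-500 would be compacted as the contiguous run
+        // 100-300, while 400-500 is left for a later iteration because its start
+        // LSN does not match the end LSN of the previous file.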
+ level0_deltas.sort_by_key(|l| l.get_lsn_range().start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.get_lsn_range().end; + let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)]; + for l in level0_deltas_iter { + let lsn_range = l.get_lsn_range(); + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(Arc::clone(l)); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact.first().unwrap().get_lsn_range().start, + end: deltas_to_compact.last().unwrap().get_lsn_range().end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + for l in deltas_to_compact.iter() { + info!("compact includes {}", l.filename().display()); + } + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = deltas_to_compact + .iter() + .map(|l| l.iter()) + .kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = deltas_to_compact + .iter() + .map(|l| l.key_iter()) + .kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. 
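+        // Bookkeeping for the write loop below: `writer` holds the delta layer
+        // currently being built, and `dup_start_lsn`/`dup_end_lsn` are only valid
+        // while a single oversized key is being split across several layers along
+        // the LSN dimension (the right-hand picture above).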
+ let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + for x in all_values_iter { + let (key, lsn, value) = x?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + // check if key cause layer overflow... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?); + writer = None; + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + )?); + } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + }); + + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + // Sync layers + if !new_layers.is_empty() { + let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); + + // also sync the directory + layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + + // Fsync all the layer files and directory using multiple threads to + // minimize latency. + par_fsync::par_fsync(&layer_paths)?; + + layer_paths.pop().unwrap(); + } + + let mut layers = self.layers.write().unwrap(); + let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); + for l in new_layers { + let new_delta_path = l.path(); + + let metadata = new_delta_path.metadata()?; + + // update the timeline's physical size + self.metrics.current_physical_size_gauge.add(metadata.len()); + + new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + let mut layer_paths_do_delete = HashSet::with_capacity(deltas_to_compact.len()); + drop(all_keys_iter); + for l in deltas_to_compact { + if let Some(path) = l.local_path() { + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + layer_paths_do_delete.insert(path); + } + l.delete()?; + layers.remove_historic(l); + } + drop(layers); + + if self.can_upload_layers() { + storage_sync::schedule_layer_upload( + self.tenant_id, + self.timeline_id, + new_layer_paths, + None, + ); + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_do_delete, + ); + } + + Ok(()) + } + + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. + /// + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. + /// + /// The caller specifies how much history is needed with the 3 arguments: + /// + /// retain_lsns: keep a version of each page at these LSNs + /// cutoff_horizon: also keep everything newer than this LSN + /// pitr: the time duration required to keep data for PITR + /// + /// The 'retain_lsns' list is currently used to prevent removing files that + /// are needed by child timelines. In the future, the user might be able to + /// name additional points in time to retain. 
The caller is responsible for + /// collecting that information. + /// + /// The 'cutoff_horizon' point is used to retain recent versions that might still be + /// needed by read-only nodes. (As of this writing, the caller just passes + /// the latest LSN subtracted by a constant, and doesn't do anything smart + /// to figure out what read-only nodes might actually need.) + /// + /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine + /// whether a record is needed for PITR. + pub(super) fn update_gc_info( + &self, + retain_lsns: Vec, + cutoff_horizon: Lsn, + pitr: Duration, + ) -> anyhow::Result<()> { + let mut gc_info = self.gc_info.write().unwrap(); + + gc_info.horizon_cutoff = cutoff_horizon; + gc_info.retain_lsns = retain_lsns; + + // Calculate pitr cutoff point. + // If we cannot determine a cutoff LSN, be conservative and don't GC anything. + let mut pitr_cutoff_lsn: Lsn; + + if pitr != Duration::ZERO { + // conservative, safe default is to remove nothing, when we have no + // commit timestamp data available + pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); + + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + let now = SystemTime::now(); + if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { + let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); + + match self.find_lsn_for_timestamp(pitr_timestamp)? { + LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + LsnForTimestamp::Future(lsn) => { + debug!("future({})", lsn); + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + LsnForTimestamp::Past(lsn) => { + debug!("past({})", lsn); + } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } + } + debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } + } else { + // No time-based retention. (Some unit tests depend on garbage-collection + // working even when CLOG data is missing, so that find_lsn_for_timestamp() + // above doesn't work.) + pitr_cutoff_lsn = gc_info.horizon_cutoff; + } + gc_info.pitr_cutoff = pitr_cutoff_lsn; + + Ok(()) + } + + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// + pub(super) fn gc(&self) -> anyhow::Result { + let mut result: GcResult = GcResult::default(); + let now = SystemTime::now(); + + fail_point!("before-timeline-gc"); + + let _layer_removal_cs = self.layer_removal_cs.lock().unwrap(); + + let gc_info = self.gc_info.read().unwrap(); + + let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.pitr_cutoff; + let retain_lsns = &gc_info.retain_lsns; + + let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + + let _enter = + info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff) + .entered(); + + // Nothing to GC. Return early. + let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + if latest_gc_cutoff >= new_gc_cutoff { + info!( + "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}", + ); + return Ok(result); + } + + // We need to ensure that no one tries to read page versions or create + // branches at a point before latest_gc_cutoff_lsn. See branch_timeline() + // for details. 
This will block until the old value is no longer in use. + // + // The GC cutoff should only ever move forwards. + { + let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); + ensure!( + *write_guard <= new_gc_cutoff, + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, + new_gc_cutoff + ); + write_guard.store_and_unlock(new_gc_cutoff).wait(); + } + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + + info!("GC starting"); + + debug!("retain_lsns: {:?}", retain_lsns); + + let mut layers_to_remove = Vec::new(); + + // Scan all on-disk layers in the timeline. + // + // Garbage collect the layer if all conditions are satisfied: + // 1. it is older than cutoff LSN; + // 2. it is older than PITR interval; + // 3. it doesn't need to be retained for 'retain_lsns'; + // 4. newer on-disk image layers cover the layer's whole key range + // + let mut layers = self.layers.write().unwrap(); + 'outer: for l in layers.iter_historic_layers() { + // This layer is in the process of being flushed to disk. + // It will be swapped out of the layer map, replaced with + // on-disk layers containing the same data. + // We can't GC it, as it's not on disk. We can't remove it + // from the layer map yet, as it would make its data + // inaccessible. + if l.is_in_memory() { + continue; + } + + result.layers_total += 1; + + // 1. Is it newer than GC horizon cutoff point? + if l.get_lsn_range().end > horizon_cutoff { + debug!( + "keeping {} because it's newer than horizon_cutoff {}", + l.filename().display(), + horizon_cutoff + ); + result.layers_needed_by_cutoff += 1; + continue 'outer; + } + + // 2. It is newer than PiTR cutoff point? + if l.get_lsn_range().end > pitr_cutoff { + debug!( + "keeping {} because it's newer than pitr_cutoff {}", + l.filename().display(), + pitr_cutoff + ); + result.layers_needed_by_pitr += 1; + continue 'outer; + } + + // 3. Is it needed by a child branch? + // NOTE With that we would keep data that + // might be referenced by child branches forever. + // We can track this in child timeline GC and delete parent layers when + // they are no longer needed. This might be complicated with long inheritance chains. + for retain_lsn in retain_lsns { + // start_lsn is inclusive + if &l.get_lsn_range().start <= retain_lsn { + debug!( + "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", + l.filename().display(), + retain_lsn, + l.is_incremental(), + ); + result.layers_needed_by_branches += 1; + continue 'outer; + } + } + + // 4. Is there a later on-disk layer for this relation? + // + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + // + // For example, imagine that the following layers exist: + // + // 1000 - image (A) + // 1000-2000 - delta (B) + // 2000 - image (C) + // 2000-3000 - delta (D) + // 3000 - image (E) + // + // If GC horizon is at 2500, we can remove layers A and B, but + // we cannot remove C, even though it's older than 2500, because + // the delta layer 2000-3000 depends on it. + if !layers + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? 
+ { + debug!( + "keeping {} because it is the latest layer", + l.filename().display() + ); + result.layers_not_updated += 1; + continue 'outer; + } + + // We didn't find any reason to keep this file, so remove it. + debug!( + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), + l.is_incremental(), + ); + layers_to_remove.push(Arc::clone(&l)); + } + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + if let Some(path) = doomed_layer.local_path() { + self.metrics + .current_physical_size_gauge + .sub(path.metadata()?.len()); + layer_paths_to_delete.insert(path); + } + doomed_layer.delete()?; + layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + info!( + "GC completed removing {} layers, cutoff {}", + result.layers_removed, new_gc_cutoff + ); + + if result.layers_removed != 0 { + fail_point!("after-timeline-gc-removed-layers"); + } + + if self.can_upload_layers() { + storage_sync::schedule_layer_delete( + self.tenant_id, + self.timeline_id, + layer_paths_to_delete, + ); + } + + result.elapsed = now.elapsed()?; + Ok(result) + } + + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. + /// + fn reconstruct_value( + &self, + key: Key, + request_lsn: Lsn, + mut data: ValueReconstructState, + ) -> anyhow::Result { + // Perform WAL redo if needed + data.records.reverse(); + + // If we have a page image, and no WAL, we're all set + if data.records.is_empty() { + if let Some((img_lsn, img)) = &data.img { + trace!( + "found page image for key {} at {}, no WAL redo required", + key, + img_lsn + ); + Ok(img.clone()) + } else { + bail!("base image for {} at {} not found", key, request_lsn); + } + } else { + // We need to do WAL redo. + // + // If we don't have a base image, then the oldest WAL record better initialize + // the page + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, + request_lsn, + data.records.len() + ); + } else { + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); + Some(img) + } else { + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); + None + }; + + let last_rec_lsn = data.records.last().unwrap().0; + + let img = self + .walredo_mgr + .request_redo(key, request_lsn, base_img, data.records, self.pg_version) + .context("Failed to reconstruct a page image:")?; + + if img.len() == page_cache::PAGE_SZ { + let cache = page_cache::get(); + cache + .memorize_materialized_page( + self.tenant_id, + self.timeline_id, + key, + last_rec_lsn, + &img, + ) + .context("Materialized page memoization failed")?; + } + + Ok(img) + } + } + } + + fn can_upload_layers(&self) -> bool { + self.upload_layers.load(atomic::Ordering::Relaxed) + && self.current_state() != TimelineState::Broken + } +} + +/// Helper function for get_reconstruct_data() to add the path of layers traversed +/// to an error, as anyhow context information. 
+fn layer_traversal_error( + msg: String, + path: Vec<(ValueReconstructResult, Lsn, Arc)>, +) -> anyhow::Result<()> { + // We want the original 'msg' to be the outermost context. The outermost context + // is the most high-level information, which also gets propagated to the client. + let mut msg_iter = path + .iter() + .map(|(r, c, l)| { + format!( + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ) + }) + .chain(std::iter::once(msg)); + // Construct initial message from the first traversed layer + let err = anyhow!(msg_iter.next().unwrap()); + + // Append all subsequent traversals, and the error message 'msg', as contexts. + Err(msg_iter.fold(err, |err, msg| err.context(msg))) +} + +/// Various functions to mutate the timeline. +// TODO Currently, Deref is used to allow easy access to read methods from this trait. +// This is probably considered a bad practice in Rust and should be fixed eventually, +// but will cause large code changes. +pub struct TimelineWriter<'a> { + tl: &'a Timeline, + _write_guard: MutexGuard<'a, ()>, +} + +impl Deref for TimelineWriter<'_> { + type Target = Timeline; + + fn deref(&self) -> &Self::Target { + self.tl + } +} + +impl<'a> TimelineWriter<'a> { + /// Put a new page version that can be constructed from a WAL record + /// + /// This will implicitly extend the relation, if the page is beyond the + /// current end-of-file. + pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> { + self.tl.put_value(key, lsn, value) + } + + pub fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { + self.tl.put_tombstone(key_range, lsn) + } + + /// Track the end of the latest digested WAL record. + /// Remember the (end of) last valid WAL record remembered in the timeline. + /// + /// Call this after you have finished writing all the WAL up to 'lsn'. + /// + /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for + /// the 'lsn' or anything older. The previous last record LSN is stored alongside + /// the latest and can be read. + pub fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); + } + + pub fn update_current_logical_size(&self, delta: i64) { + self.tl.update_current_logical_size(delta) + } +} + +/// Add a suffix to a layer file's name: .{num}.old +/// Uses the first available num (starts at 0) +fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { + let filename = path + .file_name() + .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))? + .to_string_lossy(); + let mut new_path = path.clone(); + + for i in 0u32.. { + new_path.set_file_name(format!("{}.{}.old", filename, i)); + if !new_path.exists() { + std::fs::rename(&path, &new_path)?; + return Ok(()); + } + } + + bail!("couldn't find an unused backup number for {:?}", path) +} diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs new file mode 100644 index 0000000000..dd3792450d --- /dev/null +++ b/pageserver/src/tenant_config.rs @@ -0,0 +1,244 @@ +//! Functions for handling per-tenant configuration options +//! +//! If tenant is created with --config option, +//! the tenant-specific config will be stored in tenant's directory. +//! Otherwise, global pageserver's config is used. +//! +//! If the tenant config file is corrupted, the tenant will be disabled. +//! We cannot use global or default config instead, because wrong settings +//! may lead to a data loss. +//! 
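+//! The sketch below is illustrative only (it is not compiled as a doctest) and uses
+//! nothing beyond `TenantConfOpt`, `TenantConfOpt::merge` and `TenantConf::default`
+//! as defined in this file: per-tenant overrides are layered on top of the global
+//! defaults to produce the effective configuration.
+//!
+//! ```ignore
+//! // Overrides parsed from the tenant's config file; unset fields stay `None`.
+//! let overrides = TenantConfOpt {
+//!     gc_horizon: Some(128 * 1024 * 1024),
+//!     ..TenantConfOpt::default()
+//! };
+//! // Effective configuration: set fields win, everything else falls back to defaults.
+//! let effective: TenantConf = overrides.merge(TenantConf::default());
+//! assert_eq!(effective.gc_horizon, 128 * 1024 * 1024);
+//! assert_eq!(effective.compaction_threshold, TenantConf::default().compaction_threshold);
+//! ```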
+use serde::{Deserialize, Serialize}; +use std::num::NonZeroU64; +use std::time::Duration; + +pub mod defaults { + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + pub const DEFAULT_GC_PERIOD: &str = "100 s"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds"; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; +} + +/// Per-tenant configuration options +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct TenantConf { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. + pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + pub checkpoint_timeout: Duration, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + #[serde(with = "humantime_serde")] + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. + pub compaction_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + #[serde(with = "humantime_serde")] + pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. + #[serde(with = "humantime_serde")] + pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. 
+ /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, + pub trace_read_requests: bool, +} + +/// Same as TenantConf, but this struct preserves the information about +/// which parameters are set and which are not. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +pub struct TenantConfOpt { + pub checkpoint_distance: Option, + pub checkpoint_timeout: Option, + pub compaction_target_size: Option, + #[serde(with = "humantime_serde")] + pub compaction_period: Option, + pub compaction_threshold: Option, + pub gc_horizon: Option, + #[serde(with = "humantime_serde")] + pub gc_period: Option, + pub image_creation_threshold: Option, + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, + pub trace_read_requests: Option, +} + +impl TenantConfOpt { + pub fn merge(&self, global_conf: TenantConf) -> TenantConf { + TenantConf { + checkpoint_distance: self + .checkpoint_distance + .unwrap_or(global_conf.checkpoint_distance), + checkpoint_timeout: self + .checkpoint_timeout + .unwrap_or(global_conf.checkpoint_timeout), + compaction_target_size: self + .compaction_target_size + .unwrap_or(global_conf.compaction_target_size), + compaction_period: self + .compaction_period + .unwrap_or(global_conf.compaction_period), + compaction_threshold: self + .compaction_threshold + .unwrap_or(global_conf.compaction_threshold), + gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), + gc_period: self.gc_period.unwrap_or(global_conf.gc_period), + image_creation_threshold: self + .image_creation_threshold + .unwrap_or(global_conf.image_creation_threshold), + pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + walreceiver_connect_timeout: self + .walreceiver_connect_timeout + .unwrap_or(global_conf.walreceiver_connect_timeout), + lagging_wal_timeout: self + .lagging_wal_timeout + .unwrap_or(global_conf.lagging_wal_timeout), + max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), + trace_read_requests: self + .trace_read_requests + .unwrap_or(global_conf.trace_read_requests), + } + } + + pub fn update(&mut self, other: &TenantConfOpt) { + if let Some(checkpoint_distance) = other.checkpoint_distance { + self.checkpoint_distance = Some(checkpoint_distance); + } + if let Some(checkpoint_timeout) = other.checkpoint_timeout { + self.checkpoint_timeout = Some(checkpoint_timeout); + } + if let Some(compaction_target_size) = other.compaction_target_size { + self.compaction_target_size = Some(compaction_target_size); + } + if let Some(compaction_period) = other.compaction_period { + self.compaction_period = Some(compaction_period); + } + if let Some(compaction_threshold) = other.compaction_threshold { + self.compaction_threshold = Some(compaction_threshold); + } + if let Some(gc_horizon) = other.gc_horizon { + self.gc_horizon = Some(gc_horizon); + } + if let Some(gc_period) = other.gc_period { + self.gc_period = Some(gc_period); + } + if let Some(image_creation_threshold) = other.image_creation_threshold { + self.image_creation_threshold = Some(image_creation_threshold); + } + if let Some(pitr_interval) = other.pitr_interval { + self.pitr_interval = Some(pitr_interval); + } + if let Some(walreceiver_connect_timeout) = 
other.walreceiver_connect_timeout { + self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); + } + if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { + self.lagging_wal_timeout = Some(lagging_wal_timeout); + } + if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { + self.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } + } +} + +impl TenantConf { + pub fn default() -> TenantConf { + use defaults::*; + + TenantConf { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + trace_read_requests: false, + } + } + + #[cfg(test)] + pub fn dummy_conf() -> Self { + TenantConf { + checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: Duration::from_secs(600), + compaction_target_size: 4 * 1024 * 1024, + compaction_period: Duration::from_secs(10), + compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, + gc_horizon: defaults::DEFAULT_GC_HORIZON, + gc_period: Duration::from_secs(10), + image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: Duration::from_secs(60 * 60), + walreceiver_connect_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .unwrap(), + lagging_wal_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, + ) + .unwrap(), + max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .unwrap(), + trace_read_requests: false, + } + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d60b5fefd3..3766bc5cb3 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,288 +1,870 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. 
-use crate::branches; -use crate::config::PageServerConf; -use crate::layered_repository::LayeredRepository; -use crate::repository::{Repository, Timeline, TimelineSyncState}; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; -use crate::walredo::PostgresRedoManager; -use crate::CheckpointConfig; -use anyhow::{bail, Context, Result}; -use lazy_static::lazy_static; -use log::*; -use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap}; -use std::fmt; -use std::sync::{Arc, Mutex, MutexGuard}; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use std::ffi::OsStr; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; -lazy_static! { - static ref TENANTS: Mutex> = Mutex::new(HashMap::new()); -} +use anyhow::Context; +use tracing::*; -struct Tenant { - state: TenantState, - repo: Arc, -} +use remote_storage::GenericRemoteStorage; -#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] -pub enum TenantState { - // All data for this tenant is complete on local disk, but we haven't loaded the Repository, - // Timeline and Layer structs into memory yet, so it cannot be accessed yet. - //Ready, - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - Active, - // Tenant is active, but there is no walreceiver connection. - Idle, - // This tenant exists on local disk, and the layer map has been loaded into memory. - // The local disk might have some newer files that don't exist in cloud storage yet. - // The tenant cannot be accessed anymore for any reason, but graceful shutdown. - Stopping, -} +use crate::config::{PageServerConf, METADATA_FILE_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; +use crate::http::models::TenantInfo; +use crate::storage_sync::index::{LayerFileMetadata, RemoteIndex, RemoteTimelineIndex}; +use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData, TimelineLocalFiles}; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::{ + ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState, +}; +use crate::tenant_config::TenantConfOpt; +use crate::walredo::PostgresRedoManager; +use crate::TEMP_FILE_SUFFIX; -impl fmt::Display for TenantState { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - TenantState::Active => f.write_str("Active"), - TenantState::Idle => f.write_str("Idle"), - TenantState::Stopping => f.write_str("Stopping"), - } +use utils::crashsafe::{self, path_with_suffix_extension}; +use utils::id::{TenantId, TimelineId}; + +mod tenants_state { + use once_cell::sync::Lazy; + use std::{ + collections::HashMap, + sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, + }; + use utils::id::TenantId; + + use crate::tenant::Tenant; + + static TENANTS: Lazy>>> = + Lazy::new(|| RwLock::new(HashMap::new())); + + pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { + TENANTS + .read() + .expect("Failed to read() tenants lock, it got poisoned") + } + + pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { + TENANTS + .write() + .expect("Failed to write() tenants lock, it got poisoned") } } -fn access_tenants() -> MutexGuard<'static, HashMap> { - TENANTS.lock().unwrap() -} - -/// Updates tenants' repositories, changing their timelines state in memory. -pub fn set_timeline_states( +/// Initialize repositories with locally available timelines. 
+/// Timelines that are only partially available locally (remote storage has more data than this pageserver) +/// are scheduled for download and added to the tenant once download is completed. +pub fn init_tenant_mgr( conf: &'static PageServerConf, - timeline_states: HashMap>, -) { - if timeline_states.is_empty() { - debug!("no timeline state updates to perform"); - return; - } + remote_storage: Option, +) -> anyhow::Result { + let _entered = info_span!("init_tenant_mgr").entered(); - info!("Updating states for {} timelines", timeline_states.len()); - trace!("States: {:?}", timeline_states); + let local_tenant_files = local_tenant_timeline_files(conf) + .context("Failed to collect local tenant timeline files")?; - let mut m = access_tenants(); - for (tenant_id, timeline_states) in timeline_states { - let tenant = m.entry(tenant_id).or_insert_with(|| { - // TODO (rodionov) reuse one of the initialisation routines - // Set up a WAL redo manager, for applying WAL records. - let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); - - // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(LayeredRepository::new( - conf, - Arc::new(walredo_mgr), - tenant_id, - conf.remote_storage_config.is_some(), - )); - Tenant { - state: TenantState::Idle, - repo, + let (remote_index, tenants_to_attach) = if let Some(storage) = remote_storage { + let storage_config = conf + .remote_storage_config + .as_ref() + .expect("remote storage without config"); + let mut broken_tenants = HashMap::new(); + let mut ready_tenants = HashMap::new(); + for (tenant_id, tenant_attach_data) in local_tenant_files.into_iter() { + match tenant_attach_data { + TenantAttachData::Ready(t) => { + ready_tenants.insert(tenant_id, t); + } + TenantAttachData::Broken(e) => { + broken_tenants.insert(tenant_id, TenantAttachData::Broken(e)); + } } - }); - if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) { - error!( - "Failed to update timeline states for tenant {}: {:?}", - tenant_id, e - ); } - } + let SyncStartupData { + remote_index, + local_timeline_init_statuses, + } = storage_sync::spawn_storage_sync_task( + conf, + ready_tenants, + storage, + storage_config.max_concurrent_syncs, + storage_config.max_sync_errors, + ) + .context("Failed to spawn the storage sync thread")?; + + let n = local_timeline_init_statuses.0.len(); + let mut synced_timelines = local_timeline_init_statuses.0.into_iter().fold( + HashMap::::with_capacity(n), + |mut new_values, (tenant_id, old_values)| { + let new_timeline_values = new_values + .entry(tenant_id) + .or_insert_with(|| TenantAttachData::Ready(HashMap::new())); + if let TenantAttachData::Ready(t) = new_timeline_values { + for (timeline_id, old_value) in old_values { + if let LocalTimelineInitStatus::LocallyComplete(metadata) = old_value { + t.insert(timeline_id, TimelineLocalFiles::ready(metadata)); + } + } + } + new_values + }, + ); + synced_timelines.extend(broken_tenants); + + (remote_index, synced_timelines) + } else { + info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled"); + (RemoteIndex::default(), local_tenant_files) + }; + attach_local_tenants(conf, &remote_index, tenants_to_attach); + + Ok(remote_index) } -fn put_timelines_into_tenant( - tenant: &mut Tenant, - tenant_id: ZTenantId, - timeline_states: HashMap, -) -> anyhow::Result<()> { - for (timeline_id, timeline_state) in timeline_states { - // If the timeline is being put into any other state than Ready, - 
// stop any threads operating on it. - // - // FIXME: This is racy. A page service thread could just get - // handle on the Timeline, before we call set_timeline_state() - if !matches!(timeline_state, TimelineSyncState::Ready(_)) { - thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); +/// Reads local files to load tenants and their timelines given into pageserver's memory. +/// Ignores other timelines that might be present for tenant, but were not passed as a parameter. +/// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken", +/// and the load continues. +/// +/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully. +/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines. +/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before). +/// +/// Attach happens on startup and sucessful timeline downloads +/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered). +pub fn attach_local_tenants( + conf: &'static PageServerConf, + remote_index: &RemoteIndex, + tenants_to_attach: HashMap, +) { + let _entered = info_span!("attach_local_tenants").entered(); + let number_of_tenants = tenants_to_attach.len(); - // Should we run a final checkpoint to flush all the data to - // disk? Doesn't seem necessary; all of the states other than - // Ready imply that the data on local disk is corrupt or incomplete, - // and we don't want to flush that to disk. + for (tenant_id, local_timelines) in tenants_to_attach { + let mut tenants_accessor = tenants_state::write_tenants(); + let tenant = match tenants_accessor.entry(tenant_id) { + hash_map::Entry::Occupied(o) => { + info!("Tenant {tenant_id} was found in pageserver's memory"); + Arc::clone(o.get()) + } + hash_map::Entry::Vacant(v) => { + info!("Tenant {tenant_id} was not found in pageserver's memory, loading it"); + let tenant = Arc::new(Tenant::new( + conf, + TenantConfOpt::default(), + Arc::new(PostgresRedoManager::new(conf, tenant_id)), + tenant_id, + remote_index.clone(), + conf.remote_storage_config.is_some(), + )); + match local_timelines { + TenantAttachData::Broken(_) => { + tenant.set_state(TenantState::Broken); + } + TenantAttachData::Ready(_) => { + match Tenant::load_tenant_config(conf, tenant_id) { + Ok(tenant_conf) => { + tenant.update_tenant_config(tenant_conf); + tenant.activate(false); + } + Err(e) => { + error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}"); + tenant.set_state(TenantState::Broken); + } + }; + } + } + v.insert(Arc::clone(&tenant)); + tenant + } + }; + drop(tenants_accessor); + match local_timelines { + TenantAttachData::Broken(e) => warn!("{}", e), + TenantAttachData::Ready(ref timelines) => { + info!("Attaching {} timelines for {tenant_id}", timelines.len()); + debug!("Timelines to attach: {local_timelines:?}"); + let has_timelines = !timelines.is_empty(); + let timelines_to_attach = timelines + .iter() + .map(|(&k, v)| (k, v.metadata().to_owned())) + .collect(); + match tenant.init_attach_timelines(timelines_to_attach) { + Ok(()) => { + info!("successfully loaded local timelines for tenant {tenant_id}"); + tenant.activate(has_timelines); + } + Err(e) => { + error!("Failed to attach tenant 
timelines: {e:?}"); + tenant.set_state(TenantState::Broken); + } + } + } } - - tenant - .repo - .set_timeline_state(timeline_id, timeline_state) - .with_context(|| { - format!( - "Failed to update timeline {} state to {:?}", - timeline_id, timeline_state - ) - })?; } - Ok(()) + info!("Processed {number_of_tenants} local tenants during attach") } /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// -pub fn shutdown_all_tenants() { - let mut m = access_tenants(); - let mut tenantids = Vec::new(); - for (tenantid, tenant) in m.iter_mut() { - tenant.state = TenantState::Stopping; - tenantids.push(*tenantid) - } - drop(m); +pub async fn shutdown_all_tenants() { + let tenants_to_shut_down = { + let mut m = tenants_state::write_tenants(); + let mut tenants_to_shut_down = Vec::with_capacity(m.len()); + for (_, tenant) in m.drain() { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_state(TenantState::Paused); + tenants_to_shut_down.push(tenant) + } + } + drop(m); + tenants_to_shut_down + }; - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None); + // Shut down all existing walreceiver connections and stop accepting the new ones. + task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - // Ok, no background threads running anymore. Flush any remaining data in + // Ok, no background tasks running anymore. Flush any remaining data in // memory to disk. // // We assume that any incoming connections that might request pages from - // the repository have already been terminated by the caller, so there + // the tenant have already been terminated by the caller, so there // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. 
- for tenantid in tenantids { - debug!("shutdown tenant {}", tenantid); - match get_repository_for_tenant(tenantid) { - Ok(repo) => { - if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) { - error!( - "Could not checkpoint tenant {} during shutdown: {:?}", - tenantid, err - ); - } - } - Err(err) => { - error!( - "Could not get repository for tenant {} during shutdown: {:?}", - tenantid, err - ); - } + for tenant in tenants_to_shut_down { + let tenant_id = tenant.tenant_id(); + debug!("shutdown tenant {tenant_id}"); + + if let Err(err) = tenant.checkpoint().await { + error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); } } } -pub fn create_repository_for_tenant( +fn create_tenant_files( conf: &'static PageServerConf, - tenantid: ZTenantId, -) -> Result<()> { - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?; + tenant_conf: TenantConfOpt, + tenant_id: TenantId, +) -> anyhow::Result<()> { + let target_tenant_directory = conf.tenant_path(&tenant_id); + anyhow::ensure!( + !target_tenant_directory.exists(), + "cannot create new tenant repo: '{tenant_id}' directory already exists", + ); - match access_tenants().entry(tenantid) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), + let temporary_tenant_dir = + path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); + debug!( + "Creating temporary directory structure in {}", + temporary_tenant_dir.display() + ); + + // top-level dir may exist if we are creating it through CLI + crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { + format!( + "could not create temporary tenant directory {}", + temporary_tenant_dir.display() + ) + })?; + + let creation_result = try_create_target_tenant_dir( + conf, + tenant_conf, + tenant_id, + &temporary_tenant_dir, + &target_tenant_directory, + ); + + if creation_result.is_err() { + error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data"); + if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) { + error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") + } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { + error!( + "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" + ) + } + } + + creation_result +} + +fn try_create_target_tenant_dir( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, + temporary_tenant_dir: &Path, + target_tenant_directory: &Path, +) -> Result<(), anyhow::Error> { + let temporary_tenant_timelines_dir = rebase_directory( + &conf.timelines_path(&tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + let temporary_tenant_config_path = rebase_directory( + &conf.tenant_config_path(tenant_id), + target_tenant_directory, + temporary_tenant_dir, + ) + .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( + || { + format!( + "Failed to write tenant {} config to {}", + tenant_id, + temporary_tenant_config_path.display() + ) + }, + )?; + crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { + format!( + "could not create tenant {} temporary timelines directory {}", + tenant_id, + 
temporary_tenant_timelines_dir.display() + ) + })?; + fail::fail_point!("tenant-creation-before-tmp-rename", |_| { + anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); + }); + + fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { + format!( + "failed to move tenant {} temporary directory {} into the permanent one {}", + tenant_id, + temporary_tenant_dir.display(), + target_tenant_directory.display() + ) + })?; + let target_dir_parent = target_tenant_directory.parent().with_context(|| { + format!( + "Failed to get tenant {} dir parent for {}", + tenant_id, + target_tenant_directory.display() + ) + })?; + crashsafe::fsync(target_dir_parent).with_context(|| { + format!( + "Failed to fsync renamed directory's parent {} for tenant {}", + target_dir_parent.display(), + tenant_id, + ) + })?; + + Ok(()) +} + +fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result { + let relative_path = original_path.strip_prefix(base).with_context(|| { + format!( + "Failed to strip base prefix '{}' off path '{}'", + base.display(), + original_path.display() + ) + })?; + Ok(new_base.join(relative_path)) +} + +pub fn create_tenant( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, + remote_index: RemoteIndex, +) -> anyhow::Result> { + match tenants_state::write_tenants().entry(tenant_id) { + hash_map::Entry::Occupied(_) => { + debug!("tenant {tenant_id} already exists"); + Ok(None) + } hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: TenantState::Idle, - repo, - }); + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id)); + create_tenant_files(conf, tenant_conf, tenant_id)?; + let tenant = Arc::new(Tenant::new( + conf, + tenant_conf, + wal_redo_manager, + tenant_id, + remote_index, + conf.remote_storage_config.is_some(), + )); + tenant.activate(false); + v.insert(tenant); + Ok(Some(tenant_id)) } } +} + +pub fn update_tenant_config( + conf: &'static PageServerConf, + tenant_conf: TenantConfOpt, + tenant_id: TenantId, +) -> anyhow::Result<()> { + info!("configuring tenant {tenant_id}"); + get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); + Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + Ok(()) +} + +/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. +/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. 
+pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { + let m = tenants_state::read_tenants(); + let tenant = m + .get(&tenant_id) + .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; + if active_only && !tenant.is_active() { + anyhow::bail!("Tenant {tenant_id} is not active") + } else { + Ok(Arc::clone(tenant)) + } +} + +pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { + // Start with the shutdown of timeline tasks (this shuts down the walreceiver) + // It is important that we do not take locks here, and do not check whether the timeline exists + // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join + // we cannot create new timelines and tenants, and that can take quite some time, + // it can even become stuck due to a bug making whole pageserver unavailable for some operations + // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation + // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests + // will synchronize and either fail with the not found error or succeed + + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); + + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; + info!("timeline task shutdown completed"); + match get_tenant(tenant_id, true) { + Ok(tenant) => { + tenant.delete_timeline(timeline_id)?; + if tenant.list_timelines().is_empty() { + tenant.activate(false); + } + } + Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), + } Ok(()) } -pub fn get_tenant_state(tenantid: ZTenantId) -> Option { - Some(access_tenants().get(&tenantid)?.state) -} +pub async fn detach_tenant( + conf: &'static PageServerConf, + tenant_id: TenantId, +) -> anyhow::Result<()> { + let tenant = match { + let mut tenants_accessor = tenants_state::write_tenants(); + tenants_accessor.remove(&tenant_id) + } { + Some(tenant) => tenant, + None => anyhow::bail!("Tenant not found for id {tenant_id}"), + }; -/// -/// Change the state of a tenant to Active and launch its checkpointer and GC -/// threads. If the tenant was already in Active state or Stopping, does nothing. -/// -pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> { - let mut m = access_tenants(); - let tenant = m - .get_mut(&tenantid) - .with_context(|| format!("Tenant not found for id {}", tenantid))?; + tenant.set_state(TenantState::Paused); + // shutdown all tenant and timeline tasks: gc, compaction, page service) + task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - info!("activating tenant {}", tenantid); + // If removal fails there will be no way to successfully retry detach, + // because the tenant no longer exists in the in-memory map. And it needs to be removed from it + // before we remove files, because it contains references to tenant + // which references ephemeral files which are deleted on drop. So if we keep these references, + // we will attempt to remove files which no longer exist. 
This can be fixed by having shutdown + // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files + let local_tenant_directory = conf.tenant_path(&tenant_id); + fs::remove_dir_all(&local_tenant_directory).with_context(|| { + format!( + "Failed to remove local tenant directory '{}'", + local_tenant_directory.display() + ) + })?; - match tenant.state { - // If the tenant is already active, nothing to do. - TenantState::Active => {} - - // If it's Idle, launch the checkpointer and GC threads - TenantState::Idle => { - thread_mgr::spawn( - ThreadKind::Checkpointer, - Some(tenantid), - None, - "Checkpointer thread", - move || crate::tenant_threads::checkpoint_loop(tenantid, conf), - )?; - - // FIXME: if we fail to launch the GC thread, but already launched the - // checkpointer, we're in a strange state. - - thread_mgr::spawn( - ThreadKind::GarbageCollector, - Some(tenantid), - None, - "GC thread", - move || crate::tenant_threads::gc_loop(tenantid, conf), - )?; - - tenant.state = TenantState::Active; - } - - TenantState::Stopping => { - // don't re-activate it if it's being stopped - } - } Ok(()) } -pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { - let m = access_tenants(); - let tenant = m - .get(&tenantid) - .with_context(|| format!("Tenant not found for tenant {}", tenantid))?; - - Ok(Arc::clone(&tenant.repo)) -} - -pub fn get_timeline_for_tenant( - tenantid: ZTenantId, - timelineid: ZTimelineId, -) -> Result> { - get_repository_for_tenant(tenantid)? - .get_timeline(timelineid)? - .local_timeline() - .with_context(|| format!("cannot fetch timeline {}", timelineid)) -} - -#[derive(Serialize, Deserialize, Clone)] -pub struct TenantInfo { - #[serde(with = "hex")] - pub id: ZTenantId, - pub state: TenantState, -} - -pub fn list_tenants() -> Result> { - access_tenants() +/// +/// Get list of tenants, for the mgmt API +/// +pub fn list_tenant_info(remote_index: &RemoteTimelineIndex) -> Vec { + tenants_state::read_tenants() .iter() - .map(|v| { - let (id, tenant) = v; - Ok(TenantInfo { + .map(|(id, tenant)| { + let has_in_progress_downloads = remote_index + .tenant_entry(id) + .map(|entry| entry.has_in_progress_downloads()); + + // TODO this is not correct when we might have remote storage sync disabled: + // we keep `RemoteTimelineIndex` in memory anyway for simplicity and this error message is printed still + if has_in_progress_downloads.is_none() { + error!("timeline is not found in remote index while it is present in the tenants registry") + } + + TenantInfo { id: *id, - state: tenant.state, - }) + state: tenant.current_state(), + current_physical_size: None, + has_in_progress_downloads, + } }) .collect() } + +#[derive(Debug)] +pub enum TenantAttachData { + Ready(HashMap), + Broken(anyhow::Error), +} +/// Attempts to collect information about all tenant and timelines, existing on the local FS. +/// If finds any, deletes all temporary files and directories, created before. Also removes empty directories, +/// that may appear due to such removals. +/// Does not fail on particular timeline or tenant collection errors, rather logging them and ignoring the entities. 
+fn local_tenant_timeline_files( + config: &'static PageServerConf, +) -> anyhow::Result> { + let _entered = info_span!("local_tenant_timeline_files").entered(); + + let mut local_tenant_timeline_files = HashMap::new(); + let tenants_dir = config.tenants_path(); + for tenants_dir_entry in fs::read_dir(&tenants_dir) + .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + let tenant_dir_path = tenants_dir_entry.path(); + if is_temporary(&tenant_dir_path) { + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); + if let Err(e) = fs::remove_dir_all(&tenant_dir_path) { + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + } else { + match collect_timelines_for_tenant(config, &tenant_dir_path) { + Ok((tenant_id, TenantAttachData::Broken(e))) => { + local_tenant_timeline_files.entry(tenant_id).or_insert(TenantAttachData::Broken(e)); + }, + Ok((tenant_id, TenantAttachData::Ready(collected_files))) => { + if collected_files.is_empty() { + match remove_if_empty(&tenant_dir_path) { + Ok(true) => info!("Removed empty tenant directory {}", tenant_dir_path.display()), + Ok(false) => { + // insert empty timeline entry: it has some non-temporary files inside that we cannot remove + // so make obvious for HTTP API callers, that something exists there and try to load the tenant + let _ = local_tenant_timeline_files.entry(tenant_id).or_insert_with(|| TenantAttachData::Ready(HashMap::new())); + }, + Err(e) => error!("Failed to remove empty tenant directory: {e:?}"), + } + } else { + match local_tenant_timeline_files.entry(tenant_id) { + hash_map::Entry::Vacant(entry) => { + entry.insert(TenantAttachData::Ready(collected_files)); + } + hash_map::Entry::Occupied(entry) =>{ + if let TenantAttachData::Ready(old_timelines) = entry.into_mut() { + old_timelines.extend(collected_files); + } + }, + } + } + }, + Err(e) => error!( + "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", + tenants_dir.display(), + tenants_dir_entry, + e + ), + } + } + } + Err(e) => error!( + "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + info!( + "Collected files for {} tenants", + local_tenant_timeline_files.len(), + ); + Ok(local_tenant_timeline_files) +} + +fn remove_if_empty(tenant_dir_path: &Path) -> anyhow::Result { + let directory_is_empty = tenant_dir_path + .read_dir() + .with_context(|| { + format!( + "Failed to read directory '{}' contents", + tenant_dir_path.display() + ) + })? 
+ .next() + .is_none(); + + if directory_is_empty { + fs::remove_dir_all(&tenant_dir_path).with_context(|| { + format!( + "Failed to remove empty directory '{}'", + tenant_dir_path.display(), + ) + })?; + + Ok(true) + } else { + Ok(false) + } +} + +fn is_temporary(path: &Path) -> bool { + match path.file_name() { + Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX), + None => false, + } +} + +fn is_uninit_mark(path: &Path) -> bool { + match path.file_name() { + Some(name) => name + .to_string_lossy() + .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + None => false, + } +} + +fn collect_timelines_for_tenant( + config: &'static PageServerConf, + tenant_path: &Path, +) -> anyhow::Result<(TenantId, TenantAttachData)> { + let tenant_id = tenant_path + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .context("Could not parse tenant id out of the tenant dir name")?; + let timelines_dir = config.timelines_path(&tenant_id); + + if !timelines_dir.as_path().is_dir() { + return Ok(( + tenant_id, + TenantAttachData::Broken(anyhow::anyhow!( + "Tenant {} has no timelines directory at {}", + tenant_id, + timelines_dir.display() + )), + )); + } + + let mut tenant_timelines = HashMap::new(); + for timelines_dir_entry in fs::read_dir(&timelines_dir) + .with_context(|| format!("Failed to list timelines dir entry for tenant {tenant_id}"))? + { + match timelines_dir_entry { + Ok(timelines_dir_entry) => { + let timeline_dir = timelines_dir_entry.path(); + if is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() + ); + if let Err(e) = fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_uninit_mark_file.display() + ) + })?; + let timeline_dir = config.timeline_path(&timeline_id, &tenant_id); + if let Err(e) = + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) + { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + } else { + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + config.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!("Found an uninit mark file for timeline {tenant_id}/{timeline_id}, removing the timeline and its uninit mark"); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + } else { + match collect_timeline_files(&timeline_dir) { + Ok((metadata, timeline_files)) => { + tenant_timelines.insert( + timeline_id, + TimelineLocalFiles::collected(metadata, timeline_files), + ); + } + Err(e) => { + error!( + "Failed to process timeline dir contents at '{}', reason: {:?}", + timeline_dir.display(), + e + ); + match 
remove_if_empty(&timeline_dir) { + Ok(true) => info!( + "Removed empty timeline directory {}", + timeline_dir.display() + ), + Ok(false) => (), + Err(e) => { + error!("Failed to remove empty timeline directory: {e:?}") + } + } + } + } + } + } + } + Err(e) => { + error!("Failed to list timelines for entry tenant {tenant_id}, reason: {e:?}") + } + } + } + + if tenant_timelines.is_empty() { + // this is normal, we've removed all broken, empty and temporary timeline dirs + // but should allow the tenant to stay functional and allow creating new timelines + // on a restart, we require tenants to have the timelines dir, so leave it on disk + debug!("Tenant {tenant_id} has no timelines loaded"); + } + + Ok((tenant_id, TenantAttachData::Ready(tenant_timelines))) +} + +fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { + fs::remove_dir_all(&timeline_dir) + .or_else(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + // we can leave the uninit mark without a timeline dir, + // just remove the mark then + Ok(()) + } else { + Err(e) + } + }) + .with_context(|| { + format!( + "Failed to remove unit marked timeline directory {}", + timeline_dir.display() + ) + })?; + fs::remove_file(&uninit_mark).with_context(|| { + format!( + "Failed to remove timeline uninit mark file {}", + uninit_mark.display() + ) + })?; + + Ok(()) +} + +// discover timeline files and extract timeline metadata +// NOTE: ephemeral files are excluded from the list +fn collect_timeline_files( + timeline_dir: &Path, +) -> anyhow::Result<(TimelineMetadata, HashMap)> { + let mut timeline_files = HashMap::new(); + let mut timeline_metadata_path = None; + + let timeline_dir_entries = + fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?; + for entry in timeline_dir_entries { + let entry_path = entry.context("Failed to list timeline dir entry")?.path(); + let metadata = entry_path.metadata()?; + + if metadata.is_file() { + if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) { + timeline_metadata_path = Some(entry_path); + } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) { + debug!("skipping ephemeral file {}", entry_path.display()); + continue; + } else if is_temporary(&entry_path) { + info!("removing temp timeline file at {}", entry_path.display()); + fs::remove_file(&entry_path).with_context(|| { + format!( + "failed to remove temp download file at {}", + entry_path.display() + ) + })?; + } else { + let layer_metadata = LayerFileMetadata::new(metadata.len()); + timeline_files.insert(entry_path, layer_metadata); + } + } + } + + // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed + // then attach is lost. There would be no retries for that, + // initial collect will fail because there is no metadata. + // We either need to start download if we see empty dir after restart or attach caller should + // be aware of that and retry attach if awaits_download for timeline switched from true to false + // but timelinne didn't appear locally. + // Check what happens with remote index in that case. 
+ let timeline_metadata_path = match timeline_metadata_path { + Some(path) => path, + None => anyhow::bail!("No metadata file found in the timeline directory"), + }; + let metadata = TimelineMetadata::from_bytes( + &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?, + ) + .context("Failed to parse timeline metadata file bytes")?; + + anyhow::ensure!( + metadata.ancestor_timeline().is_some() || !timeline_files.is_empty(), + "Timeline has no ancestor and no layer files" + ); + + Ok((metadata, timeline_files)) +} diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs new file mode 100644 index 0000000000..a24bdd5812 --- /dev/null +++ b/pageserver/src/tenant_tasks.rs @@ -0,0 +1,189 @@ +//! This module contains functions to serve per-tenant background processes, +//! such as compaction and GC + +use std::ops::ControlFlow; +use std::sync::Arc; +use std::time::Duration; + +use crate::metrics::TENANT_TASK_EVENTS; +use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::{Tenant, TenantState}; +use crate::tenant_mgr; +use tracing::*; +use utils::id::TenantId; + +pub fn start_background_loops(tenant_id: TenantId) { + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::Compaction, + Some(tenant_id), + None, + &format!("compactor for tenant {tenant_id}"), + false, + async move { + compaction_loop(tenant_id) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + }, + ); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::GarbageCollector, + Some(tenant_id), + None, + &format!("garbage collector for tenant {tenant_id}"), + false, + async move { + gc_loop(tenant_id) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + }, + ); +} + +/// +/// Compaction task's main loop +/// +async fn compaction_loop(tenant_id: TenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + trace!("waking up"); + + let tenant = tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; + + // Run blocking part of the task + + // Run compaction + let mut sleep_duration = tenant.get_compaction_period(); + if let Err(e) = tenant.compaction_iteration() { + sleep_duration = wait_duration; + error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration); + #[cfg(feature = "testing")] + std::process::abort(); + } + + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received cancellation request during idling"); + break ; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + + trace!("compaction loop stopped."); +} + +/// +/// GC task's main loop +/// +async fn gc_loop(tenant_id: TenantId) { + let wait_duration = Duration::from_secs(2); + info!("starting"); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + trace!("waking up"); + + let tenant = tokio::select! 
{ + _ = task_mgr::shutdown_watcher() => { + info!("received cancellation request"); + return; + }, + tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(tenant) => tenant, + }, + }; + + // Run gc + let gc_period = tenant.get_gc_period(); + let gc_horizon = tenant.get_gc_horizon(); + let mut sleep_duration = gc_period; + if gc_horizon > 0 { + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await + { + sleep_duration = wait_duration; + error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration); + #[cfg(feature = "testing")] + std::process::abort(); + } + } + + // Sleep + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("received cancellation request during idling"); + break; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + trace!("GC loop stopped."); +} + +async fn wait_for_active_tenant( + tenant_id: TenantId, + wait: Duration, +) -> ControlFlow<(), Arc> { + let tenant = loop { + match tenant_mgr::get_tenant(tenant_id, false) { + Ok(tenant) => break tenant, + Err(e) => { + error!("Failed to get a tenant {tenant_id}: {e:#}"); + tokio::time::sleep(wait).await; + } + } + }; + + // if the tenant has a proper status already, no need to wait for anything + if tenant.should_run_tasks() { + ControlFlow::Continue(tenant) + } else { + let mut tenant_state_updates = tenant.subscribe_for_state_updates(); + loop { + match tenant_state_updates.changed().await { + Ok(()) => { + let new_state = *tenant_state_updates.borrow(); + match new_state { + TenantState::Active { + background_jobs_running: true, + } => { + debug!("Tenant state changed to active with background jobs enabled, continuing the task loop"); + return ControlFlow::Continue(tenant); + } + state => { + debug!("Not running the task loop, tenant is not active with background jobs enabled: {state:?}"); + continue; + } + } + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop"); + return ControlFlow::Break(()); + } + } + } + } +} diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs deleted file mode 100644 index 062af9f1ad..0000000000 --- a/pageserver/src/tenant_threads.rs +++ /dev/null @@ -1,70 +0,0 @@ -//! This module contains functions to serve per-tenant background processes, -//! such as checkpointer and GC -use crate::config::PageServerConf; -use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; -use crate::CheckpointConfig; -use anyhow::Result; -use std::time::Duration; -use tracing::*; -use zenith_utils::zid::ZTenantId; - -/// -/// Checkpointer thread's main loop -/// -pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - - std::thread::sleep(conf.checkpoint_period); - trace!("checkpointer thread for tenant {} waking up", tenantid); - - // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE - // bytes of WAL since last checkpoint. 
- let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?; - } - - trace!( - "checkpointer thread stopped for tenant {} state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} - -/// -/// GC thread's main loop -/// -pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - - trace!("gc thread for tenant {} waking up", tenantid); - - // Garbage collect old files that are not needed for PITR anymore - if conf.gc_horizon > 0 { - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.gc_iteration(None, conf.gc_horizon, false).unwrap(); - } - - // TODO Write it in more adequate way using - // condvar.wait_timeout() or something - let mut sleep_time = conf.gc_period.as_secs(); - while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active) - { - sleep_time -= 1; - std::thread::sleep(Duration::from_secs(1)); - } - } - trace!( - "GC thread stopped for tenant {} state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs deleted file mode 100644 index a51f0909ca..0000000000 --- a/pageserver/src/thread_mgr.rs +++ /dev/null @@ -1,284 +0,0 @@ -//! -//! This module provides centralized handling of threads in the Page Server. -//! -//! We provide a few basic facilities: -//! - A global registry of threads that lists what kind of threads they are, and -//! which tenant or timeline they are working on -//! -//! - The ability to request a thread to shut down. -//! -//! -//! # How it works? -//! -//! There is a global hashmap of all the threads (`THREADS`). Whenever a new -//! thread is spawned, a PageServerThread entry is added there, and when a -//! thread dies, it removes itself from the hashmap. If you want to kill a -//! thread, you can scan the hashmap to find it. -//! -//! # Thread shutdown -//! -//! To kill a thread, we rely on co-operation from the victim. Each thread is -//! expected to periodically call the `is_shutdown_requested()` function, and -//! if it returns true, exit gracefully. In addition to that, when waiting for -//! the network or other long-running operation, you can use -//! `shutdown_watcher()` function to get a Future that will become ready if -//! the current thread has been requested to shut down. You can use that with -//! Tokio select!(), but note that it relies on thread-local storage, so it -//! will only work with the "current-thread" Tokio runtime! -//! -//! -//! TODO: This would be a good place to also handle panics in a somewhat sane way. -//! Depending on what thread panics, we might want to kill the whole server, or -//! only a single tenant or timeline. -//! - -use std::cell::RefCell; -use std::collections::HashMap; -use std::panic; -use std::panic::AssertUnwindSafe; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::sync::{Arc, Mutex}; -use std::thread; -use std::thread::JoinHandle; - -use tokio::sync::watch; - -use tracing::{info, warn}; - -use lazy_static::lazy_static; - -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -lazy_static! { - /// Each thread that we track is associated with a "thread ID". It's just - /// an increasing number that we assign, not related to any system thread - /// id. 
- static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1); - - /// Global registry of threads - static ref THREADS: Mutex>> = Mutex::new(HashMap::new()); -} - -// There is a Tokio watch channel for each thread, which can be used to signal the -// thread that it needs to shut down. This thread local variable holds the receiving -// end of the channel. The sender is kept in the global registry, so that anyone -// can send the signal to request thread shutdown. -thread_local!(static SHUTDOWN_RX: RefCell>> = RefCell::new(None)); - -// Each thread holds reference to its own PageServerThread here. -thread_local!(static CURRENT_THREAD: RefCell>> = RefCell::new(None)); - -/// -/// There are many kinds of threads in the system. Some are associated with a particular -/// tenant or timeline, while others are global. -/// -/// Note that we don't try to limit how may threads of a certain kind can be running -/// at the same time. -/// -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ThreadKind { - // libpq listener thread. It just accepts connection and spawns a - // PageRequestHandler thread for each connection. - LibpqEndpointListener, - - // HTTP endpoint listener. - HttpEndpointListener, - - // Thread that handles a single connection. A PageRequestHandler thread - // starts detached from any particular tenant or timeline, but it can be - // associated with one later, after receiving a command from the client. - PageRequestHandler, - - // Thread that connects to a safekeeper to fetch WAL for one timeline. - WalReceiver, - - // Thread that handles checkpointing of all timelines for a tenant. - Checkpointer, - - // Thread that handles GC of a tenant - GarbageCollector, - - // Thread for synchronizing pageserver relish data with the remote storage. - // Shared by all tenants. - StorageSync, -} - -struct PageServerThread { - _thread_id: u64, - - kind: ThreadKind, - - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - - name: String, - - // To request thread shutdown, set the flag, and send a dummy message to the - // channel to notify it. - shutdown_requested: AtomicBool, - shutdown_tx: watch::Sender<()>, - - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. - join_handle: Mutex>>, -} - -/// Launch a new thread -pub fn spawn( - kind: ThreadKind, - tenant_id: Option, - timeline_id: Option, - name: &str, - f: F, -) -> std::io::Result<()> -where - F: FnOnce() -> Result<(), E> + Send + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(()); - let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = PageServerThread { - _thread_id: thread_id, - kind, - tenant_id, - timeline_id, - name: name.to_string(), - - shutdown_requested: AtomicBool::new(false), - shutdown_tx, - - join_handle: Mutex::new(None), - }; - - let thread_rc = Arc::new(thread); - - let mut jh_guard = thread_rc.join_handle.lock().unwrap(); - - THREADS - .lock() - .unwrap() - .insert(thread_id, Arc::clone(&thread_rc)); - - let thread_rc2 = Arc::clone(&thread_rc); - let join_handle = match thread::Builder::new() - .name(name.to_string()) - .spawn(move || thread_wrapper(thread_id, thread_rc2, shutdown_rx, f)) - { - Ok(handle) => handle, - Err(err) => { - // Could not spawn the thread. Remove the entry - THREADS.lock().unwrap().remove(&thread_id); - return Err(err); - } - }; - *jh_guard = Some(join_handle); - drop(jh_guard); - - // The thread is now running. 
Nothing more to do here - Ok(()) -} - -/// This wrapper function runs in a newly-spawned thread. It initializes the -/// thread-local variables and calls the payload function -fn thread_wrapper( - thread_id: u64, - thread: Arc, - shutdown_rx: watch::Receiver<()>, - f: F, -) where - F: FnOnce() -> Result<(), E> + Send + 'static, -{ - SHUTDOWN_RX.with(|rx| { - *rx.borrow_mut() = Some(shutdown_rx); - }); - CURRENT_THREAD.with(|ct| { - *ct.borrow_mut() = Some(thread); - }); - - // We use AssertUnwindSafe here so that the payload function - // doesn't need to be UnwindSafe. We don't do anything after the - // unwinding that would expose us to unwind-unsafe behavior. - let result = panic::catch_unwind(AssertUnwindSafe(f)); - - // Remove our entry from the global hashmap. - THREADS.lock().unwrap().remove(&thread_id); - - // If the thread payload panic'd, exit with the panic. - if let Err(err) = result { - panic::resume_unwind(err); - } -} - -/// Is there a thread running that matches the criteria - -/// Signal and wait for threads to shut down. -/// -/// -/// The arguments are used to select the threads to kill. Any None arguments are -/// ignored. For example, to shut down all WalReceiver threads: -/// -/// shutdown_threads(Some(ThreadKind::WalReceiver), None, None) -/// -/// Or to shut down all threads for given timeline: -/// -/// shutdown_threads(None, Some(timelineid), None) -/// -pub fn shutdown_threads( - kind: Option, - tenant_id: Option, - timeline_id: Option, -) { - let mut victim_threads = Vec::new(); - - let threads = THREADS.lock().unwrap(); - for thread in threads.values() { - if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread.tenant_id == tenant_id) - && (timeline_id.is_none() || thread.timeline_id == timeline_id) - { - thread.shutdown_requested.store(true, Ordering::Relaxed); - // FIXME: handle error? - let _ = thread.shutdown_tx.send(()); - victim_threads.push(Arc::clone(thread)); - } - } - drop(threads); - - for thread in victim_threads { - info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread.join_handle.lock().unwrap().take() { - let _ = join_handle.join(); - } else { - // The thread had not even fully started yet. Or it was shut down - // concurrently and alrady exited - } - } -} - -/// A Future that can be used to check if the current thread has been requested to -/// shut down. -pub async fn shutdown_watcher() { - let _ = SHUTDOWN_RX - .with(|rx| { - rx.borrow() - .as_ref() - .expect("shutdown_requested() called in an unexpected thread") - .clone() - }) - .changed() - .await; -} - -/// Has the current thread been requested to shut down? 
-pub fn is_shutdown_requested() -> bool { - CURRENT_THREAD.with(|ct| { - if let Some(ct) = ct.borrow().as_ref() { - ct.shutdown_requested.load(Ordering::Relaxed) - } else { - if !cfg!(test) { - warn!("is_shutdown_requested() called in an unexpected thread"); - } - false - } - }) -} diff --git a/pageserver/src/trace.rs b/pageserver/src/trace.rs new file mode 100644 index 0000000000..9e466dd9b0 --- /dev/null +++ b/pageserver/src/trace.rs @@ -0,0 +1,36 @@ +use bytes::Bytes; +use std::{ + fs::{create_dir_all, File}, + io::{BufWriter, Write}, + path::PathBuf, +}; + +pub struct Tracer { + writer: BufWriter, +} + +impl Drop for Tracer { + fn drop(&mut self) { + self.flush() + } +} + +impl Tracer { + pub fn new(path: PathBuf) -> Self { + let parent = path.parent().expect("failed to parse parent path"); + create_dir_all(parent).expect("failed to create trace dir"); + + let file = File::create(path).expect("failed to create trace file"); + Tracer { + writer: BufWriter::new(file), + } + } + + pub fn trace(&mut self, msg: &Bytes) { + self.writer.write_all(msg).expect("failed to write trace"); + } + + pub fn flush(&mut self) { + self.writer.flush().expect("failed to flush trace file"); + } +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 73671dcf4e..46e4acd50c 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,45 +10,14 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! -use lazy_static::lazy_static; +use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; +use once_cell::sync::OnceCell; use std::fs::{File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; -use zenith_metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec}; - -use once_cell::sync::OnceCell; - -// Metrics collected on disk IO operations -const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ - 0.000001, // 1 usec - 0.00001, // 10 usec - 0.0001, // 100 usec - 0.001, // 1 msec - 0.01, // 10 msec - 0.1, // 100 msec - 1.0, // 1 sec -]; - -lazy_static! { - static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!( - "pageserver_io_time", - "Time spent in IO operations", - &["operation", "tenant_id", "timeline_id"], - STORAGE_IO_TIME_BUCKETS.into() - ) - .expect("failed to define a metric"); -} -lazy_static! { - static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_io_size", - "Amount of bytes", - &["operation", "tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally @@ -65,6 +34,7 @@ lazy_static! { /// currently open, the 'handle' can still point to the slot where it was last kept. The /// 'tag' field is used to detect whether the handle still is valid or not. /// +#[derive(Debug)] pub struct VirtualFile { /// Lazy handle to the global file descriptor cache. 
The slot that this points to /// might contain our File, or it may be empty, or it may contain a File that @@ -83,12 +53,11 @@ pub struct VirtualFile { pub path: PathBuf, open_options: OpenOptions, - /// For metrics - tenantid: String, - timelineid: String, + tenant_id: String, + timeline_id: String, } -#[derive(PartialEq, Clone, Copy)] +#[derive(Debug, PartialEq, Clone, Copy)] struct SlotHandle { /// Index into OPEN_FILES.slots index: usize, @@ -180,7 +149,7 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenantid/timelineid of evicted file. + // We do not have information about tenant_id/timeline_id of evicted file. // It is possible to store path together with file or use filepath crate, // but as far as close() is not expected to be fast, it is not so critical to gather // precise per-tenant statistic here. @@ -226,19 +195,20 @@ impl VirtualFile { path: &Path, open_options: &OpenOptions, ) -> Result { - let parts = path.to_str().unwrap().split('/').collect::>(); - let tenantid; - let timelineid; + let path_str = path.to_string_lossy(); + let parts = path_str.split('/').collect::>(); + let tenant_id; + let timeline_id; if parts.len() > 5 && parts[parts.len() - 5] == "tenants" { - tenantid = parts[parts.len() - 4].to_string(); - timelineid = parts[parts.len() - 2].to_string(); + tenant_id = parts[parts.len() - 4].to_string(); + timeline_id = parts[parts.len() - 2].to_string(); } else { - tenantid = "*".to_string(); - timelineid = "*".to_string(); + tenant_id = "*".to_string(); + timeline_id = "*".to_string(); } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenantid, &timelineid]) + .with_label_values(&["open", &tenant_id, &timeline_id]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -256,8 +226,8 @@ impl VirtualFile { pos: 0, path: path.to_path_buf(), open_options: reopen_options, - tenantid, - timelineid, + tenant_id, + timeline_id, }; slot_guard.file.replace(file); @@ -297,7 +267,7 @@ impl VirtualFile { // Found a cached file descriptor. slot.recently_used.store(true, Ordering::Relaxed); return Ok(STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(file))); } } @@ -324,7 +294,7 @@ impl VirtualFile { // Open the physical file let file = STORAGE_IO_TIME - .with_label_values(&["open", &self.tenantid, &self.timelineid]) + .with_label_values(&["open", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| self.open_options.open(&self.path))?; // Perform the requested operation on it @@ -334,11 +304,11 @@ impl VirtualFile { // library RwLock doesn't allow downgrading without releasing the lock, // and that doesn't seem worth the trouble. // - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implemenation is fair and + // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. 
let result = STORAGE_IO_TIME - .with_label_values(&[op, &self.tenantid, &self.timelineid]) + .with_label_values(&[op, &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| func(&file)); // Store the File in the slot and update the handle in the VirtualFile @@ -349,6 +319,12 @@ impl VirtualFile { Ok(result) } + + pub fn remove(self) { + let path = self.path.clone(); + drop(self); + std::fs::remove_file(path).expect("failed to remove the virtual file"); + } } impl Drop for VirtualFile { @@ -363,11 +339,11 @@ impl Drop for VirtualFile { if slot_guard.tag == handle.tag { slot.recently_used.store(false, Ordering::Relaxed); // Unlike files evicted by replacement algorithm, here - // we group close time by tenantid/timelineid. + // we group close time by tenant_id/timeline_id. // At allows to compare number/time of "normal" file closes // with file eviction. STORAGE_IO_TIME - .with_label_values(&["close", &self.tenantid, &self.timelineid]) + .with_label_values(&["close", &self.tenant_id, &self.timeline_id]) .observe_closure_duration(|| slot_guard.file.take()); } } @@ -429,7 +405,7 @@ impl FileExt for VirtualFile { let result = self.with_file("read", |file| file.read_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenantid, &self.timelineid]) + .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result @@ -439,7 +415,7 @@ impl FileExt for VirtualFile { let result = self.with_file("write", |file| file.write_at(buf, offset))?; if let Ok(size) = result { STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenantid, &self.timelineid]) + .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) .add(size as i64); } result diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1962c9bbd3..8c81ed824b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1,5 +1,5 @@ //! -//! Parse PostgreSQL WAL records and store them in a zenith Timeline. +//! Parse PostgreSQL WAL records and store them in a neon Timeline. //! //! The pipeline for ingesting WAL looks like this: //! @@ -9,10 +9,10 @@ //! and decodes it to individual WAL records. It feeds the WAL records //! to WalIngest, which parses them and stores them in the Repository. //! -//! The zenith Repository can store page versions in two formats: as +//! The neon Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL -//! records. If a WAL record modifies multple pages, WalIngest +//! records. If a WAL record modifies multiple pages, WalIngest //! will call Repository::put_wal_record or put_page_image functions //! separately for each modified page. //! @@ -21,39 +21,46 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. 
-use postgres_ffi::nonrelfile_utils::clogpage_precedes; -use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; -use std::cmp::min; +use anyhow::Context; +use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; +use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::{page_is_new, page_set_lsn}; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; -use crate::relish::*; -use crate::repository::*; +use crate::pgdatadir_mapping::*; +use crate::tenant::Timeline; use crate::walrecord::*; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; -use postgres_ffi::xlog_utils::*; +use crate::ZERO_PAGE; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; +use postgres_ffi::v14::xlog_utils::*; +use postgres_ffi::v14::CheckPoint; use postgres_ffi::TransactionId; -use postgres_ffi::{pg_constants, CheckPoint}; -use zenith_utils::lsn::Lsn; +use postgres_ffi::BLCKSZ; +use utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); +pub struct WalIngest<'a> { + timeline: &'a Timeline, -pub struct WalIngest { checkpoint: CheckPoint, checkpoint_modified: bool, } -impl WalIngest { - pub fn new(timeline: &dyn Timeline, startpoint: Lsn) -> Result { +impl<'a> WalIngest<'a> { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. - let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); Ok(WalIngest { + timeline, checkpoint, checkpoint_modified: false, }) @@ -62,17 +69,22 @@ impl WalIngest { /// /// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline. /// + /// This function updates `lsn` field of `DatadirModification` /// /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
/// pub fn ingest_record( &mut self, - timeline: &dyn TimelineWriter, recdata: Bytes, lsn: Lsn, + modification: &mut DatadirModification, + decoded: &mut DecodedWALRecord, ) -> Result<()> { - let mut decoded = decode_wal_record(recdata); + modification.lsn = lsn; + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record")?; + let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -86,48 +98,65 @@ impl WalIngest { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, timeline, lsn, &mut decoded)?; + self.ingest_heapam_record(&mut buf, modification, decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID + && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == pg_constants::XLOG_SMGR_CREATE + { + let create = XlSmgrCreate::decode(&mut buf); + self.ingest_xlog_smgr_create(modification, &create)?; + } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(timeline, lsn, &truncate)?; + self.ingest_xlog_smgr_truncate(modification, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { - if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_CREATE - { - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(timeline, lsn, &createdb)?; - } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) - == pg_constants::XLOG_DBASE_DROP - { - let dropdb = XlDropDatabase::decode(&mut buf); + debug!( + "handle RM_DBASE_ID for Postgres version {:?}", + self.timeline.pg_version + ); + if self.timeline.pg_version == 14 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + { + let createdb = XlCreateDatabase::decode(&mut buf); + debug!("XLOG_DBASE_CREATE v14"); - // To drop the database, we need to drop all the relations in it. Like in - // ingest_xlog_dbase_create(), use the previous record's LSN in the list_rels() call - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - - for tablespace_id in dropdb.tablespace_ids { - let rels = timeline.list_rels(tablespace_id, dropdb.db_id, req_lsn)?; - for rel in rels { - timeline.drop_relish(rel, lsn)?; + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v14::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + } + } + } else if self.timeline.pg_version == 15 { + if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG + { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY + { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + let createdb = XlCreateDatabase::decode(&mut buf); + self.ingest_xlog_dbase_create(modification, &createdb)?; + } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == postgres_ffi::v15::bindings::XLOG_DBASE_DROP + { + let dropdb = XlDropDatabase::decode(&mut buf); + for tablespace_id in dropdb.tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + modification.drop_dbdir(tablespace_id, dropdb.db_id)?; } - trace!( - "Drop FileNodeMap {}, {} at lsn {}", - tablespace_id, - dropdb.db_id, - lsn - ); - timeline.drop_relish( - RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: dropdb.db_id, - }, - lsn, - )?; } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -138,19 +167,17 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + self.put_slru_page_image( + modification, + SlruKind::Clog, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_clog_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -158,8 +185,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, - lsn, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -169,8 +195,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, - lsn, + modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -179,23 +204,11 @@ impl WalIngest { "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", decoded.xl_xid, parsed_xact.xid, - lsn + lsn, ); - timeline.drop_relish( - RelishTag::TwoPhase { - xid: parsed_xact.xid, - }, - lsn, - )?; + modification.drop_twophase_file(parsed_xact.xid)?; } else if info == pg_constants::XLOG_XACT_PREPARE { - timeline.put_page_image( - RelishTag::TwoPhase { - xid: decoded.xl_xid, - }, - 0, - lsn, - Bytes::copy_from_slice(&buf[..]), - )?; + modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -204,38 +217,34 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + self.put_slru_page_image( + modification, + SlruKind::MultiXactOffsets, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - }, + self.put_slru_page_image( + modification, + SlruKind::MultiXactMembers, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == 
pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_truncate_record(modification, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(timeline, lsn, &xlrec, &decoded)?; + self.ingest_relmap_page(modification, &xlrec, decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -249,7 +258,7 @@ impl WalIngest { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap(); + let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -270,37 +279,37 @@ impl WalIngest { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { - self.ingest_decoded_block(timeline, lsn, &decoded, blk)?; + self.ingest_decoded_block(modification, lsn, decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode(); + let new_checkpoint_bytes = self.checkpoint.encode()?; - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, new_checkpoint_bytes)?; + modification.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - timeline.advance_last_record_lsn(lsn); + modification.commit()?; Ok(()) } fn ingest_decoded_block( &mut self, - timeline: &dyn TimelineWriter, + modification: &mut DatadirModification, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, ) -> Result<()> { - let tag = RelishTag::Relation(RelTag { + let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum as u8, - }); + }; // // Instead of storing full-page-image WAL record, @@ -314,12 +323,12 @@ impl WalIngest { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? 
{ // Extract page image from FPI record let img_len = blk.bimg_len as usize; let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize); + let mut image = BytesMut::with_capacity(BLCKSZ as usize); image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); if blk.hole_length != 0 { @@ -327,16 +336,22 @@ impl WalIngest { image.resize(image.len() + blk.hole_length as usize, 0u8); image.unsplit(tail); } - image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); - image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); - assert_eq!(image.len(), pg_constants::BLCKSZ as usize); - timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?; + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; } else { - let rec = ZenithWalRecord::Postgres { + let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; + self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; } Ok(()) } @@ -344,8 +359,7 @@ impl WalIngest { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -409,54 +423,76 @@ impl WalIngest { // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_relish = RelishTag::Relation(RelTag { - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, - }); + }; - let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. - timeline.put_wal_record( - lsn, - vm_relish, - new_vm_blk.unwrap(), - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, - }, - )?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. - if let Some(new_vm_blk) = new_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, - new_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, - }, - )?; + let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. 
So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = self.get_relsize(vm_rel, modification.lsn)?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; } - if let Some(old_vm_blk) = old_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, - old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, )?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. + if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } } } } @@ -467,8 +503,7 @@ impl WalIngest { /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. fn ingest_xlog_dbase_create( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -481,76 +516,79 @@ impl WalIngest { // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for // the last valid LSN to advance up to it. So we use the previous record's LSN in the // get calls instead. 
- let req_lsn = min(timeline.get_last_record_lsn(), lsn); + let req_lsn = modification.tline.get_last_record_lsn(); - let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?; + let rels = modification + .tline + .list_rels(src_tablespace_id, src_db_id, req_lsn)?; - trace!("ingest_xlog_dbase_create: {} rels", rels.len()); + debug!("ingest_xlog_dbase_create: {} rels", rels.len()); + + // Copy relfilemap + let filemap = modification + .tline + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; let mut num_blocks_copied = 0; - for rel in rels { - if let RelishTag::Relation(src_rel) = rel { - assert_eq!(src_rel.spcnode, src_tablespace_id); - assert_eq!(src_rel.dbnode, src_db_id); + for src_rel in rels { + assert_eq!(src_rel.spcnode, src_tablespace_id); + assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = timeline.get_relish_size(rel, req_lsn)?.unwrap_or(0); - let dst_rel = RelTag { - spcnode: tablespace_id, - dbnode: db_id, - relnode: src_rel.relnode, - forknum: src_rel.forknum, - }; + let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let dst_rel = RelTag { + spcnode: tablespace_id, + dbnode: db_id, + relnode: src_rel.relnode, + forknum: src_rel.forknum, + }; - // Copy content - for blknum in 0..nblocks { - let content = timeline.get_page_at_lsn(rel, blknum, req_lsn)?; + modification.put_rel_creation(dst_rel, nblocks)?; - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Copy content + debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); + for blknum in 0..nblocks { + debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?; - num_blocks_copied += 1; - } - - if nblocks == 0 { - // make sure we have some trace of the relation, even if it's empty - timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?; - } - - num_rels_copied += 1; + let content = modification + .tline + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?; + modification.put_rel_page_image(dst_rel, blknum, content)?; + num_blocks_copied += 1; } + + num_rels_copied += 1; } - // Copy relfilemap - // TODO This implementation is very inefficient - - // it scans all non-rels only to find FileNodeMaps - for tag in timeline.list_nonrels(req_lsn)? { - if let RelishTag::FileNodeMap { spcnode, dbnode } = tag { - if spcnode == src_tablespace_id && dbnode == src_db_id { - let img = timeline.get_page_at_lsn(tag, 0, req_lsn)?; - let new_tag = RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: db_id, - }; - timeline.put_page_image(new_tag, 0, lsn, img)?; - break; - } - } - } info!( - "Created database {}/{}, copied {} blocks in {} rels at {}", - tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn + "Created database {}/{}, copied {} blocks in {} rels", + tablespace_id, db_id, num_blocks_copied, num_rels_copied ); Ok(()) } + fn ingest_xlog_smgr_create( + &mut self, + modification: &mut DatadirModification, + rec: &XlSmgrCreate, + ) -> Result<()> { + let rel = RelTag { + spcnode: rec.rnode.spcnode, + dbnode: rec.rnode.dbnode, + relnode: rec.rnode.relnode, + forknum: rec.forknum, + }; + self.put_rel_creation(modification, rel)?; + Ok(()) + } + /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. 
fn ingest_xlog_smgr_truncate( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -562,16 +600,16 @@ impl WalIngest { spcnode, dbnode, relnode, - forknum: pg_constants::MAIN_FORKNUM, + forknum: MAIN_FORKNUM, }; - timeline.put_truncation(RelishTag::Relation(rel), lsn, rec.blkno)?; + self.put_rel_truncation(modification, rel, rec.blkno)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { let rel = RelTag { spcnode, dbnode, relnode, - forknum: pg_constants::FSM_FORKNUM, + forknum: FSM_FORKNUM, }; // FIXME: 'blkno' stored in the WAL record is the new size of the @@ -587,14 +625,14 @@ impl WalIngest { info!("Partial truncation of FSM is not supported"); } let num_fsm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_fsm_blocks)?; + self.put_rel_truncation(modification, rel, num_fsm_blocks)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { let rel = RelTag { spcnode, dbnode, relnode, - forknum: pg_constants::VISIBILITYMAP_FORKNUM, + forknum: VISIBILITYMAP_FORKNUM, }; // FIXME: Like with the FSM above, the logic to truncate the VM @@ -606,7 +644,7 @@ impl WalIngest { info!("Partial truncation of VM is not supported"); } let num_vm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_vm_blocks)?; + self.put_rel_truncation(modification, rel, num_vm_blocks)?; } Ok(()) } @@ -615,8 +653,7 @@ impl WalIngest { /// fn ingest_xact_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -632,17 +669,17 @@ impl WalIngest { // This subxact goes to different page. Write the record // for all the XIDs on the previous page, and continue // accumulating XIDs on this new page. - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + modification.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + NeonWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; page_xids = Vec::new(); @@ -652,29 +689,32 @@ impl WalIngest { rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; page_xids.push(*subxact); } - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + modification.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { - ZenithWalRecord::ClogSetCommitted { xids: page_xids } + NeonWalRecord::ClogSetCommitted { + xids: page_xids, + timestamp: parsed.xact_time, + } } else { - ZenithWalRecord::ClogSetAborted { xids: page_xids } + NeonWalRecord::ClogSetAborted { xids: page_xids } }, )?; for xnode in &parsed.xnodes { - for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, dbnode: xnode.dbnode, relnode: xnode.relnode, }; - timeline.drop_relish(RelishTag::Relation(rel), lsn)?; + let last_lsn = self.timeline.get_last_record_lsn(); + if modification.tline.get_rel_exists(rel, last_lsn, true)? 
{ + self.put_rel_drop(modification, rel)?; + } } } Ok(()) @@ -682,13 +722,12 @@ impl WalIngest { fn ingest_clog_truncate_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlClogTruncate, ) -> Result<()> { info!( - "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {} lsn {}", - xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db, lsn + "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", + xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); // Here we treat oldestXid and oldestXidDB @@ -719,23 +758,20 @@ impl WalIngest { } // Iterate via SLRU CLOG segments and drop segments that we're ready to truncate - // TODO This implementation is very inefficient - - // it scans all non-rels only to find Clog // // We cannot pass 'lsn' to the Timeline.list_nonrels(), or it // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - for obj in timeline.list_nonrels(req_lsn)? { - if let RelishTag::Slru { slru, segno } = obj { - if slru == SlruKind::Clog { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { - timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?; - trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn); - } - } + let req_lsn = modification.tline.get_last_record_lsn(); + for segno in modification + .tline + .list_slru_segments(SlruKind::Clog, req_lsn)? + { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + modification.drop_slru_segment(SlruKind::Clog, segno)?; + trace!("Drop CLOG segment {:>04X}", segno); } } @@ -744,8 +780,7 @@ impl WalIngest { fn ingest_multixact_create_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -753,14 +788,11 @@ impl WalIngest { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + modification.put_slru_wal_record( + SlruKind::MultiXactOffsets, + segno, rpageno, - ZenithWalRecord::MultixactOffsetCreate { + NeonWalRecord::MultixactOffsetCreate { mid: xlrec.mid, moff: xlrec.moff, }, @@ -790,14 +822,11 @@ impl WalIngest { } let n_this_page = this_page_members.len(); - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, - }, + modification.put_slru_wal_record( + SlruKind::MultiXactMembers, + pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, - ZenithWalRecord::MultixactMembersCreate { + NeonWalRecord::MultixactMembersCreate { moff: offset, members: this_page_members, }, @@ -830,8 +859,7 @@ impl WalIngest { fn ingest_multixact_truncate_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -847,13 +875,7 @@ impl WalIngest { // Delete all the segments except the last one. The last segment can still // contain, possibly partially, valid data. 
while segment != endsegment { - timeline.drop_relish( - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: segment as u32, - }, - lsn, - )?; + modification.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?; /* move to next segment, handling wraparound correctly */ if segment == maxsegment { @@ -871,22 +893,516 @@ impl WalIngest { fn ingest_relmap_page( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + modification: &mut DatadirModification, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, ) -> Result<()> { - let tag = RelishTag::FileNodeMap { - spcnode: xlrec.tsid, - dbnode: xlrec.dbid, - }; - let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // skip xl_relmap_update buf.advance(12); - timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buf[..]))?; + modification.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?; + + Ok(()) + } + + fn put_rel_creation( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + ) -> Result<()> { + modification.put_rel_creation(rel, 0)?; + Ok(()) + } + + fn put_rel_page_image( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum)?; + modification.put_rel_page_image(rel, blknum, img)?; + Ok(()) + } + + fn put_rel_wal_record( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + rec: NeonWalRecord, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum)?; + modification.put_rel_wal_record(rel, blknum, rec)?; + Ok(()) + } + + fn put_rel_truncation( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + nblocks: BlockNumber, + ) -> Result<()> { + modification.put_rel_truncation(rel, nblocks)?; + Ok(()) + } + + fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> { + modification.put_rel_drop(rel)?; + Ok(()) + } + + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + 0 + } else { + self.timeline.get_rel_size(rel, lsn, true)? + }; + Ok(nblocks) + } + + fn handle_rel_extend( + &mut self, + modification: &mut DatadirModification, + rel: RelTag, + blknum: BlockNumber, + ) -> Result<()> { + let new_nblocks = blknum + 1; + // Check if the relation exists. We implicitly create relations on first + // record. + // TODO: would be nice if to be more explicit about it + let last_lsn = modification.lsn; + let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? { + // create it with 0 size initially, the logic below will extend it + modification.put_rel_creation(rel, 0)?; + 0 + } else { + self.timeline.get_rel_size(rel, last_lsn, true)? 
+ }; + + if new_nblocks > old_nblocks { + //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); + modification.put_rel_extend(rel, new_nblocks)?; + + // fill the gap with zeros + for gap_blknum in old_nblocks..blknum { + modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + } + } + Ok(()) + } + + fn put_slru_page_image( + &mut self, + modification: &mut DatadirModification, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum)?; + modification.put_slru_page_image(kind, segno, blknum, img)?; + Ok(()) + } + + fn handle_slru_extend( + &mut self, + modification: &mut DatadirModification, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) -> Result<()> { + // we don't use a cache for this like we do for relations. SLRUS are explcitly + // extended with ZEROPAGE records, not with commit records, so it happens + // a lot less frequently. + + let new_nblocks = blknum + 1; + // Check if the relation exists. We implicitly create relations on first + // record. + // TODO: would be nice if to be more explicit about it + let last_lsn = self.timeline.get_last_record_lsn(); + let old_nblocks = if !self + .timeline + .get_slru_segment_exists(kind, segno, last_lsn)? + { + // create it with 0 size initially, the logic below will extend it + modification.put_slru_segment_creation(kind, segno, 0)?; + 0 + } else { + self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + }; + + if new_nblocks > old_nblocks { + trace!( + "extending SLRU {:?} seg {} from {} to {} blocks", + kind, + segno, + old_nblocks, + new_nblocks + ); + modification.put_slru_extend(kind, segno, new_nblocks)?; + + // fill the gap with zeros + for gap_blknum in old_nblocks..blknum { + modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + } + } + Ok(()) + } +} + +#[allow(clippy::bool_assert_comparison)] +#[cfg(test)] +mod tests { + use super::*; + use crate::pgdatadir_mapping::create_test_timeline; + use crate::tenant::harness::*; + use crate::tenant::Timeline; + use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; + use postgres_ffi::RELSEG_SIZE; + + use crate::DEFAULT_PG_VERSION; + + /// Arbitrary relation tag, for testing. 
+ const TESTREL_A: RelTag = RelTag { + spcnode: 0, + dbnode: 111, + relnode: 1000, + forknum: 0, + }; + + fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) { + // TODO + } + + static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + + fn init_walingest_test(tline: &Timeline) -> Result { + let mut m = tline.begin_modification(Lsn(0x10)); + m.put_checkpoint(ZERO_CHECKPOINT.clone())?; + m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file + m.commit()?; + let walingest = WalIngest::new(tline, Lsn(0x10))?; + + Ok(walingest) + } + + #[test] + fn test_relsize() -> Result<()> { + let tenant = TenantHarness::create("test_relsize")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; + let mut walingest = init_walingest_test(&*tline)?; + + let mut m = tline.begin_modification(Lsn(0x20)); + walingest.put_rel_creation(&mut m, TESTREL_A)?; + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x30)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x40)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + m.commit()?; + let mut m = tline.begin_modification(Lsn(0x50)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + m.commit()?; + + assert_current_logical_size(&*tline, Lsn(0x50)); + + // The relation was created at LSN 2, not visible at LSN 1 yet. + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + + // Check page contents at each LSN + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + TEST_IMG("foo blk 0 at 2") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + TEST_IMG("foo blk 0 at 3") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + TEST_IMG("foo blk 1 at 4") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + TEST_IMG("foo blk 1 at 4") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate last block + let mut m = tline.begin_modification(Lsn(0x60)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; + m.commit()?; + assert_current_logical_size(&*tline, Lsn(0x60)); + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + TEST_IMG("foo blk 1 at 4") + ); + + // should still see the truncated block with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate 
to zero length + let mut m = tline.begin_modification(Lsn(0x68)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + + // Extend from 0 to 2 blocks, leaving a gap + let mut m = tline.begin_modification(Lsn(0x70)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + ZERO_PAGE + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + TEST_IMG("foo blk 1") + ); + + // Extend a lot more, leaving a big gap that spans across segments + let mut m = tline.begin_modification(Lsn(0x80)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + for blk in 2..1500 { + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + ZERO_PAGE + ); + } + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + TEST_IMG("foo blk 1500") + ); + + Ok(()) + } + + // Test what happens if we dropped a relation + // and then created it again within the same layer. + #[test] + fn test_drop_extend() -> Result<()> { + let tenant = TenantHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; + let mut walingest = init_walingest_test(&*tline)?; + + let mut m = tline.begin_modification(Lsn(0x20)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + m.commit()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + + // Drop rel + let mut m = tline.begin_modification(Lsn(0x30)); + walingest.put_rel_drop(&mut m, TESTREL_A)?; + m.commit()?; + + // Check that rel is not visible anymore + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + + // FIXME: should fail + //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); + + // Re-create it + let mut m = tline.begin_modification(Lsn(0x40)); + walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + m.commit()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + + Ok(()) + } + + // Test what happens if we truncated a relation + // so that one of its segments was dropped + // and then extended it again within the same layer. + #[test] + fn test_truncate_extend() -> Result<()> { + let tenant = TenantHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; + let mut walingest = init_walingest_test(&*tline)?; + + // Create a 20 MB relation (the size is arbitrary) + let relsize = 20 * 1024 * 1024 / 8192; + let mut m = tline.begin_modification(Lsn(0x20)); + for blkno in 0..relsize { + let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); + walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + } + m.commit()?; + + // The relation was created at LSN 20, not visible at LSN 1 yet. 
+ assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); + assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + TEST_IMG(&data) + ); + } + + // Truncate relation so that second segment was dropped + // - only leave one page + let mut m = tline.begin_modification(Lsn(0x60)); + walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?; + m.commit()?; + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + + for blkno in 0..1 { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + TEST_IMG(&data) + ); + } + + // should still see all blocks with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + TEST_IMG(&data) + ); + } + + // Extend relation again. + // Add enough blocks to create second segment + let lsn = Lsn(0x80); + let mut m = tline.begin_modification(lsn); + for blkno in 0..relsize { + let data = format!("foo blk {} at {}", blkno, lsn); + walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + } + m.commit()?; + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x80); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + TEST_IMG(&data) + ); + } + + Ok(()) + } + + /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's + /// split into multiple 1 GB segments in Postgres. 
+ #[test] + fn test_large_rel() -> Result<()> { + let tenant = TenantHarness::create("test_large_rel")?.load(); + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; + let mut walingest = init_walingest_test(&*tline)?; + + let mut lsn = 0x10; + for blknum in 0..RELSEG_SIZE + 1 { + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + m.commit()?; + } + + assert_current_logical_size(&*tline, Lsn(lsn)); + + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE + 1 + ); + + // Truncate one block + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; + m.commit()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); + assert_current_logical_size(&*tline, Lsn(lsn)); + + // Truncate another block + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; + m.commit()?; + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + RELSEG_SIZE - 1 + ); + assert_current_logical_size(&*tline, Lsn(lsn)); + + // Truncate to 1500, and then truncate all the way down to 0, one block at a time + // This tests the behavior at segment boundaries + let mut size: i32 = 3000; + while size >= 0 { + lsn += 0x10; + let mut m = tline.begin_modification(Lsn(lsn)); + walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; + m.commit()?; + assert_eq!( + tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + size as BlockNumber + ); + + size -= 1; + } + assert_current_logical_size(&*tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 6fff1d062d..1fad91c836 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -1,383 +1,173 @@ +//! WAL receiver manages an open connection to safekeeper, to get the WAL it streams into. +//! To do so, a current implementation needs to do the following: //! -//! WAL receiver connects to the WAL safekeeper service, streams WAL, -//! decodes records and saves them in the repository for the correct -//! timeline. +//! * acknowledge the timelines that it needs to stream WAL into. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! -//! We keep one WAL receiver active per timeline. +//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. +//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. +//! +//! Only one active WAL streaming connection is allowed at a time. +//! The connection is supposed to be updated periodically, based on safekeeper timeline data. +//! +//! * handle the actual connection and WAL streaming +//! +//! Handling happens dynamically, by portions of WAL being processed and registered in the server. +//! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. +//! +//! 
The current module contains high-level primitives used in the submodules; general synchronization, timeline acknowledgement and shutdown logic. + +mod connection_manager; +mod walreceiver_connection; use crate::config::PageServerConf; -use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; -use crate::walingest::WalIngest; -use anyhow::{bail, Context, Error, Result}; -use bytes::BytesMut; -use fail::fail_point; -use lazy_static::lazy_static; -use postgres_ffi::waldecoder::*; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use std::cell::Cell; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::Mutex; -use std::thread_local; -use std::time::SystemTime; -use tokio::pin; -use tokio_postgres::replication::ReplicationStream; -use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; -use tokio_stream::StreamExt; +use crate::task_mgr::WALRECEIVER_RUNTIME; + +use anyhow::{ensure, Context}; +use etcd_broker::Client; +use itertools::Itertools; +use once_cell::sync::OnceCell; +use std::future::Future; +use tokio::sync::watch; use tracing::*; -use zenith_utils::lsn::Lsn; -use zenith_utils::pq_proto::ZenithFeedback; -use zenith_utils::zid::ZTenantId; -use zenith_utils::zid::ZTimelineId; +use url::Url; -// -// We keep one WAL Receiver active per timeline. -// -struct WalReceiverEntry { - wal_producer_connstr: String, -} +pub use connection_manager::spawn_connection_manager_task; -lazy_static! { - static ref WAL_RECEIVERS: Mutex> = - Mutex::new(HashMap::new()); -} +static ETCD_CLIENT: OnceCell = OnceCell::new(); -thread_local! { - // Boolean that is true only for WAL receiver threads - // - // This is used in `wait_lsn` to guard against usage that might lead to a deadlock. - pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); -} - -fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.remove(&(tenantid, timelineid)); -} - -// Launch a new WAL receiver, or tell one that's running about change in connection string -pub fn launch_wal_receiver( - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - wal_producer_connstr: &str, -) -> Result<()> { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - - match receivers.get_mut(&(tenantid, timelineid)) { - Some(receiver) => { - info!("wal receiver already running, updating connection string"); - receiver.wal_producer_connstr = wal_producer_connstr.into(); - } - None => { - thread_mgr::spawn( - ThreadKind::WalReceiver, - Some(tenantid), - Some(timelineid), - "WAL receiver thread", - move || { - IS_WAL_RECEIVER.with(|c| c.set(true)); - thread_main(conf, tenantid, timelineid) - }, - )?; - - let receiver = WalReceiverEntry { - wal_producer_connstr: wal_producer_connstr.into(), - }; - receivers.insert((tenantid, timelineid), receiver); - - // Update tenant state and start tenant threads, if they are not running yet. - tenant_mgr::activate_tenant(conf, tenantid)?; - } - }; - Ok(()) -} - -// Look up current WAL producer connection string in the hash table -fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> String { - let receivers = WAL_RECEIVERS.lock().unwrap(); - - receivers - .get(&(tenantid, timelineid)) - .unwrap() - .wal_producer_connstr - .clone() -} - -// -// This is the entry point for the WAL receiver thread. 
-// -fn thread_main( - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, -) -> Result<()> { - let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered(); - info!("WAL receiver thread started"); - - // Look up the current WAL producer address - let wal_producer_connstr = get_wal_producer_connstr(tenantid, timelineid); - - // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, - // and start streaming WAL from it. - let res = walreceiver_main(conf, tenantid, timelineid, &wal_producer_connstr); - - // TODO cleanup info messages - if let Err(e) = res { - info!("WAL streaming connection failed ({})", e); - } else { - info!( - "walreceiver disconnected tenant {}, timelineid {}", - tenantid, timelineid - ); - } - - // Drop it from list of active WAL_RECEIVERS - // so that next callmemaybe request launched a new thread - drop_wal_receiver(tenantid, timelineid); - Ok(()) -} - -fn walreceiver_main( - _conf: &PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - wal_producer_connstr: &str, -) -> Result<(), Error> { - // Connect to the database in replication mode. - info!("connecting to {:?}", wal_producer_connstr); - let connect_cfg = format!( - "{} application_name=pageserver replication=true", - wal_producer_connstr +/// +/// Initialize the etcd client. This must be called once at page server startup. +/// +pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let etcd_endpoints = conf.broker_endpoints.clone(); + ensure!( + !etcd_endpoints.is_empty(), + "Cannot start wal receiver: etcd endpoints are empty" ); - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; + let etcd_client = Client::connect(etcd_endpoints.clone(), None) + .await + .context("Failed to connect to etcd")?; - let (mut replication_client, connection) = - runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?; - // This is from tokio-postgres docs, but it is a bit weird in our case because we extensively use block_on - runtime.spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - info!("connected!"); - - // Immediately increment the gauge, then create a job to decrement it on thread exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); + // FIXME: Should we still allow the pageserver to start, if etcd + // doesn't work? It could still serve GetPage requests, with the + // data it has locally and from what it can download from remote + // storage + if ETCD_CLIENT.set(etcd_client).is_err() { + panic!("etcd already initialized"); } - let identify = runtime.block_on(identify_system(&mut replication_client))?; - info!("{:?}", identify); - let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); - let mut caught_up = false; - - let timeline = - tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| { - format!( - "Can not start the walrecever for a remote tenant {}, timeline {}", - tenantid, timelineid, - ) - })?; - - // - // Start streaming the WAL, from where we left off previously. - // - // If we had previously received WAL up to some point in the middle of a WAL record, we - // better start from the end of last full WAL record, not in the middle of one. 
- let mut last_rec_lsn = timeline.get_last_record_lsn(); - let mut startpoint = last_rec_lsn; - - if startpoint == Lsn(0) { - bail!("No previous WAL position"); - } - - // There might be some padding after the last full record, skip it. - startpoint += startpoint.calc_padding(8u32); - info!( - "last_record_lsn {} starting replication from {}, server is at {}...", - last_rec_lsn, startpoint, end_of_wal + "Initialized etcd client with endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") ); - - let query = format!("START_REPLICATION PHYSICAL {}", startpoint); - - let copy_stream = runtime.block_on(replication_client.copy_both_simple(&query))?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); - - let mut waldecoder = WalStreamDecoder::new(startpoint); - - let mut walingest = WalIngest::new(&*timeline, startpoint)?; - - while let Some(replication_message) = runtime.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - // check for shutdown first - biased; - _ = shutdown_watcher => { - info!("walreceiver interrupted"); - None - } - replication_message = physical_stream.next() => replication_message, - } - }) { - let replication_message = replication_message?; - let status_update = match replication_message { - ReplicationMessage::XLogData(xlog_data) => { - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; - - trace!("received XLogData between {} and {}", startlsn, endlsn); - - waldecoder.feed_bytes(data); - - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let _enter = info_span!("processing record", lsn = %lsn).entered(); - - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hittind a deadlock. - assert!(lsn.is_aligned()); - - let writer = timeline.writer(); - walingest.ingest_record(writer.as_ref(), recdata, lsn)?; - - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = lsn; - } - - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {}", endlsn); - caught_up = true; - } - - Some(endlsn) - } - - ReplicationMessage::PrimaryKeepAlive(keepalive) => { - let wal_end = keepalive.wal_end(); - let timestamp = keepalive.timestamp(); - let reply_requested = keepalive.reply() != 0; - - trace!( - "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})", - wal_end, - timestamp, - reply_requested, - ); - - if reply_requested { - Some(last_rec_lsn) - } else { - None - } - } - - _ => None, - }; - - if let Some(last_lsn) = status_update { - let timeline_synced_disk_consistent_lsn = - tenant_mgr::get_repository_for_tenant(tenantid)? - .get_timeline_state(timelineid) - .and_then(|state| state.remote_disk_consistent_lsn()) - .unwrap_or(Lsn(0)); - - // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let write_lsn = u64::from(last_lsn); - // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); - // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash - // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. 
- let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn); - let ts = SystemTime::now(); - - // Send zenith feedback message. - // Regular standby_status_update fields are put into this message. - let zenith_status_update = ZenithFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, - }; - - debug!("zenith_status_update {:?}", zenith_status_update); - - let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; - runtime.block_on( - physical_stream - .as_mut() - .zenith_status_update(data.len() as u64, &data), - )?; - } - } - Ok(()) } -/// Data returned from the postgres `IDENTIFY_SYSTEM` command /// -/// See the [postgres docs] for more details. +/// Get a handle to the etcd client /// -/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html +pub fn get_etcd_client() -> &'static etcd_broker::Client { + ETCD_CLIENT.get().expect("etcd client not initialized") +} + +pub fn is_etcd_client_initialized() -> bool { + ETCD_CLIENT.get().is_some() +} + +/// A handle of an asynchronous task. +/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] +/// and a cancellation channel that it can listen to for earlier interrupts. +/// +/// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the never one on submission. +/// That may lead to certain events not being observed by the listener. #[derive(Debug)] -// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as -// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 -#[allow(dead_code)] -pub struct IdentifySystem { - systemid: u64, - timeline: u32, - xlogpos: PgLsn, - dbname: Option, +pub struct TaskHandle { + join_handle: Option>>, + events_receiver: watch::Receiver>, + cancellation: watch::Sender<()>, } -/// There was a problem parsing the response to -/// a postgres IDENTIFY_SYSTEM command. -#[derive(Debug, thiserror::Error)] -#[error("IDENTIFY_SYSTEM parse error")] -pub struct IdentifyError; +pub enum TaskEvent { + Update(TaskStateUpdate), + End(anyhow::Result<()>), +} -/// Run the postgres `IDENTIFY_SYSTEM` command -pub async fn identify_system(client: &mut Client) -> Result { - let query_str = "IDENTIFY_SYSTEM"; - let response = client.simple_query(query_str).await?; +#[derive(Debug, Clone)] +pub enum TaskStateUpdate { + Init, + Started, + Progress(E), +} - // get(N) from row, then parse it as some destination type. - fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result +impl TaskHandle { + /// Initializes the task, starting it immediately after the creation. 
+    pub fn spawn<Fut>(
+        task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
+            + Send
+            + 'static,
+    ) -> Self
     where
-        T: FromStr,
+        Fut: Future<Output = anyhow::Result<()>> + Send,
+        E: Send + Sync + 'static,
     {
-        let val = row.get(idx).ok_or(IdentifyError)?;
-        val.parse::<T>().or(Err(IdentifyError))
+        let (cancellation, cancellation_receiver) = watch::channel(());
+        let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
+
+        let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
+            events_sender.send(TaskStateUpdate::Started).ok();
+            task(events_sender, cancellation_receiver).await
+        });
+
+        TaskHandle {
+            join_handle: Some(join_handle),
+            events_receiver,
+            cancellation,
+        }
     }

-    // extract the row contents into an IdentifySystem struct.
-    // written as a closure so I can use ? for Option here.
-    if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
-        Ok(IdentifySystem {
-            systemid: get_parse(first_row, 0)?,
-            timeline: get_parse(first_row, 1)?,
-            xlogpos: get_parse(first_row, 2)?,
-            dbname: get_parse(first_row, 3).ok(),
-        })
-    } else {
-        Err(IdentifyError.into())
+    async fn next_task_event(&mut self) -> TaskEvent<E> {
+        match self.events_receiver.changed().await {
+            Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
+            Err(_task_channel_part_dropped) => {
+                TaskEvent::End(match self.join_handle.take() {
+                    Some(jh) => {
+                        if !jh.is_finished() {
+                            warn!("sender is dropped while join handle is still alive");
+                        }
+
+                        jh.await
+                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
+                            .and_then(|x| x)
+                    }
+                    None => {
+                        // Another option is to keep an enum of either the join handle or the result, and give away a reference to it
+                        Err(anyhow::anyhow!("Task was joined more than once"))
+                    }
+                })
+            }
+        }
+    }
+
+    /// Aborts the current task, waiting for it to finish.
+    pub async fn shutdown(self) {
+        if let Some(jh) = self.join_handle {
+            self.cancellation.send(()).ok();
+            match jh.await {
+                Ok(Ok(())) => debug!("Shutdown success"),
+                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
+                Err(join_error) => {
+                    if join_error.is_cancelled() {
+                        error!("Shutdown task was cancelled");
+                    } else {
+                        error!("Shutdown task join error: {join_error}")
+                    }
+                }
+            }
+        }
     }
 }
diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs
new file mode 100644
index 0000000000..d527e521e0
--- /dev/null
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -0,0 +1,1495 @@
+//! WAL receiver logic that ensures the pageserver gets connected to the safekeeper
+//! that has the latest WAL to stream, and that this connection does not go stale.
+//!
+//! To achieve that, an etcd broker is used: safekeepers publish their timelines' state to it,
+//! and the manager subscribes to those changes, accumulating them to pick the safekeeper with the biggest Lsn to connect to.
+//! The current connection state is tracked too, to make sure it does not go stale.
+//!
+//! After every connection event or fetched etcd update, the state is updated accordingly and rechecked for a new connection candidate,
+//! then a [re]connection happens, if necessary.
+//! Only the WAL streaming task is expected to finish; the other loops (etcd, connection management) never exit unless explicitly cancelled via the dedicated channel.
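To make the selection rule described in the module doc concrete, here is a minimal, self-contained sketch of the "pick the safekeeper with the biggest commit_lsn" idea. It is an illustration only, not the patch's code: the types are simplified stand-ins for `NodeId`/`SkTimelineInfo`/`RetryInfo`, and the real logic lives in `next_connection_candidate` and `select_connection_candidate` further down in this file, which also weigh connection staleness and WAL lag thresholds.

use std::collections::HashMap;

// Simplified stand-ins for the patch's NodeId / SkTimelineInfo / RetryInfo types.
type NodeId = u64;
type Lsn = u64;

struct CandidateInfo {
    commit_lsn: Option<Lsn>, // advertised by the safekeeper via the broker
    in_retry_cooldown: bool, // true if a recent connection attempt to this node failed
}

/// Pick the safekeeper with the biggest advertised commit_lsn, ignoring the node
/// we are already connected to and nodes that are still cooling down after a failure.
fn pick_candidate(
    candidates: &HashMap<NodeId, CandidateInfo>,
    connected: Option<NodeId>,
) -> Option<NodeId> {
    candidates
        .iter()
        .filter(|(id, _)| Some(**id) != connected)
        .filter(|(_, info)| !info.in_retry_cooldown)
        .filter_map(|(id, info)| info.commit_lsn.map(|lsn| (*id, lsn)))
        .max_by_key(|(_, lsn)| *lsn)
        .map(|(id, _)| id)
}

fn main() {
    let mut candidates = HashMap::new();
    candidates.insert(1, CandidateInfo { commit_lsn: Some(100), in_retry_cooldown: false });
    candidates.insert(2, CandidateInfo { commit_lsn: Some(200), in_retry_cooldown: false });
    candidates.insert(3, CandidateInfo { commit_lsn: Some(300), in_retry_cooldown: true });
    // Node 3 has the most WAL but is cooling down, so node 2 wins over the currently connected node 1.
    assert_eq!(pick_candidate(&candidates, Some(1)), Some(2));
}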
+ +use std::{ + collections::{hash_map, HashMap}, + num::NonZeroU64, + ops::ControlFlow, + sync::Arc, + time::Duration, +}; + +use crate::task_mgr::TaskKind; +use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::tenant::Timeline; +use crate::{task_mgr, walreceiver::TaskStateUpdate}; +use anyhow::Context; +use chrono::{NaiveDateTime, Utc}; +use etcd_broker::{ + subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, + BrokerUpdate, Client, +}; +use pageserver_api::models::TimelineState; +use tokio::{select, sync::watch}; +use tracing::*; + +use crate::{ + exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, +}; +use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, +}; + +use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; + +/// Spawns the loop to take care of the timeline's WAL streaming connection. +pub fn spawn_connection_manager_task( + broker_loop_prefix: String, + timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, +) { + let mut etcd_client = get_etcd_client().clone(); + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverManager, + Some(tenant_id), + Some(timeline_id), + &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + false, + async move { + info!("WAL receiver broker started, connecting to etcd"); + let mut walreceiver_state = WalreceiverState::new( + timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ); + loop { + select! { + _ = task_mgr::shutdown_watcher() => { + info!("WAL receiver shutdown requested, shutting down"); + walreceiver_state.shutdown().await; + return Ok(()); + }, + loop_step_result = connection_manager_loop_step( + &broker_loop_prefix, + &mut etcd_client, + &mut walreceiver_state, + ) => match loop_step_result { + ControlFlow::Continue(()) => continue, + ControlFlow::Break(()) => { + info!("Connection manager loop ended, shutting down"); + walreceiver_state.shutdown().await; + return Ok(()); + } + }, + } + } + } + .instrument( + info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id), + ), + ); +} + +/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. +/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. +/// If etcd subscription is cancelled, exits. +async fn connection_manager_loop_step( + broker_prefix: &str, + etcd_client: &mut Client, + walreceiver_state: &mut WalreceiverState, +) -> ControlFlow<(), ()> { + let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); + + match wait_for_active_timeline(&mut timeline_state_updates).await { + ControlFlow::Continue(()) => {} + ControlFlow::Break(()) => { + info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + } + + let id = TenantTimelineId { + tenant_id: walreceiver_state.timeline.tenant_id, + timeline_id: walreceiver_state.timeline.timeline_id, + }; + + // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, + // running the entire loop step as much as possible to an end. 
+ // The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, + // forcing the etcd subscription to exit either way. + let mut broker_subscription = + subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await; + info!("Subscribed for etcd timeline changes, waiting for new etcd data"); + + loop { + let time_until_next_retry = walreceiver_state.time_until_next_retry(); + + // These things are happening concurrently: + // + // - keep receiving WAL on the current connection + // - if the shared state says we need to change connection, disconnect and return + // - this runs in a separate task and we receive updates via a watch channel + // - change connection if the rules decide so, or if the current connection dies + // - receive updates from broker + // - this might change the current desired connection + // - timeline state changes to something that does not allow walreceiver to run concurrently + select! { + broker_connection_result = &mut broker_subscription.watcher_handle => { + info!("Broker connection was closed from the other side, ending current broker loop step"); + cleanup_broker_connection(broker_connection_result, walreceiver_state); + return ControlFlow::Continue(()); + }, + + Some(wal_connection_update) = async { + match walreceiver_state.wal_connection.as_mut() { + Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), + None => None, + } + } => { + let wal_connection = walreceiver_state.wal_connection.as_mut() + .expect("Should have a connection, as checked by the corresponding select! guard"); + match wal_connection_update { + TaskEvent::Update(c) => { + match c { + TaskStateUpdate::Init | TaskStateUpdate::Started => {}, + TaskStateUpdate::Progress(status) => { + if status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); + } + wal_connection.status = status.to_owned(); + } + } + }, + TaskEvent::End(walreceiver_task_result) => { + match walreceiver_task_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => error!("wal receiver task finished with an error: {e:?}"), + } + walreceiver_state.drop_old_connection(false).await; + }, + } + }, + + // Got a new update from etcd + broker_update = broker_subscription.value_updates.recv() => { + match broker_update { + Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), + None => { + info!("Broker sender end was dropped, ending current broker loop step"); + // Ensure to cancel and wait for the broker subscription task end, to log its result. + // Broker sender end is in the broker subscription task and its drop means abnormal task completion. + // First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). + broker_subscription.watcher_handle.abort(); + // Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case), + // a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway. 
+ cleanup_broker_connection( + (&mut broker_subscription.watcher_handle).await, + walreceiver_state, + ); + return ControlFlow::Continue(()); + } + } + }, + + new_event = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = walreceiver_state.timeline.current_state(); + match new_state { + // we're already active as walreceiver, no need to reactivate + TimelineState::Active => continue, + TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state), + } + } + Err(_sender_dropped_error) => return ControlFlow::Break(()), + } + } + } => match new_event { + ControlFlow::Continue(new_state) => { + info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + return ControlFlow::Continue(()); + } + ControlFlow::Break(()) => { + info!("Timeline dropped state updates sender, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + }, + + _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} + } + + // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. + let mut max_events_to_poll = 100_u32; + while max_events_to_poll > 0 { + if let Ok(broker_update) = broker_subscription.value_updates.try_recv() { + walreceiver_state.register_timeline_update(broker_update); + max_events_to_poll -= 1; + } else { + break; + } + } + + if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { + info!("Switching to new connection candidate: {new_candidate:?}"); + walreceiver_state + .change_connection( + new_candidate.safekeeper_id, + new_candidate.wal_source_connstr, + ) + .await + } + } +} + +async fn wait_for_active_timeline( + timeline_state_updates: &mut watch::Receiver, +) -> ControlFlow<(), ()> { + let current_state = *timeline_state_updates.borrow(); + if current_state == TimelineState::Active { + return ControlFlow::Continue(()); + } + + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + TimelineState::Active => { + debug!("Timeline state changed to active, continuing the walreceiver connection manager"); + return ControlFlow::Continue(()); + } + state => { + debug!("Not running the walreceiver connection manager, timeline is not active: {state:?}"); + continue; + } + } + } + Err(_sender_dropped_error) => return ControlFlow::Break(()), + } + } +} + +fn cleanup_broker_connection( + broker_connection_result: Result, tokio::task::JoinError>, + walreceiver_state: &mut WalreceiverState, +) { + match broker_connection_result { + Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"), + Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"), + Err(abort_error) => { + if abort_error.is_panic() { + error!("Broker connection panicked: {abort_error}") + } else { + debug!("Broker connection aborted: {abort_error}") + } + } + } + + walreceiver_state.wal_stream_candidates.clear(); +} + +/// Endlessly try to subscribe for broker updates for a given timeline. +/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly. +/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. 
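The subscription helper just below retries forever, pacing its attempts with `exponential_backoff` and the `DEFAULT_BASE_BACKOFF_SECONDS` / `DEFAULT_MAX_BACKOFF_SECONDS` constants. A rough, stand-alone sketch of such a capped backoff schedule could look like the following; it illustrates the idea only and is not the crate's actual helper:

use std::time::Duration;

/// Illustrative capped exponential backoff: no wait for the first attempt,
/// then base * 2^(n-1) seconds, never exceeding `max_seconds`.
fn retry_delay(attempt: u32, base_seconds: f64, max_seconds: f64) -> Duration {
    if attempt == 0 {
        return Duration::ZERO;
    }
    let raw = base_seconds * 2f64.powi(attempt.saturating_sub(1) as i32);
    Duration::from_secs_f64(raw.min(max_seconds))
}

fn main() {
    // With base = 0.1s and max = 15s the delays grow 0, 0.1, 0.2, 0.4, ... and plateau at 15s.
    for attempt in 0..10 {
        println!("attempt {attempt}: wait {:?}", retry_delay(attempt, 0.1, 15.0));
    }
}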
+async fn subscribe_for_timeline_updates( + etcd_client: &mut Client, + broker_prefix: &str, + id: TenantTimelineId, +) -> BrokerSubscription { + let mut attempt = 0; + loop { + exponential_backoff( + attempt, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempt += 1; + + match etcd_broker::subscribe_for_json_values( + etcd_client, + SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), + ) + .instrument(info_span!("etcd_subscription")) + .await + { + Ok(new_subscription) => { + return new_subscription; + } + Err(e) => { + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + continue; + } + } + } +} + +const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1; +const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0; +const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5; + +/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible. +struct WalreceiverState { + id: TenantTimelineId, + + /// Use pageserver data about the timeline to filter out some of the safekeepers. + timeline: Arc, + /// The timeout on the connection to safekeeper for WAL streaming. + wal_connect_timeout: Duration, + /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one. + lagging_wal_timeout: Duration, + /// The Lsn lag to use to determine when the current connection is lagging to much behind and reconnect to the other one. + max_lsn_wal_lag: NonZeroU64, + /// Current connection to safekeeper for WAL streaming. + wal_connection: Option, + /// Info about retries and unsuccessful attempts to connect to safekeepers. + wal_connection_retries: HashMap, + /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. + wal_stream_candidates: HashMap, +} + +/// Current connection data. +#[derive(Debug)] +struct WalConnection { + /// Time when the connection was initiated. + started_at: NaiveDateTime, + /// Current safekeeper pageserver is connected to for WAL streaming. + sk_id: NodeId, + /// Status of the connection. + status: WalConnectionStatus, + /// WAL streaming task handle. + connection_task: TaskHandle, + /// Have we discovered that other safekeeper has more recent WAL than we do? + discovered_new_wal: Option, +} + +/// Notion of a new committed WAL, which exists on other safekeeper. +#[derive(Debug, Clone, Copy)] +struct NewCommittedWAL { + /// LSN of the new committed WAL. + lsn: Lsn, + /// When we discovered that the new committed WAL exists on other safekeeper. + discovered_at: NaiveDateTime, +} + +#[derive(Debug)] +struct RetryInfo { + next_retry_at: Option, + retry_duration_seconds: f64, +} + +/// Data about the timeline to connect to, received from etcd. +#[derive(Debug)] +struct EtcdSkTimeline { + timeline: SkTimelineInfo, + /// Etcd generation, the bigger it is, the more up to date the timeline data is. + etcd_version: i64, + /// Time at which the data was fetched from etcd last time, to track the stale data. 
+ latest_update: NaiveDateTime, +} + +impl WalreceiverState { + fn new( + timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, + ) -> Self { + let id = TenantTimelineId { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + }; + Self { + id, + timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + wal_connection: None, + wal_stream_candidates: HashMap::new(), + wal_connection_retries: HashMap::new(), + } + } + + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. + async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) { + self.drop_old_connection(true).await; + + let id = self.id; + let connect_timeout = self.wal_connect_timeout; + let timeline = Arc::clone(&self.timeline); + let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + async move { + super::walreceiver_connection::handle_walreceiver_connection( + timeline, + new_wal_source_connstr, + events_sender, + cancellation, + connect_timeout, + ) + .await + .context("walreceiver connection handling failure") + } + .instrument(info_span!("walreceiver_connection", id = %id)) + }); + + let now = Utc::now().naive_utc(); + self.wal_connection = Some(WalConnection { + started_at: now, + sk_id: new_sk_id, + status: WalConnectionStatus { + is_connected: false, + has_processed_wal: false, + latest_connection_update: now, + latest_wal_update: now, + streaming_lsn: None, + commit_lsn: None, + }, + connection_task: connection_handle, + discovered_new_wal: None, + }); + } + + /// Drops the current connection (if any) and updates retry timeout for the next + /// connection attempt to the same safekeeper. + async fn drop_old_connection(&mut self, needs_shutdown: bool) { + let wal_connection = match self.wal_connection.take() { + Some(wal_connection) => wal_connection, + None => return, + }; + + if needs_shutdown { + wal_connection.connection_task.shutdown().await; + } + + let retry = self + .wal_connection_retries + .entry(wal_connection.sk_id) + .or_insert(RetryInfo { + next_retry_at: None, + retry_duration_seconds: WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS, + }); + + let now = Utc::now().naive_utc(); + + // Schedule the next retry attempt. We want to have exponential backoff for connection attempts, + // and we add backoff to the time when we started the connection attempt. If the connection + // was active for a long time, then next_retry_at will be in the past. + retry.next_retry_at = + wal_connection + .started_at + .checked_add_signed(chrono::Duration::milliseconds( + (retry.retry_duration_seconds * 1000.0) as i64, + )); + + if let Some(next) = &retry.next_retry_at { + if next > &now { + info!( + "Next connection retry to {:?} is at {}", + wal_connection.sk_id, next + ); + } + } + + let next_retry_duration = + retry.retry_duration_seconds * WALCONNECTION_RETRY_BACKOFF_MULTIPLIER; + // Clamp the next retry duration to the maximum allowed. + let next_retry_duration = next_retry_duration.min(WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS); + // Clamp the next retry duration to the minimum allowed. + let next_retry_duration = next_retry_duration.max(WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS); + + retry.retry_duration_seconds = next_retry_duration; + } + + /// Returns time needed to wait to have a new candidate for WAL streaming. 
+ fn time_until_next_retry(&self) -> Option { + let now = Utc::now().naive_utc(); + + let next_retry_at = self + .wal_connection_retries + .values() + .filter_map(|retry| retry.next_retry_at) + .filter(|next_retry_at| next_retry_at > &now) + .min(); + + next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + } + + /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. + fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { + match self + .wal_stream_candidates + .entry(timeline_update.key.node_id) + { + hash_map::Entry::Occupied(mut o) => { + let existing_value = o.get_mut(); + if existing_value.etcd_version < timeline_update.etcd_version { + existing_value.etcd_version = timeline_update.etcd_version; + existing_value.timeline = timeline_update.value; + existing_value.latest_update = Utc::now().naive_utc(); + } + } + hash_map::Entry::Vacant(v) => { + v.insert(EtcdSkTimeline { + timeline: timeline_update.value, + etcd_version: timeline_update.etcd_version, + latest_update: Utc::now().naive_utc(), + }); + } + } + } + + /// Cleans up stale etcd records and checks the rest for the new connection candidate. + /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. + /// The current rules for approving new candidates: + /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps + /// * if there's no such entry, no new candidate found, abort + /// * otherwise check if the candidate is much better than the current one + /// + /// To understand exact rules for determining if the candidate is better than the current one, refer to this function's implementation. + /// General rules are following: + /// * if connected safekeeper is not present, pick the candidate + /// * if we haven't received any updates for some time, pick the candidate + /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate + /// + /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. + /// Both thresholds are configured per tenant. + fn next_connection_candidate(&mut self) -> Option { + self.cleanup_old_candidates(); + + match &self.wal_connection { + Some(existing_wal_connection) => { + let connected_sk_node = existing_wal_connection.sk_id; + + let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connstr) = + self.select_connection_candidate(Some(connected_sk_node))?; + + let now = Utc::now().naive_utc(); + if let Ok(latest_interaciton) = + (now - existing_wal_connection.status.latest_connection_update).to_std() + { + // Drop connection if we haven't received keepalive message for a while. + if latest_interaciton > self.wal_connect_timeout { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::NoKeepAlives { + last_keep_alive: Some( + existing_wal_connection.status.latest_connection_update, + ), + check_time: now, + threshold: self.wal_connect_timeout, + }, + }); + } + } + + if !existing_wal_connection.status.is_connected { + // We haven't connected yet and we shouldn't switch until connection timeout (condition above). 
+ return None; + } + + if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { + let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + // Check if the new candidate has much more WAL than the current one. + match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { + Some(new_sk_lsn_advantage) => { + if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::LaggingWal { + current_commit_lsn, + new_commit_lsn, + threshold: self.max_lsn_wal_lag, + }, + }); + } + } + None => debug!( + "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" + ), + } + } + + let current_lsn = match existing_wal_connection.status.streaming_lsn { + Some(lsn) => lsn, + None => self.timeline.get_last_record_lsn(), + }; + let current_commit_lsn = existing_wal_connection + .status + .commit_lsn + .unwrap_or(current_lsn); + let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + + // Keep discovered_new_wal only if connected safekeeper has not caught up yet. + let mut discovered_new_wal = existing_wal_connection + .discovered_new_wal + .filter(|new_wal| new_wal.lsn > current_commit_lsn); + + if discovered_new_wal.is_none() { + // Check if the new candidate has more WAL than the current one. + // If the new candidate has more WAL than the current one, we consider switching to the new candidate. + discovered_new_wal = if candidate_commit_lsn > current_commit_lsn { + trace!( + "New candidate has commit_lsn {}, higher than current_commit_lsn {}", + candidate_commit_lsn, + current_commit_lsn + ); + Some(NewCommittedWAL { + lsn: candidate_commit_lsn, + discovered_at: Utc::now().naive_utc(), + }) + } else { + None + }; + } + + let waiting_for_new_lsn_since = if current_lsn < current_commit_lsn { + // Connected safekeeper has more WAL, but we haven't received updates for some time. + trace!( + "Connected safekeeper has more WAL, but we haven't received updates for {:?}. current_lsn: {}, current_commit_lsn: {}", + (now - existing_wal_connection.status.latest_wal_update).to_std(), + current_lsn, + current_commit_lsn + ); + Some(existing_wal_connection.status.latest_wal_update) + } else { + discovered_new_wal.as_ref().map(|new_wal| { + // We know that new WAL is available on other safekeeper, but connected safekeeper don't have it. + new_wal + .discovered_at + .max(existing_wal_connection.status.latest_wal_update) + }) + }; + + // If we haven't received any WAL updates for a while and candidate has more WAL, switch to it. 
+ if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since { + if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() { + if candidate_commit_lsn > current_commit_lsn + && waiting_for_new_wal > self.lagging_wal_timeout + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::NoWalTimeout { + current_lsn, + current_commit_lsn, + candidate_commit_lsn, + last_wal_interaction: Some( + existing_wal_connection.status.latest_wal_update, + ), + check_time: now, + threshold: self.lagging_wal_timeout, + }, + }); + } + } + } + + self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; + } + None => { + let (new_sk_id, _, new_wal_source_connstr) = + self.select_connection_candidate(None)?; + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + wal_source_connstr: new_wal_source_connstr, + reason: ReconnectReason::NoExistingConnection, + }); + } + } + + None + } + + /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers. + /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. + /// + /// The candidate that is chosen: + /// * has no pending retry cooldown + /// * has greatest commit_lsn among the ones that are left + fn select_connection_candidate( + &self, + node_to_omit: Option, + ) -> Option<(NodeId, &SkTimelineInfo, String)> { + self.applicable_connection_candidates() + .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) + .max_by_key(|(_, info, _)| info.commit_lsn) + } + + /// Returns a list of safekeepers that have valid info and ready for connection. + /// Some safekeepers are filtered by the retry cooldown. + fn applicable_connection_candidates( + &self, + ) -> impl Iterator { + let now = Utc::now().naive_utc(); + + self.wal_stream_candidates + .iter() + .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(move |(sk_id, _)| { + let next_retry_at = self + .wal_connection_retries + .get(sk_id) + .and_then(|retry_info| { + retry_info.next_retry_at + }); + + next_retry_at.is_none() || next_retry_at.unwrap() <= now + }) + .filter_map(|(sk_id, etcd_info)| { + let info = &etcd_info.timeline; + match wal_stream_connection_string( + self.id, + info.safekeeper_connstr.as_deref()?, + ) { + Ok(connstr) => Some((*sk_id, info, connstr)), + Err(e) => { + error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id); + None + } + } + }) + } + + /// Remove candidates which haven't sent etcd updates for a while. 
+ fn cleanup_old_candidates(&mut self) { + let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); + + self.wal_stream_candidates.retain(|node_id, etcd_info| { + if let Ok(time_since_latest_etcd_update) = + (Utc::now().naive_utc() - etcd_info.latest_update).to_std() + { + let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout; + if !should_retain { + node_ids_to_remove.push(*node_id); + } + should_retain + } else { + true + } + }); + + for node_id in node_ids_to_remove { + self.wal_connection_retries.remove(&node_id); + } + } + + async fn shutdown(mut self) { + if let Some(wal_connection) = self.wal_connection.take() { + wal_connection.connection_task.shutdown().await; + } + } +} + +#[derive(Debug, PartialEq, Eq)] +struct NewWalConnectionCandidate { + safekeeper_id: NodeId, + wal_source_connstr: String, + reason: ReconnectReason, +} + +/// Stores the reason why WAL connection was switched, for furter debugging purposes. +#[derive(Debug, PartialEq, Eq)] +enum ReconnectReason { + NoExistingConnection, + LaggingWal { + current_commit_lsn: Lsn, + new_commit_lsn: Lsn, + threshold: NonZeroU64, + }, + NoWalTimeout { + current_lsn: Lsn, + current_commit_lsn: Lsn, + candidate_commit_lsn: Lsn, + last_wal_interaction: Option, + check_time: NaiveDateTime, + threshold: Duration, + }, + NoKeepAlives { + last_keep_alive: Option, + check_time: NaiveDateTime, + threshold: Duration, + }, +} + +fn wal_stream_connection_string( + TenantTimelineId { + tenant_id, + timeline_id, + }: TenantTimelineId, + listen_pg_addr_str: &str, +) -> anyhow::Result { + let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db"); + sk_connstr + .parse() + .context("bad url") + .and_then(|url: url::Url| { + let host = url.host_str().context("host is missing")?; + let port = url.port().unwrap_or(5432); // default PG port + + Ok(format!( + "host={host} \ + port={port} \ + options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" + )) + }) + .with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'")) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + + #[test] + fn no_connection_no_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("no_connection_no_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout; + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: None, + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some("no_commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: None, + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + 
safekeeper_connstr: Some("no_commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(3), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + safekeeper_connstr: None, + }, + etcd_version: 0, + latest_update: delay_over_threshold, + }, + ), + ]); + + let no_candidate = state.next_connection_candidate(); + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of non full data options, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[tokio::test] + async fn connection_no_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("connection_no_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + let current_lsn = 100_000; + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(Lsn(current_lsn)), + streaming_lsn: Some(Lsn(current_lsn)), + }; + + state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap(); + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + status: connection_status.clone(), + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status.clone())) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some("not_advanced_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + + let no_candidate = state.next_connection_candidate(); + assert!( + no_candidate.is_none(), + "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}" + ); + + Ok(()) + } + + #[test] + fn no_connection_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("no_connection_candidate")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(1 + 
state.max_lsn_wal_lag.get())), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let only_candidate = state + .next_connection_candidate() + .expect("Expected one candidate selected out of the only data option, but got none"); + assert_eq!(only_candidate.safekeeper_id, NodeId(0)); + assert_eq!( + only_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(only_candidate + .wal_source_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + let selected_lsn = 100_000; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn - 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some("smaller_commit_lsn".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(2), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(Lsn(selected_lsn + 100)), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: None, + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + let biggest_wal_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + biggest_wal_candidate.reason, + ReconnectReason::NoExistingConnection, + "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold" + ); + assert!(biggest_wal_candidate + .wal_source_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let mut state = dummy_state(&harness); + let now = Utc::now().naive_utc(); + + let current_lsn = Lsn(100_000).align(); + let bigger_lsn = Lsn(current_lsn.0 + 100).align(); + + state.wal_connection = None; + state.wal_stream_candidates = HashMap::from([ + ( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(bigger_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: 
Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + state.wal_connection_retries = HashMap::from([( + NodeId(0), + RetryInfo { + next_retry_at: now.checked_add_signed(chrono::Duration::hours(1)), + retry_duration_seconds: WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS, + }, + )]); + + let candidate_with_less_errors = state + .next_connection_candidate() + .expect("Expected one candidate selected, but got none"); + assert_eq!( + candidate_with_less_errors.safekeeper_id, + NodeId(1), + "Should select the node with no pending retry cooldown" + ); + + Ok(()) + } + + #[tokio::test] + async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1); + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + status: connection_status.clone(), + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status.clone())) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ( + NodeId(1), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + ), + ]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + over_threshcurrent_candidate.reason, + ReconnectReason::LaggingWal { + current_commit_lsn: current_lsn, + new_commit_lsn: new_lsn, + threshold: state.max_lsn_wal_lag + }, + "Should select bigger WAL safekeeper if it starts to lag enough" + ); + assert!(over_threshcurrent_candidate + .wal_source_connstr + .contains("advanced_by_lsn_safekeeper")); + + Ok(()) + } + + #[tokio::test] + async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout; + + let connection_status = WalConnectionStatus { + is_connected: 
true, + has_processed_wal: true, + latest_connection_update: time_over_threshold, + latest_wal_update: time_over_threshold, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: NodeId(1), + status: connection_status.clone(), + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status.clone())) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(current_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoKeepAlives { + last_keep_alive, + threshold, + .. + } => { + assert_eq!(last_keep_alive, Some(time_over_threshold)); + assert_eq!(threshold, state.lagging_wal_timeout); + } + unexpected => panic!("Unexpected reason: {unexpected:?}"), + } + assert!(over_threshcurrent_candidate + .wal_source_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + #[tokio::test] + async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let mut state = dummy_state(&harness); + let current_lsn = Lsn(100_000).align(); + let new_lsn = Lsn(100_100).align(); + let now = Utc::now().naive_utc(); + + let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?; + let time_over_threshold = + Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout; + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: time_over_threshold, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: NodeId(1), + status: connection_status, + connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + discovered_new_wal: Some(NewCommittedWAL { + discovered_at: time_over_threshold, + lsn: new_lsn, + }), + }); + state.wal_stream_candidates = HashMap::from([( + NodeId(0), + EtcdSkTimeline { + timeline: SkTimelineInfo { + last_log_term: None, + flush_lsn: None, + commit_lsn: Some(new_lsn), + backup_lsn: None, + remote_consistent_lsn: None, + peer_horizon_lsn: None, + local_start_lsn: None, + + safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()), + }, + etcd_version: 0, + latest_update: now, + }, + )]); + + let over_threshcurrent_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0)); + match over_threshcurrent_candidate.reason { + ReconnectReason::NoWalTimeout { + current_lsn, + current_commit_lsn, + candidate_commit_lsn, + last_wal_interaction, + threshold, + .. 
+ } => { + assert_eq!(current_lsn, current_lsn); + assert_eq!(current_commit_lsn, current_lsn); + assert_eq!(candidate_commit_lsn, new_lsn); + assert_eq!(last_wal_interaction, Some(time_over_threshold)); + assert_eq!(threshold, state.lagging_wal_timeout); + } + unexpected => panic!("Unexpected reason: {unexpected:?}"), + } + assert!(over_threshcurrent_candidate + .wal_source_connstr + .contains(DUMMY_SAFEKEEPER_CONNSTR)); + + Ok(()) + } + + const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr"; + + fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + WalreceiverState { + id: TenantTimelineId { + tenant_id: harness.tenant_id, + timeline_id: TIMELINE_ID, + }, + timeline: harness + .load() + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) + .expect("Failed to create an empty timeline for dummy wal connection manager") + .initialize() + .unwrap(), + wal_connect_timeout: Duration::from_secs(1), + lagging_wal_timeout: Duration::from_secs(1), + max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), + wal_connection: None, + wal_stream_candidates: HashMap::new(), + wal_connection_retries: HashMap::new(), + } + } +} diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs new file mode 100644 index 0000000000..0070834288 --- /dev/null +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -0,0 +1,402 @@ +//! Actual Postgres connection handler to stream WAL to the server. + +use std::{ + str::FromStr, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::{bail, ensure, Context}; +use bytes::BytesMut; +use chrono::{NaiveDateTime, Utc}; +use fail::fail_point; +use futures::StreamExt; +use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres_ffi::v14::xlog_utils::normalize_lsn; +use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::{pin, select, sync::watch, time}; +use tokio_postgres::{replication::ReplicationStream, Client}; +use tracing::{debug, error, info, trace, warn}; + +use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use crate::{ + task_mgr, + task_mgr::TaskKind, + task_mgr::WALRECEIVER_RUNTIME, + tenant::{Timeline, WalReceiverInfo}, + tenant_mgr, + walingest::WalIngest, + walrecord::DecodedWALRecord, +}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use pq_proto::ReplicationFeedback; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +/// Status of the connection. +#[derive(Debug, Clone)] +pub struct WalConnectionStatus { + /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. + pub is_connected: bool, + /// Defines a healthy connection as one on which pageserver received WAL from safekeeper + /// and is able to process it in walingest without errors. + pub has_processed_wal: bool, + /// Connection establishment time or the timestamp of a latest connection message received. + pub latest_connection_update: NaiveDateTime, + /// Time of the latest WAL message received. + pub latest_wal_update: NaiveDateTime, + /// Latest WAL update contained WAL up to this LSN. Next WAL message with start from that LSN. + pub streaming_lsn: Option, + /// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet. + pub commit_lsn: Option, +} + +/// Open a connection to the given safekeeper and receive WAL, sending back progress +/// messages as we go. 
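`handle_walreceiver_connection` below reports its progress through the `events_sender` watch channel as `TaskStateUpdate::Progress(WalConnectionStatus)`, and the connection manager only ever observes the most recent value. A small stand-alone sketch of that producer/observer pattern, with a trimmed-down status type standing in for `WalConnectionStatus`, might look like this (it assumes the `tokio` crate with its runtime and macro features, which the patch already depends on):

use tokio::sync::watch;

// Trimmed-down stand-in for the patch's WalConnectionStatus.
#[derive(Debug, Clone, Default)]
struct Status {
    is_connected: bool,
    streaming_lsn: Option<u64>,
}

#[tokio::main]
async fn main() {
    let (events_sender, mut events_receiver) = watch::channel(Status::default());

    // The "connection" side publishes progress as it goes; a send only fails
    // if the listener went away, in which case the task can simply stop.
    let producer = tokio::spawn(async move {
        for lsn in [100u64, 200, 300] {
            if events_sender
                .send(Status { is_connected: true, streaming_lsn: Some(lsn) })
                .is_err()
            {
                return;
            }
        }
        // Dropping the sender wakes the receiver with an error, signalling "task ended".
    });

    // The "manager" side observes the latest status; intermediate updates may be
    // skipped, because a watch channel only keeps the most recent value.
    while events_receiver.changed().await.is_ok() {
        let status = events_receiver.borrow().clone();
        println!("latest status: {status:?}");
    }

    producer.await.unwrap();
}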
+pub async fn handle_walreceiver_connection( + timeline: Arc, + wal_source_connstr: String, + events_sender: watch::Sender>, + mut cancellation: watch::Receiver<()>, + connect_timeout: Duration, +) -> anyhow::Result<()> { + // Connect to the database in replication mode. + info!("connecting to {wal_source_connstr}"); + let connect_cfg = format!("{wal_source_connstr} application_name=pageserver replication=true"); + + let (mut replication_client, connection) = time::timeout( + connect_timeout, + tokio_postgres::connect(&connect_cfg, postgres::NoTls), + ) + .await + .context("Timed out while waiting for walreceiver connection to open")? + .context("Failed to open walreceiver connection")?; + + info!("connected!"); + let mut connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: false, + latest_connection_update: Utc::now().naive_utc(), + latest_wal_update: Utc::now().naive_utc(), + streaming_lsn: None, + commit_lsn: None, + }; + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); + return Ok(()); + } + + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + let mut connection_cancellation = cancellation.clone(); + task_mgr::spawn( + WALRECEIVER_RUNTIME.handle(), + TaskKind::WalReceiverConnection, + Some(timeline.tenant_id), + Some(timeline.timeline_id), + "walreceiver connection", + false, + async move { + select! { + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, + + _ = connection_cancellation.changed() => info!("Connection cancelled"), + } + Ok(()) + }, + ); + + // Immediately increment the gauge, then create a job to decrement it on task exit. + // One of the pros of `defer!` is that this will *most probably* + // get called, even in presence of panics. + let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + gauge.inc(); + scopeguard::defer! { + gauge.dec(); + } + + let identify = identify_system(&mut replication_client).await?; + info!("{identify:?}"); + + let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); + let mut caught_up = false; + + connection_status.latest_connection_update = Utc::now().naive_utc(); + connection_status.latest_wal_update = Utc::now().naive_utc(); + connection_status.commit_lsn = Some(end_of_wal); + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); + return Ok(()); + } + + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let tenant = tenant_mgr::get_tenant(tenant_id, true)?; + + // + // Start streaming the WAL, from where we left off previously. + // + // If we had previously received WAL up to some point in the middle of a WAL record, we + // better start from the end of last full WAL record, not in the middle of one. + let mut last_rec_lsn = timeline.get_last_record_lsn(); + let mut startpoint = last_rec_lsn; + + if startpoint == Lsn(0) { + bail!("No previous WAL position"); + } + + // There might be some padding after the last full record, skip it. 
+ startpoint += startpoint.calc_padding(8u32); + + // If the starting point is at a WAL page boundary, skip past the page header. We don't need the page headers + // for anything, and in some corner cases, the compute node might have never generated the WAL for page headers + //. That happens if you create a branch at page boundary: the start point of the branch is at the page boundary, + // but when the compute node first starts on the branch, we normalize the first REDO position to just after the page + // header (see generate_pg_control()), so the WAL for the page header is never streamed from the compute node + // to the safekeepers. + startpoint = normalize_lsn(startpoint, WAL_SEGMENT_SIZE); + + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}..."); + + let query = format!("START_REPLICATION PHYSICAL {startpoint}"); + + let copy_stream = replication_client.copy_both_simple(&query).await?; + let physical_stream = ReplicationStream::new(copy_stream); + pin!(physical_stream); + + let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); + + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + + while let Some(replication_message) = { + select! { + _ = cancellation.changed() => { + info!("walreceiver interrupted"); + None + } + replication_message = physical_stream.next() => replication_message, + } + } { + let replication_message = replication_message?; + let now = Utc::now().naive_utc(); + let last_rec_lsn_before_msg = last_rec_lsn; + + // Update the connection status before processing the message. If the message processing + // fails (e.g. in walingest), we still want to know latests LSNs from the safekeeper. + match &replication_message { + ReplicationMessage::XLogData(xlog_data) => { + connection_status.latest_connection_update = now; + connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end())); + connection_status.streaming_lsn = Some(Lsn::from( + xlog_data.wal_start() + xlog_data.data().len() as u64, + )); + if !xlog_data.data().is_empty() { + connection_status.latest_wal_update = now; + } + } + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + connection_status.latest_connection_update = now; + connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end())); + } + &_ => {} + }; + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + + let status_update = match replication_message { + ReplicationMessage::XLogData(xlog_data) => { + // Pass the WAL data to the decoder, and see if we can decode + // more records as a result. + let data = xlog_data.data(); + let startlsn = Lsn::from(xlog_data.wal_start()); + let endlsn = startlsn + data.len() as u64; + + trace!("received XLogData between {startlsn} and {endlsn}"); + + waldecoder.feed_bytes(data); + + { + let mut decoded = DecodedWALRecord::default(); + let mut modification = timeline.begin_modification(endlsn); + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + // let _enter = info_span!("processing record", lsn = %lsn).entered(); + + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. 
+ ensure!(lsn.is_aligned()); + + walingest + .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .context("could not ingest record at {lsn}")?; + + fail_point!("walreceiver-after-ingest"); + + last_rec_lsn = lsn; + } + } + + if !caught_up && endlsn >= end_of_wal { + info!("caught up at LSN {endlsn}"); + caught_up = true; + } + + Some(endlsn) + } + + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + let wal_end = keepalive.wal_end(); + let timestamp = keepalive.timestamp(); + let reply_requested = keepalive.reply() != 0; + + trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + + if reply_requested { + Some(last_rec_lsn) + } else { + None + } + } + + _ => None, + }; + + if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { + // We have successfully processed at least one WAL record. + connection_status.has_processed_wal = true; + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) + { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + + timeline.check_checkpoint_distance().with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; + + if let Some(last_lsn) = status_update { + let remote_index = tenant.get_remote_index(); + let timeline_remote_consistent_lsn = remote_index + .read() + .await + // here we either do not have this timeline in remote index + // or there were no checkpoints for it yet + .timeline_entry(&TenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) + // no checkpoint was uploaded + .unwrap_or(Lsn(0)); + + // The last LSN we processed. It is not guaranteed to survive pageserver crash. + let write_lsn = u64::from(last_lsn); + // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data + let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash + // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. + let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let ts = SystemTime::now(); + + // Update the status about what we just received. This is shown in the mgmt API. + let last_received_wal = WalReceiverInfo { + wal_source_connstr: wal_source_connstr.to_owned(), + last_received_msg_lsn: last_lsn, + last_received_msg_ts: ts + .duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + }; + *timeline.last_received_wal.lock().unwrap() = Some(last_received_wal); + + // Send the replication feedback message. + // Regular standby_status_update fields are put into this message. + let status_update = ReplicationFeedback { + current_timeline_size: timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?, + ps_writelsn: write_lsn, + ps_flushlsn: flush_lsn, + ps_applylsn: apply_lsn, + ps_replytime: ts, + }; + + debug!("neon_status_update {status_update:?}"); + + let mut data = BytesMut::new(); + status_update.serialize(&mut data)?; + physical_stream + .as_mut() + .zenith_status_update(data.len() as u64, &data) + .await?; + } + } + + Ok(()) +} + +/// Data returned from the postgres `IDENTIFY_SYSTEM` command +/// +/// See the [postgres docs] for more details. 
+///
+/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
+#[derive(Debug)]
+// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as
+// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900
+#[allow(dead_code)]
+struct IdentifySystem {
+    systemid: u64,
+    timeline: u32,
+    xlogpos: PgLsn,
+    dbname: Option<String>,
+}
+
+/// There was a problem parsing the response to
+/// a postgres IDENTIFY_SYSTEM command.
+#[derive(Debug, thiserror::Error)]
+#[error("IDENTIFY_SYSTEM parse error")]
+struct IdentifyError;
+
+/// Run the postgres `IDENTIFY_SYSTEM` command
+async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
+    let query_str = "IDENTIFY_SYSTEM";
+    let response = client.simple_query(query_str).await?;
+
+    // get(N) from row, then parse it as some destination type.
+    fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
+    where
+        T: FromStr,
+    {
+        let val = row.get(idx).ok_or(IdentifyError)?;
+        val.parse::<T>().or(Err(IdentifyError))
+    }
+
+    // extract the row contents into an IdentifySystem struct.
+    // written as a closure so I can use ? for Option here.
+    if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
+        Ok(IdentifySystem {
+            systemid: get_parse(first_row, 0)?,
+            timeline: get_parse(first_row, 1)?,
+            xlogpos: get_parse(first_row, 2)?,
+            dbname: get_parse(first_row, 3).ok(),
+        })
+    } else {
+        Err(IdentifyError.into())
+    }
+}
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 378a015d4a..38fb9a4247 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -1,16 +1,61 @@
 //!
 //! Functions for parsing WAL records.
 //!
+use anyhow::Result;
 use bytes::{Buf, Bytes};
 use postgres_ffi::pg_constants;
-use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD};
-use postgres_ffi::XLogRecord;
-use postgres_ffi::{BlockNumber, OffsetNumber};
+use postgres_ffi::BLCKSZ;
+use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
+use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
 use serde::{Deserialize, Serialize};
 use tracing::*;
+use utils::bin_ser::DeserializeError;
 
-use crate::repository::ZenithWalRecord;
+/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
+/// around a PostgreSQL WAL record, or a custom neon-specific "record".
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub enum NeonWalRecord {
+    /// Native PostgreSQL WAL record
+    Postgres { will_init: bool, rec: Bytes },
+
+    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
+    ClearVisibilityMapFlags {
+        new_heap_blkno: Option<u32>,
+        old_heap_blkno: Option<u32>,
+        flags: u8,
+    },
+    /// Mark transaction IDs as committed on a CLOG page
+    ClogSetCommitted {
+        xids: Vec<TransactionId>,
+        timestamp: TimestampTz,
+    },
+    /// Mark transaction IDs as aborted on a CLOG page
+    ClogSetAborted { xids: Vec<TransactionId> },
+    /// Extend multixact offsets SLRU
+    MultixactOffsetCreate {
+        mid: MultiXactId,
+        moff: MultiXactOffset,
+    },
+    /// Extend multixact members SLRU.
+    MultixactMembersCreate {
+        moff: MultiXactOffset,
+        members: Vec<MultiXactMember>,
+    },
+}
+
+impl NeonWalRecord {
+    /// Does replaying this WAL record initialize the page from scratch, or does
+    /// it need to be applied over the previous image of the page?
+ pub fn will_init(&self) -> bool { + match self { + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, + + // None of the special neon record types currently initialize the page + _ => false, + } + } +} /// DecodedBkpBlock represents per-page data contained in a WAL record. #[derive(Default)] @@ -51,6 +96,7 @@ impl DecodedBkpBlock { } } +#[derive(Default)] pub struct DecodedWALRecord { pub xl_xid: TransactionId, pub xl_info: u8, @@ -87,6 +133,28 @@ impl XlRelmapUpdate { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlSmgrCreate { + pub rnode: RelFileNode, + // FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have + // well-defined size? + pub forknum: u8, +} + +impl XlSmgrCreate { + pub fn decode(buf: &mut Bytes) -> XlSmgrCreate { + XlSmgrCreate { + rnode: RelFileNode { + spcnode: buf.get_u32_le(), /* tablespace */ + dbnode: buf.get_u32_le(), /* database */ + relnode: buf.get_u32_le(), /* relation */ + }, + forknum: buf.get_u32_le() as u8, + } + } +} + #[repr(C)] #[derive(Debug)] pub struct XlSmgrTruncate { @@ -268,12 +336,11 @@ impl XlXactParsedRecord { let info = xl_info & pg_constants::XLOG_XACT_OPMASK; // The record starts with time of commit/abort let xact_time = buf.get_i64_le(); - let xinfo; - if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { - xinfo = buf.get_u32_le(); + let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 { + buf.get_u32_le() } else { - xinfo = 0; - } + 0 + }; let db_id; let ts_id; if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 { @@ -322,6 +389,16 @@ impl XlXactParsedRecord { xid = buf.get_u32_le(); trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE"); } + + if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + let nitems = buf.get_i32_le(); + debug!( + "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", + nitems + ); + //FIXME: do we need to handle dropped stats here? + } + XlXactParsedRecord { xid, info, @@ -439,7 +516,18 @@ impl XlMultiXactTruncate { // block data // ... // main data -pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, + pg_version: u32, +) -> Result<()> { let mut rnode_spcnode: u32 = 0; let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; @@ -450,7 +538,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { // 1. Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; trace!( "decode_wal_record xl_rmid = {} xl_info = {}", @@ -468,7 +556,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { let mut blocks_total_len: u32 = 0; let mut main_data_len = 0; let mut datatotal: u32 = 0; - let mut blocks: Vec = Vec::new(); + decoded.blocks.clear(); // 2. Decode the headers. 
// XLogRecordBlockHeaders if any, @@ -502,7 +590,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { 0..=pg_constants::XLR_MAX_BLOCK_ID => { /* XLogRecordBlockHeader */ let mut blk = DecodedBkpBlock::new(); - let fork_flags: u8; if block_id <= max_block_id { // TODO @@ -515,7 +602,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { } max_block_id = block_id; - fork_flags = buf.get_u8(); + let fork_flags: u8 = buf.get_u8(); blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; blk.flags = fork_flags; blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; @@ -533,16 +620,28 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { blk.hole_offset = buf.get_u16_le(); blk.bimg_info = buf.get_u8(); - blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0; + blk.apply_image = if pg_version == 14 { + (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0 + } else { + assert_eq!(pg_version, 15); + (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0 + }; - if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 { + let blk_img_is_compressed = + postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?; + + if blk_img_is_compressed { + debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { blk.hole_length = buf.get_u16_le(); } else { blk.hole_length = 0; } } else { - blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len; + blk.hole_length = BLCKSZ - blk.bimg_len; } datatotal += blk.bimg_len as u32; blocks_total_len += blk.bimg_len as u32; @@ -552,9 +651,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { * bimg_len < BLCKSZ if the HAS_HOLE flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 - || blk.hole_length == 0 - || blk.bimg_len == pg_constants::BLCKSZ) + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) { // TODO /* @@ -590,9 +687,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED * flag is set. */ - if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0) - && blk.bimg_len == pg_constants::BLCKSZ - { + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { // TODO /* report_invalid_record(state, @@ -608,8 +703,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { * IS_COMPRESSED flag is set. */ if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0 - && blk.bimg_len != pg_constants::BLCKSZ + && !blk_img_is_compressed + && blk.bimg_len != BLCKSZ { // TODO /* @@ -648,7 +743,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { blk.blkno ); - blocks.push(blk); + decoded.blocks.push(blk); } _ => { @@ -659,7 +754,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { // 3. Decode blocks. 
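+    // (Note: the loop below only records byte offsets into `record` for each
+    // block's image and data; the bytes themselves stay in the original buffer
+    // and are not copied out here.)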
let mut ptr = record.len() - buf.remaining(); - for blk in blocks.iter_mut() { + for blk in decoded.blocks.iter_mut() { if blk.has_image { blk.bimg_offset = ptr as u32; ptr += blk.bimg_len as usize; @@ -679,34 +774,31 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { assert_eq!(buf.remaining(), main_data_len as usize); } - DecodedWALRecord { - xl_xid: xlogrec.xl_xid, - xl_info: xlogrec.xl_info, - xl_rmid: xlogrec.xl_rmid, - record, - blocks, - main_data_offset, - } + decoded.xl_xid = xlogrec.xl_xid; + decoded.xl_info = xlogrec.xl_info; + decoded.xl_rmid = xlogrec.xl_rmid; + decoded.record = record; + decoded.main_data_offset = main_data_offset; + + Ok(()) } /// /// Build a human-readable string to describe a WAL record /// /// For debugging purposes -pub fn describe_wal_record(rec: &ZenithWalRecord) -> String { +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { match rec { - ZenithWalRecord::Postgres { will_init, rec } => { - format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec) - ) - } - _ => format!("{:?}", rec), + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), } } -fn describe_postgres_wal_record(record: &Bytes) -> String { +fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? // Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, @@ -719,7 +811,7 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { // 1. Parse XLogRecord struct // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf); + let xlogrec = XLogRecord::from_bytes(&mut buf)?; let unknown_str: String; @@ -767,5 +859,5 @@ fn describe_postgres_wal_record(record: &Bytes) -> String { } }; - String::from(result) + Ok(String::from(result)) } diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 877b81b8d5..f05bf46d96 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -10,7 +10,7 @@ //! process. Then we get the page image back. Communication with the //! postgres process happens via stdin/stdout //! -//! See src/backend/tcop/zenith_wal_redo.c for the other side of +//! See pgxn/neon_walredo/walredoproc.c for the other side of //! this communication. //! //! The Postgres process is assumed to be secure against malicious WAL @@ -20,35 +20,42 @@ //! 
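+// A rough usage sketch (the tenant/timeline plumbing around it is assumed
+// here, not shown in this file): the page reconstruction code gathers a base
+// image and the WAL records that apply to a single page, then asks the
+// manager for the materialized page:
+//
+//     let redo_manager = PostgresRedoManager::new(conf, tenant_id);
+//     let page: Bytes = redo_manager.request_redo(
+//         key,            // which page (relation block or SLRU block)
+//         lsn,            // reconstruct the page as it was at this LSN
+//         Some(base_img), // latest full page image, if we have one
+//         records,        // Vec<(Lsn, NeonWalRecord)> to replay on top of it
+//         pg_version,     // selects which postgres binaries to run
+//     )?;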
use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; -use lazy_static::lazy_static; -use log::*; use nix::poll::*; use serde::Serialize; -use std::fs; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; +use std::ops::{Deref, DerefMut}; use std::os::unix::io::AsRawFd; +use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::Mutex; use std::time::Duration; use std::time::Instant; -use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter}; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::nonblock::set_nonblock; -use zenith_utils::zid::ZTenantId; +use std::{fs, io}; +use tracing::*; +use utils::crashsafe::path_with_suffix_extension; +use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; -use crate::config::PageServerConf; -use crate::relish::*; -use crate::repository::ZenithWalRecord; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift; -use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset; -use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset; -use postgres_ffi::nonrelfile_utils::transaction_id_set_status; +use crate::metrics::{ + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_WAIT_TIME, +}; +use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; +use crate::repository::Key; +use crate::task_mgr::BACKGROUND_RUNTIME; +use crate::walrecord::NeonWalRecord; +use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::BLCKSZ; /// /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. @@ -75,76 +82,36 @@ pub trait WalRedoManager: Send + Sync { /// the reords. fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result; } -/// -/// A dummy WAL Redo Manager implementation that doesn't allow replaying -/// anything. Currently used during bootstrapping (zenith init), to create -/// a Repository object without launching the real WAL redo process. -/// -pub struct DummyRedoManager {} -impl crate::walredo::WalRedoManager for DummyRedoManager { - fn request_redo( - &self, - _rel: RelishTag, - _blknum: u32, - _lsn: Lsn, - _base_img: Option, - _records: Vec<(Lsn, ZenithWalRecord)>, - ) -> Result { - Err(WalRedoError::InvalidState) - } -} - -// Metrics collected on WAL redo operations -// -// We collect the time spent in actual WAL redo ('redo'), and time waiting -// for access to the postgres process ('wait') since there is only one for -// each tenant. -lazy_static! 
{ - static ref WAL_REDO_TIME: Histogram = - register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo") - .expect("failed to define a metric"); - static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!( - "pageserver_wal_redo_wait_time", - "Time spent waiting for access to the WAL redo process" - ) - .expect("failed to define a metric"); - static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!( - "pageserver_wal_records_replayed", - "Number of WAL records replayed" - ) - .unwrap(); -} - /// /// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the processs at a time, +/// perform WAL replay. Only one thread can use the process at a time, /// that is controlled by the Mutex. In the future, we might want to /// launch a pool of processes to allow concurrent replay of multiple /// records. /// pub struct PostgresRedoManager { - tenantid: ZTenantId, + tenant_id: TenantId, conf: &'static PageServerConf, process: Mutex>, } -/// Can this request be served by zenith redo funcitons +/// Can this request be served by neon redo functions /// or we need to pass it to wal-redo postgres process? -fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { +fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in zenith. + // Postgres WAL records. But everything else is handled in neon. #[allow(clippy::match_like_matches_macro)] match rec { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, } => false, @@ -152,28 +119,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { } } -fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool { - if let RelishTag::Relation(RelTag { - forknum, - spcnode: _, - dbnode: _, - relnode: _, - }) = rel - { - *forknum == expected_forknum - } else { - false - } -} - -fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool { - if let RelishTag::Slru { slru, segno } = rel { - *slru == expected_slru && *segno == expected_segno - } else { - false - } -} - /// An error happened in WAL redo #[derive(Debug, thiserror::Error)] pub enum WalRedoError { @@ -184,6 +129,8 @@ pub enum WalRedoError { InvalidState, #[error("cannot perform WAL redo for this request")] InvalidRequest, + #[error("cannot perform WAL redo for this record")] + InvalidRecord, } /// @@ -198,11 +145,11 @@ impl WalRedoManager for PostgresRedoManager { /// fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, - records: Vec<(Lsn, ZenithWalRecord)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, ) -> Result { if records.is_empty() { error!("invalid WAL redo request with no records"); @@ -210,41 +157,41 @@ impl WalRedoManager for PostgresRedoManager { } let mut img: Option = base_img; - let mut batch_zenith = can_apply_in_zenith(&records[0].1); + let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { - let rec_zenith = can_apply_in_zenith(&records[i].1); + let rec_neon = can_apply_in_neon(&records[i].1); - if rec_zenith != batch_zenith { - let result = if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i]) + if rec_neon != batch_neon { + let result = if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, 
img, &records[batch_start..i], self.conf.wal_redo_timeout, + pg_version, ) }; img = Some(result?); - batch_zenith = rec_zenith; + batch_neon = rec_neon; batch_start = i; } } // last batch - if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..]) + if batch_neon { + self.apply_batch_neon(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, img, &records[batch_start..], self.conf.wal_redo_timeout, + pg_version, ) } } @@ -254,10 +201,10 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager { + pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenantid, + tenant_id, conf, process: Mutex::new(None), } @@ -268,46 +215,55 @@ impl PostgresRedoManager { /// fn apply_batch_postgres( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, + pg_version: u32, ) -> Result { - let start_time = Instant::now(); + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; - let apply_result: Result; + let start_time = Instant::now(); let mut process_guard = self.process.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenantid)?; + let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); - let result = if let RelishTag::Relation(rel) = rel { - // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; - apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout); - - apply_result.map_err(WalRedoError::IoError) - } else { - error!("unexpected non-relation relish: {:?}", rel); - Err(WalRedoError::InvalidRequest) - }; + // Relational WAL records are applied using wal-redo-postgres + let buf_tag = BufferTag { rel, blknum }; + let result = process + .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + .map_err(WalRedoError::IoError); let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); + + let len = records.len(); + let nbytes = records.iter().fold(0, |acumulator, record| { + acumulator + + match &record.1 { + NeonWalRecord::Postgres { rec, .. } => rec.len(), + _ => unreachable!("Only PostgreSQL records are accepted in this batch"), + } + }); + WAL_REDO_TIME.observe(duration.as_secs_f64()); + WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64); + WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64); + debug!( - "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}", - records.len(), + "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}", + len, + nbytes, duration.as_micros(), lsn ); @@ -315,6 +271,12 @@ impl PostgresRedoManager { // If something went wrong, don't try to reuse the process. Kill it, and // next request will launch a new one. 
if result.is_err() { + error!( + "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}", + records.len(), + nbytes, + lsn + ); let process = process_guard.take().unwrap(); process.kill(); } @@ -322,15 +284,14 @@ impl PostgresRedoManager { } /// - /// Process a batch of WAL records using bespoken Zenith code. + /// Process a batch of WAL records using bespoken Neon code. /// - fn apply_batch_zenith( + fn apply_batch_neon( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], ) -> Result { let start_time = Instant::now(); @@ -340,13 +301,13 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - error!("invalid zenith WAL redo request with no base image"); + error!("invalid neon WAL redo request with no base image"); return Err(WalRedoError::InvalidRequest); } // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?; + self.apply_record_neon(key, &mut page, *record_lsn, record)?; } // Success! let end_time = Instant::now(); @@ -354,7 +315,7 @@ impl PostgresRedoManager { WAL_REDO_TIME.observe(duration.as_secs_f64()); debug!( - "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}", + "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}", records.len(), duration.as_micros(), lsn @@ -363,28 +324,31 @@ impl PostgresRedoManager { Ok(page.freeze()) } - fn apply_record_zenith( + fn apply_record_neon( &self, - rel: RelishTag, - blknum: u32, + key: Key, page: &mut BytesMut, _record_lsn: Lsn, - record: &ZenithWalRecord, + record: &NeonWalRecord, ) -> Result<(), WalRedoError> { match record { - ZenithWalRecord::Postgres { + NeonWalRecord::Postgres { will_init: _, rec: _, - } => panic!("tried to pass postgres wal record to zenith WAL redo"), - ZenithWalRecord::ClearVisibilityMapFlags { + } => { + error!("tried to pass postgres wal record to neon WAL redo"); + return Err(WalRedoError::InvalidRequest); + } + NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, flags, } => { - // sanity check that this is modifying the correct relish + // sanity check that this is modifying the correct relation + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM), - "ClearVisibilityMapFlags record on unexpected rel {:?}", + rel.forknum == VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", rel ); if let Some(heap_blkno) = *new_heap_blkno { @@ -417,7 +381,15 @@ impl PostgresRedoManager { } // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. - ZenithWalRecord::ClogSetCommitted { xids } => { + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -425,12 +397,17 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. 
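+                    // (Worked example, assuming the standard 8 kB block size:
+                    // CLOG_XACTS_PER_PAGE = 32768 and SLRU_PAGES_PER_SEGMENT = 32,
+                    // so xid 100_000 maps to pageno 3, which is segno 0, blknum 3.)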
assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status( xid, @@ -438,8 +415,31 @@ impl PostgresRedoManager { page, ); } + + // Append the timestamp + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); + } + if page.len() == BLCKSZ as usize { + page.extend_from_slice(×tamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } } - ZenithWalRecord::ClogSetAborted { xids } => { + NeonWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -447,17 +447,30 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } - ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); // Compute the block and offset to modify. // See RecordNewMultiXact in PostgreSQL sources. 
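+                // (Worked example, assuming the standard 8 kB block size:
+                // MULTIXACT_OFFSETS_PER_PAGE = 2048 four-byte entries, so
+                // mid = 5000 lands on pageno 2, entry 5000 % 2048 = 904, i.e.
+                // byte offset 904 * 4 = 3616 within that offsets-SLRU page.)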
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; @@ -468,16 +481,29 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno), - "MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", mid, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key ); - assert!(blknum == expected_blknum); LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } - ZenithWalRecord::MultixactMembersCreate { moff, members } => { + NeonWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); for (i, member) in members.iter().enumerate() { let offset = moff + i as u32; @@ -492,12 +518,17 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno), - "MultiXactMembersCreate record at offset {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", moff, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key ); - assert!(blknum == expected_blknum); let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); @@ -512,11 +543,46 @@ impl PostgresRedoManager { } } +/// +/// Command with ability not to give all file descriptors to child process +/// +trait CloseFileDescriptors: CommandExt { + /// + /// Close file descriptors (other than stdin, stdout, stderr) in child process + /// + fn close_fds(&mut self) -> &mut Command; +} + +impl CloseFileDescriptors for C { + fn close_fds(&mut self) -> &mut Command { + unsafe { + self.pre_exec(move || { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. + close_fds::set_fds_cloexec_threadsafe(3, &[]); + Ok(()) + }) + } + } +} + /// /// Handle to the Postgres WAL redo process /// struct PostgresRedoProcess { - child: Child, + tenant_id: TenantId, + child: NoLeakChild, stdin: ChildStdin, stdout: ChildStdout, stderr: ChildStderr, @@ -526,72 +592,125 @@ impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. 
// - fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result { + #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + fn launch( + conf: &PageServerConf, + tenant_id: TenantId, + pg_version: u32, + ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. - let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir"); + let datadir = path_with_suffix_extension( + conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + TEMP_FILE_SUFFIX, + ); // Create empty data directory for wal-redo postgres, deleting old one first. if datadir.exists() { - info!("directory {:?} exists, removing", &datadir); - if let Err(e) = fs::remove_dir_all(&datadir) { - error!("could not remove old wal-redo-datadir: {:#}", e); - } + info!( + "old temporary datadir {} exists, removing", + datadir.display() + ); + fs::remove_dir_all(&datadir)?; } - info!("running initdb in {:?}", datadir.display()); - let initdb = Command::new(conf.pg_bin_dir().join("initdb")) - .args(&["-D", datadir.to_str().unwrap()]) + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| { + Error::new( + ErrorKind::Other, + format!("incorrect pg_bin_dir path: {}", e), + ) + })?; + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| { + Error::new( + ErrorKind::Other, + format!("incorrect pg_lib_dir path: {}", e), + ) + })?; + + info!("running initdb in {}", datadir.display()); + let initdb = Command::new(pg_bin_dir_path.join("initdb")) + .args(&["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS + .close_fds() .output() - .expect("failed to execute initdb"); + .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?; if !initdb.status.success() { - panic!( - "initdb failed: {}\nstderr:\n{}", - std::str::from_utf8(&initdb.stdout).unwrap(), - std::str::from_utf8(&initdb.stderr).unwrap() - ); + return Err(Error::new( + ErrorKind::Other, + format!( + "initdb failed\nstdout: {}\nstderr:\n{}", + String::from_utf8_lossy(&initdb.stdout), + String::from_utf8_lossy(&initdb.stderr) + ), + )); } else { - // Limit shared cache for wal-redo-postres + // Limit shared cache for wal-redo-postgres let mut config = OpenOptions::new() .append(true) .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write_all(b"shared_buffers=128kB\n")?; config.write_all(b"fsync=off\n")?; - config.write_all(b"shared_preload_libraries=zenith\n")?; - config.write_all(b"zenith.wal_redo=on\n")?; } + // Start postgres itself - let mut child = Command::new(conf.pg_bin_dir().join("postgres")) + let child = Command::new(pg_bin_dir_path.join("postgres")) .arg("--wal-redo") .stdin(Stdio::piped()) .stderr(Stdio::piped()) .stdout(Stdio::piped()) .env_clear() - .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) - .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap()) + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) .env("PGDATA", &datadir) - .spawn() - .expect("postgres --wal-redo command failed to start"); + // The redo process is not trusted, and runs in seccomp mode that + // doesn't allow it to open any files. 
We have to also make sure it + // doesn't inherit any file descriptors from the pageserver, that + // would allow an attacker to read any files that happen to be open + // in the pageserver. + // + // The Rust standard library makes sure to mark any file descriptors with + // as close-on-exec by default, but that's not enough, since we use + // libraries that directly call libc open without setting that flag. + .close_fds() + .spawn_no_leak_child() + .map_err(|e| { + Error::new( + e.kind(), + format!("postgres --wal-redo command failed to start: {}", e), + ) + })?; - info!( - "launched WAL redo postgres process on {:?}", - datadir.display() - ); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(); + }); let stdin = child.stdin.take().unwrap(); let stdout = child.stdout.take().unwrap(); let stderr = child.stderr.take().unwrap(); - set_nonblock(stdin.as_raw_fd())?; - set_nonblock(stdout.as_raw_fd())?; - set_nonblock(stderr.as_raw_fd())?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + set_nonblock_or_log_err!(stderr)?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); Ok(PostgresRedoProcess { + tenant_id, child, stdin, stdout, @@ -599,23 +718,21 @@ impl PostgresRedoProcess { }) } - fn kill(mut self) { - let _ = self.child.kill(); - if let Ok(exit_status) = self.child.wait() { - error!("wal-redo-postgres exited with code {}", exit_status); - } - drop(self); + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + fn kill(self) { + self.child.kill_and_wait(); } // // Apply given WAL records ('records') over an old page image. Returns // new page image. // + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] fn apply_wal_records( &mut self, tag: BufferTag, base_img: Option, - records: &[(Lsn, ZenithWalRecord)], + records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, ) -> Result { // Serialize all the messages to send the WAL redo process first. @@ -623,20 +740,27 @@ impl PostgresRedoProcess { // This could be problematic if there are millions of records to replay, // but in practice the number of records is usually so small that it doesn't // matter, and it's better to keep this code simple. - let mut writebuf: Vec = Vec::new(); + // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. 
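+        // (The buffer is then filled, in order, with a "begin redo" message, an
+        // optional "push page" message carrying the BLCKSZ-byte base image, one
+        // "apply record" message per WAL record, and a final "get page" request.)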
+ let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); build_begin_redo_for_block_msg(tag, &mut writebuf); if let Some(img) = base_img { build_push_page_msg(tag, &img, &mut writebuf); } for (lsn, rec) in records.iter() { - if let ZenithWalRecord::Postgres { + if let NeonWalRecord::Postgres { will_init: _, rec: postgres_rec, } = rec { build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); } else { - panic!("tried to pass zenith wal record to postgres WAL redo"); + return Err(Error::new( + ErrorKind::Other, + "tried to pass neon wal record to postgres WAL redo", + )); } } build_get_page_msg(tag, &mut writebuf); @@ -650,7 +774,7 @@ impl PostgresRedoProcess { // We expect the WAL redo process to respond with an 8k page image. We read it // into this buffer. - let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()]; + let mut resultbuf = vec![0; BLCKSZ.into()]; let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far // Prepare for calling poll() @@ -663,11 +787,16 @@ impl PostgresRedoProcess { // We do three things simultaneously: send the old base image and WAL records to // the child process's stdin, read the result from child's stdout, and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < pg_constants::BLCKSZ.into() { + while nresult < BLCKSZ.into() { // If we have more data to write, wake up if 'stdin' becomes writeable or // we have data to read. Otherwise only wake up if there's data to read. let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; - let n = nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32)?; + let n = loop { + match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; if n == 0 { return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); @@ -728,8 +857,101 @@ impl PostgresRedoProcess { } } +/// Wrapper type around `std::process::Child` which guarantees that the child +/// will be killed and waited-for by this process before being dropped. +struct NoLeakChild { + child: Option, +} + +impl Deref for NoLeakChild { + type Target = Child; + + fn deref(&self) -> &Self::Target { + self.child.as_ref().expect("must not use from drop") + } +} + +impl DerefMut for NoLeakChild { + fn deref_mut(&mut self) -> &mut Self::Target { + self.child.as_mut().expect("must not use from drop") + } +} + +impl NoLeakChild { + fn spawn(command: &mut Command) -> io::Result { + let child = command.spawn()?; + Ok(NoLeakChild { child: Some(child) }) + } + + fn kill_and_wait(mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + Self::kill_and_wait_impl(child); + } + + #[instrument(skip_all, fields(pid=child.id()))] + fn kill_and_wait_impl(mut child: Child) { + let res = child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. 
+ // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). + error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); + } + + match child.wait() { + Ok(exit_status) => { + // log at error level since .kill() is something we only do on errors ATM + error!(exit_status = %exit_status, "wait successful"); + } + Err(e) => { + error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); + } + } + } +} + +impl Drop for NoLeakChild { + fn drop(&mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + // Offload the kill+wait of the child process into the background. + // If someone stops the runtime, we'll leak the child process. + // We can ignore that case because we only stop the runtime on pageserver exit. + BACKGROUND_RUNTIME.spawn(async move { + tokio::task::spawn_blocking(move || { + Self::kill_and_wait_impl(child); + }) + .await + }); + } +} + +trait NoLeakChildCommandExt { + fn spawn_no_leak_child(&mut self) -> io::Result; +} + +impl NoLeakChildCommandExt for Command { + fn spawn_no_leak_child(&mut self) -> io::Result { + NoLeakChild::spawn(self) + } +} + // Functions for constructing messages to send to the postgres WAL redo -// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for +// process. See pgxn/neon_walredo/walredoproc.c for // explanation of the protocol. fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile new file mode 100644 index 0000000000..7f4e30a12e --- /dev/null +++ b/pgxn/neon/Makefile @@ -0,0 +1,25 @@ +# pgxs/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + libpagestore.o \ + libpqwalproposer.o \ + pagestore_smgr.o \ + relsize_cache.o \ + neon.o \ + walproposer.o \ + walproposer_utils.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c new file mode 100644 index 0000000000..d8e9d8b52c --- /dev/null +++ b/pgxn/neon/libpagestore.c @@ -0,0 +1,492 @@ +/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +/* + * WaitEventSet containing: + * - WL_SOCKET_READABLE on pageserver_conn, + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ +WaitEventSet *pageserver_conn_wes = NULL; + +char *page_server_connstring_raw; + +int n_unflushed_requests = 0; +int flush_every_n_requests = 8; +int readahead_buffer_size = 128; + +static void pageserver_flush(void); + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL); + + while (PQisBusy(pageserver_conn)) + { + int wc; + WaitEvent event; + + /* Sleep until there's something to do */ + wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (event.events & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + FreeWaitEventSet(pageserver_conn_wes); + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + WaitEvent event; + + /* Sleep until there's something to do */ + wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (event.events & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(pageserver_conn)); + } + + goto retry; + } + + return ret; +} + + +static void +pageserver_disconnect(void) +{ + /* + * If anything goes wrong while we were sending a request, it's not clear + * what state the connection is in. For example, if we sent the request + * but didn't receive a response yet, we might receive the response some + * time later after we have already sent a new unrelated request. Close + * the connection to avoid getting confused. 
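+	 * (For instance, a GetPage response that arrives for a request we have
+	 * already given up on could otherwise be mistaken for the reply to the
+	 * next, unrelated request.)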
+ */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + + prefetch_on_ps_disconnect(); + } + if (pageserver_conn_wes != NULL) + FreeWaitEventSet(pageserver_conn_wes); +} + +static void +pageserver_send(NeonRequest * request) +{ + StringInfoData req_buff; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + pageserver_disconnect(); + + if (!connected) + pageserver_connect(); + + req_buff = nm_pack_request(request); + + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) + { + char *msg = PQerrorMessage(pageserver_conn); + + pageserver_disconnect(); + neon_log(ERROR, "failed to send page request: %s", msg); + } + pfree(req_buff.data); + + n_unflushed_requests++; + + if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests) + pageserver_flush(); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } +} + +static NeonResponse * +pageserver_receive(void) +{ + StringInfoData resp_buff; + NeonResponse *resp; + + PG_TRY(); + { + /* read response */ + resp_buff.len = call_PQgetCopyData(&resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len < 0) + { + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + } + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + pageserver_disconnect(); + PG_RE_THROW(); + } + PG_END_TRY(); + + return (NeonResponse *) resp; +} + + +static void +pageserver_flush(void) +{ + if (!connected) + { + neon_log(WARNING, "Tried to flush while disconnected"); + } + else if (PQflush(pageserver_conn)) + { + char *msg = PQerrorMessage(pageserver_conn); + + pageserver_disconnect(); + neon_log(ERROR, "failed to flush page requests: %s", msg); + } + n_unflushed_requests = 0; +} + +page_server_api api = { + .send = pageserver_send, + .flush = pageserver_flush, + .receive = pageserver_receive +}; + +static bool +check_neon_id(char **newval, void **extra, GucSource source) +{ + uint8 id[16]; + + return **newval == '\0' || HexDecodeString(id, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. 
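+	 *
+	 * For example (hypothetical values): a raw string like
+	 *   host=ps.local port=6400 user=zenith password=$NEON_AUTH_TOKEN
+	 * is rebuilt as postgresql://zenith:<token>@ps.local:6400, with the token
+	 * read from the NEON_AUTH_TOKEN environment variable named by the
+	 * placeholder.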
+ */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. + */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + neon_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + + /* + * allocate connection string in TopMemoryContext to make sure it is not + * freed + */ + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + +/* + * Module initialization function + */ +void +pg_init_libpagestore(void) +{ + DefineCustomStringVariable("neon.pageserver_connstring", + "connection string to the page server", + NULL, + &page_server_connstring_raw, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("neon.timeline_id", + "Neon timeline_id the server is running on", + NULL, + &neon_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + + DefineCustomStringVariable("neon.tenant_id", + "Neon tenant_id the server is running on", + NULL, + &neon_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + + DefineCustomIntVariable("neon.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, NULL, NULL); + DefineCustomIntVariable("neon.flush_output_after", + "Flush the output buffer after every N unflushed requests", + NULL, + &flush_every_n_requests, + 8, -1, INT_MAX, + PGC_USERSET, + 0, /* no flags required */ + NULL, NULL, NULL); + DefineCustomIntVariable("neon.readahead_buffer_size", + "number of prefetches to buffer", + "This buffer is used to store prefetched data; so " + "it is important that this buffer is at least as " + "large as the configured value of all tablespaces' " + "effective_io_concurrency and maintenance_io_concurrency, " + "your sessions' values of these, and the value for " + "seqscan_prefetch_buffers.", + &readahead_buffer_size, + 128, 16, 1024, + PGC_USERSET, + 0, /* no flags required */ + NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + + relsize_hash_init(); + + if (page_server != NULL) + neon_log(ERROR, "libpagestore already loaded"); + + neon_log(PageStoreTrace, "libpagestore already loaded"); + page_server = &api; + + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + + /* Is there more correct way to pass CustomGUC to postgres code? 
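+ *
+ * For reference, the GUCs defined above are normally set in postgresql.conf;
+ * the values below are purely illustrative:
+ *
+ *   neon.pageserver_connstring = 'host=... port=6400 password=$NEON_AUTH_TOKEN'
+ *   neon.tenant_id = '<32 hex characters>'
+ *   neon.timeline_id = '<32 hex characters>'
+ *   neon.max_cluster_size = 10240       # in MB; -1 disables the limit
+ *   neon.flush_output_after = 8
+ *   neon.readahead_buffer_size = 128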
*/ + neon_timeline_walproposer = neon_timeline; + neon_tenant_walproposer = neon_tenant; + + if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_neon; + smgr_init_hook = smgr_init_neon; + dbsize_hook = neon_dbsize; + } +} diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c new file mode 100644 index 0000000000..6b1e6a8bcc --- /dev/null +++ b/pgxn/neon/libpqwalproposer.c @@ -0,0 +1,397 @@ +#include "postgres.h" + +#include "libpq-fe.h" +#include "neon.h" +#include "walproposer.h" + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from + * walprop_async_read */ +}; + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +char * +walprop_error_message(WalProposerConn *conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +WalProposerConnStatusType +walprop_status(WalProposerConn *conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +WalProposerConn * +walprop_connect_start(char *conninfo) +{ + WalProposerConn *conn; + PGconn *pg_conn; + + pg_conn = PQconnectStart(conninfo); + + /* + * Allocation of a PQconn can fail, and will return NULL. We want to fully + * replicate the behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly + * unlikely if we just successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if + * it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} + +WalProposerConnectPollStatusType +walprop_connect_poll(WalProposerConn *conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* + * There's a comment at its source about this constant being + * unused. We'll expect it's never returned. 
+ */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + + /* + * This return is never actually reached, but it's here to make + * the compiler happy + */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +bool +walprop_send_query(WalProposerConn *conn, char *query) +{ + /* + * We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush + */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +WalProposerExecStatusType +walprop_get_query_result(WalProposerConn *conn) +{ + PGresult *result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char *unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + + /* + * PQgetResult returns NULL only if getting the result was successful & + * there's no more of the result to get. + */ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ +#define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +pgsocket +walprop_socket(WalProposerConn *conn) +{ + return PQsocket(conn->pg_conn); +} + +int +walprop_flush(WalProposerConn *conn) +{ + return (PQflush(conn->pg_conn)); +} + +void +walprop_finish(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. 
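+ *
+ * A sketch of a typical caller (illustrative only; process_message() and
+ * wait_for_socket() are hypothetical placeholders, not real functions):
+ *
+ *   char *buf;
+ *   int   len;
+ *
+ *   switch (walprop_async_read(conn, &buf, &len))
+ *   {
+ *       case PG_ASYNC_READ_SUCCESS:
+ *           process_message(buf, len);
+ *           break;
+ *       case PG_ASYNC_READ_TRY_AGAIN:
+ *           wait_for_socket(walprop_socket(conn));
+ *           break;
+ *       case PG_ASYNC_READ_FAIL:
+ *           elog(WARNING, "%s", walprop_error_message(conn));
+ *           break;
+ *   }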
+ */ +PGAsyncReadResult +walprop_async_read(WalProposerConn *conn, char **buf, int *amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* + * The docs for PQgetCopyData list the return values as: 0 if the copy is + * still in progress, but no "complete row" is available -1 if the copy is + * done -2 if an error occured (> 0) if it was successful; that value is + * the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. + * + * We can check PQgetResult to make sure that the server + * failed; it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* + * If there was actually an error, it'll be properly reported + * by calls to PQerrorMessage -- we don't have to do anything + * else + */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + case -2: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + *buf = conn->recvbuf; + return PG_ASYNC_READ_SUCCESS; + } +} + +PGAsyncWriteResult +walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) +{ + int result; + + /* If we aren't in non-blocking mode, switch to it. */ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + + /* + * The docs for PQputcopyData list the return values as: 1 if the data was + * queued, 0 if it was not queued because of full buffers, or -1 if an + * error occured + */ + result = PQputCopyData(conn->pg_conn, buf, size); + + /* + * We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more + */ + Assert(result != 0); + + switch (result) + { + case 1: + /* good -- continue */ + break; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* + * After queueing the data, we still need to flush to get it to send. This + * might take multiple tries, but we don't want to wait around until it's + * done. + * + * PQflush has the following returns (directly quoting the docs): 0 if + * sucessful, 1 if it was unable to send all the data in the send queue + * yet -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) + { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} + +/* + * This function is very similar to walprop_async_write. For more + * information, refer to the comments there. 
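+ *
+ * The main difference is that the connection is switched to blocking mode
+ * first, so PQputCopyData cannot return 0 (there is no "buffers full" case)
+ * and PQflush blocks until the queued data is sent, returning 0 on success
+ * or -1 on failure; there is no "try again later" state to report.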
+ */ +bool +walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is non-blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql new file mode 100644 index 0000000000..58b98a5923 --- /dev/null +++ b/pgxn/neon/neon--1.0.sql @@ -0,0 +1,24 @@ +\echo Use "CREATE EXTENSION neon" to load this file. \quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_throttling_time() +RETURNS bigint +AS 'MODULE_PATHNAME', 'backpressure_throttling_time' +LANGUAGE C STRICT +PARALLEL UNSAFE; + diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c new file mode 100644 index 0000000000..5c98902554 --- /dev/null +++ b/pgxn/neon/neon.c @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" +#include "utils/guc.h" + +#include "neon.h" +#include "walproposer.h" + +PG_MODULE_MAGIC; +void _PG_init(void); + +void +_PG_init(void) +{ + pg_init_libpagestore(); + pg_init_walproposer(); + + EmitWarningsOnPlaceholders("neon"); +} + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); +PG_FUNCTION_INFO_V1(backpressure_throttling_time); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +Datum +backpressure_throttling_time(PG_FUNCTION_ARGS) +{ + PG_RETURN_UINT64(BackpressureThrottlingTime()); +} diff --git a/pgxn/neon/neon.control 
b/pgxn/neon/neon.control new file mode 100644 index 0000000000..84f79881c1 --- /dev/null +++ b/pgxn/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h new file mode 100644 index 0000000000..6b9ba372fb --- /dev/null +++ b/pgxn/neon/neon.h @@ -0,0 +1,18 @@ +/*------------------------------------------------------------------------- + * + * neon.h + * Functions used in the initialization of this extension. + * + * IDENTIFICATION + * contrib/neon/neon.h + * + *------------------------------------------------------------------------- + */ + +#ifndef NEON_H +#define NEON_H + +extern void pg_init_libpagestore(void); +extern void pg_init_walproposer(void); + +#endif /* NEON_H */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h new file mode 100644 index 0000000000..9b8081065c --- /dev/null +++ b/pgxn/neon/pagestore_client.h @@ -0,0 +1,201 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_NeonExistsRequest = 0, + T_NeonNblocksRequest, + T_NeonGetPageRequest, + T_NeonDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_NeonExistsResponse = 100, + T_NeonNblocksResponse, + T_NeonGetPageResponse, + T_NeonErrorResponse, + T_NeonDbSizeResponse, +} NeonMessageTag; + +/* base struct for c-style inheritance */ +typedef struct +{ + NeonMessageTag tag; +} NeonMessage; + +#define messageTag(m) (((const NeonMessage *)(m))->tag) + +/* + * supertype of all the Neon*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
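+ *
+ * As an example of how these fields are filled in (see neon_get_request_lsn()
+ * in pagestore_smgr.c): a regular backend read uses latest = true with 'lsn'
+ * set to the page's last-written LSN as a lower-bound hint, while a read
+ * performed during recovery uses latest = false with the current replay LSN.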
+ */ +typedef struct +{ + NeonMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} NeonRequest; + +typedef struct +{ + NeonRequest req; + RelFileNode rnode; + ForkNumber forknum; +} NeonExistsRequest; + +typedef struct +{ + NeonRequest req; + RelFileNode rnode; + ForkNumber forknum; +} NeonNblocksRequest; + +typedef struct +{ + NeonRequest req; + Oid dbNode; +} NeonDbSizeRequest; + +typedef struct +{ + NeonRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} NeonGetPageRequest; + +/* supertype of all the Neon*Response structs below */ +typedef struct +{ + NeonMessageTag tag; +} NeonResponse; + +typedef struct +{ + NeonMessageTag tag; + bool exists; +} NeonExistsResponse; + +typedef struct +{ + NeonMessageTag tag; + uint32 n_blocks; +} NeonNblocksResponse; + +typedef struct +{ + NeonMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} NeonGetPageResponse; + +#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ)) + +typedef struct +{ + NeonMessageTag tag; + int64 db_size; +} NeonDbSizeResponse; + +typedef struct +{ + NeonMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error + * message */ +} NeonErrorResponse; + +extern StringInfoData nm_pack_request(NeonRequest * msg); +extern NeonResponse * nm_unpack_response(StringInfo s); +extern char *nm_to_string(NeonMessage * msg); + +/* + * API + */ + +typedef struct +{ + void (*send) (NeonRequest * request); + NeonResponse *(*receive) (void); + void (*flush) (void); +} page_server_api; + +extern void prefetch_on_ps_disconnect(void); + +extern page_server_api * page_server; + +extern char *page_server_connstring; +extern int flush_every_n_requests; +extern int readahead_buffer_size; +extern bool seqscan_prefetch_enabled; +extern int seqscan_prefetch_distance; +extern char *neon_timeline; +extern char *neon_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode); +extern void smgr_init_neon(void); +extern void readahead_buffer_resize(int newsize, void *extra); + +/* Neon storage manager functionality */ + +extern void neon_init(void); +extern void neon_open(SMgrRelation reln); +extern void neon_close(SMgrRelation reln, ForkNumber forknum); +extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool neon_exists(SMgrRelation reln, ForkNumber forknum); +extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void neon_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void neon_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 neon_dbsize(Oid dbNode); +extern void neon_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void neon_immedsync(SMgrRelation reln, ForkNumber 
forknum); + +/* utils for neon relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c new file mode 100644 index 0000000000..d6fa7c46c9 --- /dev/null +++ b/pgxn/neon/pagestore_smgr.c @@ -0,0 +1,2456 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "catalog/pg_class.h" +#include "common/hashfn.h" +#include "pagestore_client.h" +#include "postmaster/interrupt.h" +#include "postmaster/autovacuum.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" +#include "storage/smgr.h" +#include "storage/md.h" +#include "pgstat.h" + + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. 
On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; + +/*with substituted password*/ +char *neon_timeline; +char *neon_tenant; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +/* + * Prefetch implementation: + * + * Prefetch is performed locally by each backend. + * + * There can be up to readahead_buffer_size active IO requests registered at + * any time. Requests using smgr_prefetch are sent to the pageserver, but we + * don't wait on the response. Requests using smgr_read are either read from + * the buffer, or (if that's not possible) we wait on the response to arrive - + * this also will allow us to receive other prefetched pages. + * Each request is immediately written to the output buffer of the pageserver + * connection, but may not be flushed if smgr_prefetch is used: pageserver + * flushes sent requests on manual flush, or every neon.flush_output_after + * unflushed requests; which is not necessarily always and all the time. + * + * Once we have received a response, this value will be stored in the response + * buffer, indexed in a hash table. This allows us to retain our buffered + * prefetch responses even when we have cache misses. + * + * Reading of prefetch responses is delayed until them are actually needed + * (smgr_read). In case of prefetch miss or any other SMGR request other than + * smgr_read, all prefetch responses in the pipeline will need to be read from + * the connection; the responses are stored for later use. + * + * NOTE: The current implementation of the prefetch system implements a ring + * buffer of up to readahead_buffer_size requests. If there are more _read and + * _prefetch requests between the initial _prefetch and the _read of a buffer, + * the prefetch request will have been dropped from this prefetch buffer, and + * your prefetch was wasted. + */ + +/* + * State machine: + * + * not in hash : in hash + * : + * UNUSED ------> REQUESTED --> RECEIVED + * ^ : | | + * | : v | + * | : TAG_UNUSED | + * | : | | + * +----------------+------------+ + * : + */ +typedef enum PrefetchStatus { + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not + * necessarily flushed. 
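+ * (prefetch_wait_for() flushes the send buffer before it starts waiting,
+ * so the response for a REQUESTED slot can always eventually be read.)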
+ * all fields except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */ +} PrefetchStatus; + +typedef struct PrefetchRequest { + BufferTag buftag; /* must be first entry in the struct */ + XLogRecPtr effective_request_lsn; + NeonResponse *response; /* may be null */ + PrefetchStatus status; + uint64 my_ring_index; +} PrefetchRequest; + +/* prefetch buffer lookup hash table */ + +typedef struct PrfHashEntry { + PrefetchRequest *slot; + uint32 status; + uint32 hash; +} PrfHashEntry; + +#define SH_PREFIX prfh +#define SH_ELEMENT_TYPE PrfHashEntry +#define SH_KEY_TYPE PrefetchRequest * +#define SH_KEY slot +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->hash) +#define SH_HASH_KEY(tb, key) hash_bytes( \ + ((const unsigned char *) &(key)->buftag), \ + sizeof(BufferTag) \ +) + +#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * PrefetchState maintains the state of (prefetch) getPage@LSN requests. + * It maintains a (ring) buffer of in-flight requests and responses. + * + * We maintain several indexes into the ring buffer: + * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 + * + * ring_unused points to the first unused slot of the buffer + * ring_receive is the next request that is to be received + * ring_last is the oldest received entry in the buffer + * + * Apart from being an entry in the ring buffer of prefetch requests, each + * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. + */ +typedef struct PrefetchState { + MemoryContext bufctx; /* context for prf_buffer[].response allocations */ + MemoryContext errctx; /* context for prf_buffer[].response allocations */ + MemoryContext hashctx; /* context for prf_buffer */ + + /* buffer indexes */ + uint64 ring_unused; /* first unused slot */ + uint64 ring_flush; /* next request to flush */ + uint64 ring_receive; /* next slot that is to receive a response */ + uint64 ring_last; /* min slot with a response value */ + + /* metrics / statistics */ + int n_responses_buffered; /* count of PS responses not yet in buffers */ + int n_requests_inflight; /* count of PS requests considered in flight */ + int n_unused; /* count of buffers < unused, > last, that are also unused */ + + /* the buffers */ + prfh_hash *prf_hash; + PrefetchRequest prf_buffer[]; /* prefetch buffers */ +} PrefetchState; + +PrefetchState *MyPState; + +#define GetPrfSlot(ring_index) ( \ + ( \ + AssertMacro((ring_index) < MyPState->ring_unused && \ + (ring_index) >= MyPState->ring_last), \ + &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \ + ) \ +) + +int n_prefetch_hits = 0; +int n_prefetch_misses = 0; +int n_prefetch_missed_caches = 0; +int n_prefetch_dupes = 0; + +XLogRecPtr prefetch_lsn = 0; + +static void consume_prefetch_responses(void); +static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_read(PrefetchRequest *slot); +static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_wait_for(uint64 ring_index); +static void prefetch_cleanup(void); +static inline void prefetch_set_unused(uint64 ring_index); + +static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, + ForkNumber forknum, BlockNumber blkno); + +void +readahead_buffer_resize(int newsize, void *extra) +{ + uint64 end, + 
nfree = newsize; + PrefetchState *newPState; + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + ( + sizeof(PrefetchRequest) * readahead_buffer_size + ); + + /* don't try to re-initialize if we haven't initialized yet */ + if (MyPState == NULL) + return; + + /* + * Make sure that we don't lose track of active prefetch requests by + * ensuring we have received all but the last n requests (n = newsize). + */ + if (MyPState->n_requests_inflight > newsize) + prefetch_wait_for(MyPState->ring_unused - newsize); + + /* construct the new PrefetchState, and copy over the memory contexts */ + newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); + + newPState->bufctx = MyPState->bufctx; + newPState->errctx = MyPState->errctx; + newPState->hashctx = MyPState->hashctx; + newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); + newPState->n_unused = newsize; + newPState->n_requests_inflight = 0; + newPState->n_responses_buffered = 0; + newPState->ring_last = newsize; + newPState->ring_unused = newsize; + newPState->ring_receive = newsize; + newPState->ring_flush = newsize; + + /* + * Copy over the prefetches. + * + * We populate the prefetch array from the end; to retain the most recent + * prefetches, but this has the benefit of only needing to do one iteration + * on the dataset, and trivial compaction. + */ + for (end = MyPState->ring_unused - 1; + end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; + end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + PrefetchRequest *newslot; + bool found; + + if (slot->status == PRFS_UNUSED) + continue; + + nfree -= 1; + + newslot = &newPState->prf_buffer[nfree]; + *newslot = *slot; + newslot->my_ring_index = nfree; + + prfh_insert(newPState->prf_hash, newslot, &found); + + Assert(!found); + + switch (newslot->status) + { + case PRFS_UNUSED: + pg_unreachable(); + case PRFS_REQUESTED: + newPState->n_requests_inflight += 1; + newPState->ring_receive -= 1; + newPState->ring_last -= 1; + break; + case PRFS_RECEIVED: + newPState->n_responses_buffered += 1; + newPState->ring_last -= 1; + break; + case PRFS_TAG_REMAINS: + newPState->ring_last -= 1; + break; + } + newPState->n_unused -= 1; + } + + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) + { + prefetch_set_unused(end); + } + + prfh_destroy(MyPState->prf_hash); + pfree(MyPState); + MyPState = newPState; +} + + + +/* + * Make sure that there are no responses still in the buffer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + */ +static void +consume_prefetch_responses(void) +{ + if (MyPState->ring_receive < MyPState->ring_unused) + prefetch_wait_for(MyPState->ring_unused - 1); +} + +static void +prefetch_cleanup(void) +{ + uint64 ring_index; + PrefetchRequest *slot; + + while (MyPState->ring_last < MyPState->ring_receive) { + ring_index = MyPState->ring_last; + slot = GetPrfSlot(ring_index); + + if (slot->status == PRFS_UNUSED) + MyPState->ring_last += 1; + else + break; + } +} + +/* + * Wait for slot of ring_index to have received its response. + * The caller is responsible for making sure the request buffer is flushed. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. 
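+ *
+ * In terms of the ring indexes documented above PrefetchState: when this
+ * function returns, ring_receive > ring_index holds, i.e. the response for
+ * ring_index (and for every older slot) has been read into its slot.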
+ */ +static void +prefetch_wait_for(uint64 ring_index) +{ + PrefetchRequest *entry; + + if (MyPState->ring_flush <= ring_index && + MyPState->ring_unused > MyPState->ring_flush) + { + page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; + } + + Assert(MyPState->ring_unused > ring_index); + + while (MyPState->ring_receive <= ring_index) + { + entry = GetPrfSlot(MyPState->ring_receive); + + Assert(entry->status == PRFS_REQUESTED); + prefetch_read(entry); + } +} + +/* + * Read the response of a prefetch request into its slot. + * + * The caller is responsible for making sure that the request for this buffer + * was flushed to the PageServer. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + */ +static void +prefetch_read(PrefetchRequest *slot) +{ + NeonResponse *response; + MemoryContext old; + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = (NeonResponse *) page_server->receive(); + MemoryContextSwitchTo(old); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; +} + +/* + * Disconnect hook - drop prefetches when the connection drops + * + * If we don't remove the failed prefetches, we'd be serving incorrect + * data to the smgr. + */ +void +prefetch_on_ps_disconnect(void) +{ + MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) + { + PrefetchRequest *slot; + uint64 ring_index = MyPState->ring_receive; + + slot = GetPrfSlot(ring_index); + + Assert(slot->status == PRFS_REQUESTED); + Assert(slot->my_ring_index == ring_index); + + /* clean up the request */ + slot->status = PRFS_TAG_REMAINS; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); + } +} + +/* + * prefetch_set_unused() - clear a received prefetch slot + * + * The slot at ring_index must be a current member of the ring buffer, + * and may not be in the PRFS_REQUESTED state. + * + * NOTE: this function will update MyPState->pfs_hash; which invalidates any + * active pointers into the hash table. 
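+ *
+ * In state-machine terms this moves the slot from RECEIVED or TAG_REMAINS
+ * back to UNUSED: the buffered response (if any) is pfree'd, the entry is
+ * removed from prf_hash, and if the slot was the oldest one, ring_last is
+ * advanced via prefetch_cleanup().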
+ */ +static inline void +prefetch_set_unused(uint64 ring_index) +{ + PrefetchRequest *slot = GetPrfSlot(ring_index); + + if (ring_index < MyPState->ring_last) + return; /* Should already be unused */ + + Assert(MyPState->ring_unused > ring_index); + + if (slot->status == PRFS_UNUSED) + return; + + Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); + + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + slot->response = NULL; + + MyPState->n_responses_buffered -= 1; + MyPState->n_unused += 1; + } + else + { + Assert(slot->response == NULL); + } + + prfh_delete(MyPState->prf_hash, slot); + + /* clear all fields */ + MemSet(slot, 0, sizeof(PrefetchRequest)); + slot->status = PRFS_UNUSED; + + /* run cleanup if we're holding back ring_last */ + if (MyPState->ring_last == ring_index) + prefetch_cleanup(); +} + +static void +prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +{ + bool found; + NeonGetPageRequest request = { + .req.tag = T_NeonGetPageRequest, + .req.latest = false, + .req.lsn = 0, + .rnode = slot->buftag.rnode, + .forknum = slot->buftag.forkNum, + .blkno = slot->buftag.blockNum, + }; + + if (force_lsn && force_latest) + { + request.req.lsn = *force_lsn; + request.req.latest = *force_latest; + slot->effective_request_lsn = *force_lsn; + } + else + { + XLogRecPtr lsn = neon_get_request_lsn( + &request.req.latest, + slot->buftag.rnode, + slot->buftag.forkNum, + slot->buftag.blockNum + ); + /* + * Note: effective_request_lsn is potentially higher than the requested + * LSN, but still correct: + * + * We know there are no changes between the actual requested LSN and + * the value of effective_request_lsn: If there were, the page would + * have been in cache and evicted between those LSN values, which + * then would have had to result in a larger request LSN for this page. + * + * It is possible that a concurrent backend loads the page, modifies + * it and then evicts it again, but the LSN of that eviction cannot be + * smaller than the current WAL insert/redo pointer, which is already + * larger than this prefetch_lsn. So in any case, that would + * invalidate this cache. + * + * The best LSN to use for effective_request_lsn would be + * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. + */ + request.req.lsn = lsn; + prefetch_lsn = Max(prefetch_lsn, lsn); + slot->effective_request_lsn = prefetch_lsn; + } + + Assert(slot->response == NULL); + Assert(slot->my_ring_index == MyPState->ring_unused); + page_server->send((NeonRequest *) &request); + + /* update prefetch state */ + MyPState->n_requests_inflight += 1; + MyPState->n_unused -= 1; + MyPState->ring_unused += 1; + + /* update slot state */ + slot->status = PRFS_REQUESTED; + + prfh_insert(MyPState->prf_hash, slot, &found); + Assert(!found); +} + +/* + * prefetch_register_buffer() - register and prefetch buffer + * + * Register that we may want the contents of BufferTag in the near future. + * + * If force_latest and force_lsn are not NULL, those values are sent to the + * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure + * to fill in these values manually. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. 
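+ *
+ * Usage sketch (illustrative): a plain prefetch passes NULL for both
+ * force_latest and force_lsn and lets the last-written-LSN machinery choose
+ * the request LSN, while a read that needs a specific LSN passes both
+ * pointers; in that case a buffered entry made at an incompatible LSN is
+ * waited for, discarded and re-requested.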
+ */ + +static uint64 +prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +{ + uint64 ring_index; + PrefetchRequest req; + PrefetchRequest *slot; + PrfHashEntry *entry; + + /* use an intermediate PrefetchRequest struct to ensure correct alignment */ + req.buftag = tag; + + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req); + + if (entry != NULL) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); + + /* + * If we want a specific lsn, we do not accept requests that were made + * with a potentially different LSN. + */ + if (force_latest && force_lsn) + { + /* if we want the latest version, any effective_request_lsn < request lsn is OK */ + if (*force_latest) + { + if (*force_lsn > slot->effective_request_lsn) + { + prefetch_wait_for(ring_index); + prefetch_set_unused(ring_index); + entry = NULL; + } + } + /* if we don't want the latest version, only accept requests with the exact same LSN */ + else + { + if (*force_lsn != slot->effective_request_lsn) + { + prefetch_wait_for(ring_index); + prefetch_set_unused(ring_index); + entry = NULL; + } + } + } + + /* + * We received a prefetch for a page that was recently read and + * removed from the buffers. Remove that request from the buffers. + */ + else if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + } + else + { + /* The buffered request is good enough, return that index */ + n_prefetch_dupes++; + return ring_index; + } + } + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page unnecessarily + * in that case. If the oldest slot holds a request that we haven't + * received a response for yet, we have to wait for the response to that + * before we can continue. We might not have even flushed the request to + * the pageserver yet, it might be just sitting in the output buffer. In + * that case, we flush it and wait for the response. (We could decide not + * to send it, but it's hard to abort when the request is already in the + * output buffer, and 'not sending' a prefetch request kind of goes + * against the principles of prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* We have the slot for ring_last, so that must still be in progress */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + prefetch_wait_for(cleanup_index); + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } + } + + /* + * The next buffer pointed to by `ring_unused` is now definitely empty, + * so we can insert the new request to it. + */ + ring_index = MyPState->ring_unused; + slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)]; + + Assert(MyPState->ring_last <= ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. 
+ */ + slot->buftag = tag; + slot->my_ring_index = ring_index; + + prefetch_do_request(slot, force_latest, force_lsn); + Assert(slot->status == PRFS_REQUESTED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + + if (flush_every_n_requests > 0 && + MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) + { + page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; + } + + return ring_index; +} + +static NeonResponse * +page_server_request(void const *req) +{ + page_server->send((NeonRequest *) req); + page_server->flush(); + MyPState->ring_flush = MyPState->ring_unused; + consume_prefetch_responses(); + return page_server->receive(); +} + + +StringInfoData +nm_pack_request(NeonRequest * msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. 
*/ + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: + default: + elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + break; + } + return s; +} + +NeonResponse * +nm_unpack_response(StringInfo s) +{ + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetPageResponse: + { + NeonGetPageResponse *msg_resp; + + msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + Assert(msg_resp->tag == T_NeonGetPageResponse); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: + default: + elog(ERROR, "unexpected neon message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +nm_to_string(NeonMessage * msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetPageResponse: + { +#if 0 + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; + + /* FIXME: escape 
double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + +/* + * Is 'buffer' identical to a freshly initialized empty heap page? + */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} + +static void +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + if (ShutdownRequestPending) + return; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. We should never miss clearing visibility map + * bits. + * + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. + */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an + * all-zeros pages, and we can just ignore that in Neon. We do need to + * remember the new size, though, so that smgrnblocks() returns the + * right answer after the rel has been extended. We rely on the + * relsize cache for that. 
+ * + * A completely empty heap page doesn't need to be WAL-logged, either. + * The heapam can leave such a page behind, if e.g. an insert errors + * out after initializing the page, but before it has inserted the + * tuple and WAL-logged the change. When we read the page from the + * page server, it will come back as all-zeros. That's OK, the heapam + * will initialize an all-zeros page on first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad + * sign: it implies that the page was not WAL-logged, and its contents + * will be lost when it's evicted. + */ + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + } + else + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum); +} + +/* + * neon_init() -- Initialize private state + */ +void +neon_init(void) +{ + Size prfs_size; + + if (MyPState != NULL) + return; + + prfs_size = offsetof(PrefetchState, prf_buffer) + ( + sizeof(PrefetchRequest) * readahead_buffer_size + ); + + MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); + + MyPState->n_unused = readahead_buffer_size; + + MyPState->bufctx = SlabContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + SLAB_DEFAULT_BLOCK_SIZE * 17, + PS_GETPAGERESPONSE_SIZE); + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/errors", + ALLOCSET_DEFAULT_SIZES); + MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + ALLOCSET_DEFAULT_SIZES); + + MyPState->prf_hash = prfh_create(MyPState->hashctx, + readahead_buffer_size, NULL); + +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. 
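+ *
+ * Worked example (assuming the default XLOG_BLCKSZ of 8192 and a
+ * SizeOfXLogShortPHD of 24 bytes, as on a typical 64-bit build): an insert
+ * position reported as 0/1234018 is 24 bytes into a WAL page holding no
+ * record data yet, so nm_adjust_lsn() maps it back to 0/1234000, which is
+ * safe to pass to XLogFlush().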
+ */ +static XLogRecPtr +nm_adjust_lsn(XLogRecPtr lsn) +{ + /* + * If lsn points to the beging of first record on page or segment, then + * "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +{ + XLogRecPtr lsn; + + if (RecoveryInProgress()) + { + *latest = false; + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else if (am_walsender) + { + *latest = true; + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 "); + } + else + { + XLogRecPtr flushlsn; + + /* + * Use the latest LSN that was evicted from the buffer cache. Any + * pages modified by later WAL records must still in the buffer cache, + * so our request cannot concern those. + */ + *latest = true; + lsn = GetLastWrittenLSN(rnode, forknum, blkno); + Assert(lsn != InvalidXLogRecPtr); + elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + lsn = nm_adjust_lsn(lsn); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. However, such case does exist at index + * building, _bt_blwritepage logs the full page without flushing WAL + * before smgrextend (files are fsynced before build ends). + */ +#if PG_VERSION_NUM >= 150000 + flushlsn = GetFlushRecPtr(NULL); +#else + flushlsn = GetFlushRecPtr(); +#endif + if (lsn > flushlsn) + { + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + + return lsn; +} + +/* + * neon_exists() -- Does the physical file exist? + */ +bool +neon_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool exists; + NeonResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + + /* + * We don't know if it's an unlogged rel stored locally, or + * permanent rel stored in the page server. First check if it + * exists locally. If it does, great. Otherwise check if it exists + * in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server + * will error out if you check that, because the whole dbdir for + * tablespace 0, db 0 doesn't exists. We possibly should change the page + * server to accept that and return 'false', to be consistent with + * mdexists(). But we probably also should fix pg_table_size() to not call + * smgrexists() with bogus relfilenode. + * + * For now, handle that special case here. 
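+	 *
+	 * To illustrate the motivating case: "\d+ someview" in psql runs
+	 * pg_table_size(), which reaches this function roughly as
+	 *
+	 *    smgrexists(reln, forkNum)    with rnode == {0, 0, 0}
+	 *
+	 * and the check below simply answers false for it.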
+ */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO); + { + NeonExistsRequest request = { + .req.tag = T_NeonExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forkNum}; + + resp = page_server_request(&request); + } + + switch (resp->tag) + { + case T_NeonExistsResponse: + exists = ((NeonExistsResponse *) resp)->exists; + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + return exists; +} + +/* + * neon_create() -- Create a new relation on neond storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + + /* + * Newly created relation is empty, remember that in the relsize cache. + * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenLSN() when a new + * relation created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record hass been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * neon_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ + /* + * Might or might not exist locally, depending on whether it's an unlogged + * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). 
Try to + * unlink, it won't do any harm if the file doesn't exist. + */ + mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) + { + forget_cached_relsize(rnode.node, forkNum); + } +} + +/* + * neon_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* + * Check that the cluster size limit has not been exceeded. + * + * Temporary and unlogged relations are not included in the cluster size + * measured by the page server, so ignore those. Autovacuum processes are + * also exempt. + */ + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) + { + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); + } + + neon_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, blkno, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); +#endif + /* + * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr. + * An smgr_write() call will come for the buffer later, after it has been initialized + * with the real page contents, and it is eventually evicted from the buffer cache. + * But we need a valid LSN to the relation metadata update now. + */ + if (lsn == InvalidXLogRecPtr) + { + lsn = GetXLogInsertRecPtr(); + SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno); + } + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum); +} + +/* + * neon_open() -- Initialize newly-opened relation. + */ +void +neon_open(SMgrRelation reln) +{ + /* + * We don't have anything special to do here. Call mdopen() to let md.c + * initialize itself. That's only needed for temporary or unlogged + * relations, but it's dirt cheap so do it always to make sure the md + * fields are initialized, for debugging purposes if nothing else. + */ + mdopen(reln); + + /* no work */ + elog(SmgrTrace, "[NEON_SMGR] open noop"); +} + +/* + * neon_close() -- Close the specified relation, if it isn't closed already. 
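+ *
+ * (A side note on the cluster-size check in neon_extend() above, for
+ * illustration: neon.max_cluster_size is measured in megabytes, so with
+ *
+ *    neon.max_cluster_size = 10240
+ *
+ * extension fails with ERRCODE_DISK_FULL once GetZenithCurrentClusterSize()
+ * reports 10240 * 1024 * 1024 bytes, i.e. 10 GiB, or more; temporary and
+ * unlogged relations and autovacuum workers are exempt from the check.)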
+ */ +void +neon_close(SMgrRelation reln, ForkNumber forknum) +{ + /* + * Let md.c close it, if it had it open. Doesn't hurt to do this even for + * permanent relations that have no local storage. + */ + mdclose(reln, forknum); +} + + +/* + * neon_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + + switch (reln->smgr_relpersistence) + { + case 0: /* probably shouldn't happen, but ignore it */ + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + BufferTag tag = (BufferTag) { + .rnode = reln->smgr_rnode.node, + .forkNum = forknum, + .blockNum = blocknum + }; + + ring_index = prefetch_register_buffer(tag, NULL, NULL); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); + + return false; +} + +/* + * neon_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +neon_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[NEON_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * While function is defined in the neon extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. + */ +void +neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) +{ + NeonResponse *resp; + BufferTag buftag; + uint64 ring_index; + PrfHashEntry *entry; + PrefetchRequest *slot; + + buftag = (BufferTag) { + .rnode = rnode, + .forkNum = forkNum, + .blockNum = blkno, + }; + + /* + * Try to find prefetched page in the list of received pages. + */ + entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); + + if (entry != NULL) + { + slot = entry->slot; + if (slot->effective_request_lsn >= request_lsn) + { + ring_index = slot->my_ring_index; + n_prefetch_hits += 1; + } + else /* the current prefetch LSN is not large enough, so drop the prefetch */ + { + /* + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance is + * large enough and a backend didn't consume all prefetch requests. 
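+			 *
+			 * For orientation, the intended flow (a sketch, assuming the
+			 * usual buffer manager entry points) is:
+			 *
+			 *    PrefetchBuffer()
+			 *      -> neon_prefetch()
+			 *           -> prefetch_register_buffer()   queue a GetPage request
+			 *    ReadBuffer()
+			 *      -> neon_read() -> neon_read_at_lsn()
+			 *           -> prfh_lookup()                reuse the prefetched response
+			 *
+			 * which is why a slot whose effective_request_lsn is too old is
+			 * treated as a miss here instead of being returned as a stale page.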
+ */ + if (slot->status == PRFS_REQUESTED) + { + prefetch_wait_for(slot->my_ring_index); + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + n_prefetch_missed_caches += 1; + /* make it look like a prefetch cache miss */ + entry = NULL; + } + } + + if (entry == NULL) + { + n_prefetch_misses += 1; + + ring_index = prefetch_register_buffer(buftag, &request_latest, + &request_lsn); + slot = GetPrfSlot(ring_index); + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + prefetch_wait_for(ring_index); + + Assert(slot->status == PRFS_RECEIVED); + + resp = slot->response; + + switch (resp->tag) + { + case T_NeonGetPageResponse: + memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup(); +} + +/* + * neon_read() -- Read the specified block from a relation. + */ +void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); + neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) + { + elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "heap 
buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); + if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); + } + + return result.data; +} +#endif + +/* + * neon_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + neon_wallog_page(reln, forknum, blocknum, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif +} + +/* + * neon_nblocks() -- Get the number of blocks stored in a relation. 
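+ *
+ * A note on the relsize cache interplay (illustration only): two different
+ * helpers from relsize_cache.c are involved,
+ *
+ *    set_cached_relsize(rnode, forknum, n)      overwrite; used by
+ *                                               create/extend/truncate
+ *    update_cached_relsize(rnode, forknum, n)   grow-only; used below for
+ *                                               the page server's answer
+ *
+ * so an answer from the page server can never shrink a cached size that a
+ * local smgrextend() has already pushed past it.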
+ */ +BlockNumber +neon_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + NeonResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdnblocks(reln, forknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + { + elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, n_blocks); + return n_blocks; + } + + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO); + { + NeonNblocksRequest request = { + .req.tag = T_NeonNblocksRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forknum, + }; + + resp = page_server_request(&request); + } + + switch (resp->tag) + { + case T_NeonNblocksResponse: + n_blocks = ((NeonNblocksResponse *) resp)->n_blocks; + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); + + elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + n_blocks); + + pfree(resp); + return n_blocks; +} + +/* + * neon_db_size() -- Get the size of the database in bytes. + */ +int64 +neon_dbsize(Oid dbNode) +{ + NeonResponse *resp; + int64 db_size; + XLogRecPtr request_lsn; + bool latest; + RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid}; + + request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + { + NeonDbSizeRequest request = { + .req.tag = T_NeonDbSizeRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .dbNode = dbNode, + }; + + resp = page_server_request(&request); + } + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + db_size = ((NeonDbSizeResponse *) resp)->db_size; + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read db size of db %u from page server at lsn %X/%08X", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + db_size); + + pfree(resp); + return db_size; +} + +/* + * neon_truncate() -- Truncate relation to specified number of blocks. 
+ */ +void +neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdtruncate(reln, forknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + + /* + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. + */ + lsn = GetXLogInsertRecPtr(); + + lsn = nm_adjust_lsn(lsn); + + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + /* + * Truncate may affect several chunks of relations. So we should either + * update last written LSN for all of them, or update LSN for "dummy" + * metadata block. Second approach seems more efficient. If the relation + * is extended again later, the extension will update the last-written LSN + * for the extended pages, so there's no harm in leaving behind obsolete + * entries for the truncated chunks. + */ + SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, nblocks); +#endif +} + +/* + * neon_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. + */ +void +neon_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdimmedsync(reln, forknum); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "[NEON_SMGR] immedsync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} + +/* + * neon_start_unlogged_build() -- Starting build operation on a rel. + * + * Some indexes are built in two phases, by first populating the table with + * regular inserts, using the shared buffer cache but skipping WAL-logging, + * and WAL-logging the whole relation after it's done. Neon relies on the + * WAL to reconstruct pages, so we cannot use the page server in the + * first phase when the changes are not logged. 
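+ *
+ * A sketch of the expected call sequence (assuming the smgr_* wrapper names
+ * mirror the f_smgr callbacks registered at the bottom of this file):
+ *
+ *    smgr_start_unlogged_build(reln);            behave as unlogged from now on
+ *    ... populate the rel through shared buffers, skipping WAL ...
+ *    smgr_finish_unlogged_build_phase_1(reln);
+ *    ... WAL-log the whole relation, e.g. via log_newpage_range() ...
+ *    smgr_end_unlogged_build(reln);              drop the local copy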
+ */ +static void +neon_start_unlogged_build(SMgrRelation reln) +{ + /* + * Currently, there can be only one unlogged relation build operation in + * progress at a time. That's enough for the current usage. + */ + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + elog(ERROR, "unlogged relation build is already in progress"); + Assert(unlogged_build_rel == NULL); + + ereport(SmgrTrace, + (errmsg("starting unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (smgrnblocks(reln, MAIN_FORKNUM) != 0) + elog(ERROR, "cannot perform unlogged index build, index is not empty "); + + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; + + /* Make the relation look like it's unlogged */ + reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; + + /* + * FIXME: should we pass isRedo true to create the tablespace dir if it + * doesn't exist? Is it needed? + */ + mdcreate(reln, MAIN_FORKNUM, false); +} + +/* + * neon_finish_unlogged_build_phase_1() + * + * Call this after you have finished populating a relation in unlogged mode, + * before you start WAL-logging it. + */ +static void +neon_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) + return; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; +} + +/* + * neon_end_unlogged_build() -- Finish an unlogged rel build. + * + * Call this after you have finished WAL-logging an relation that was + * first populated without WAL-logging. + * + * This removes the local copy of the rel, since it's now been fully + * WAL-logged and is present in the page server. 
+ */ +static void +neon_end_unlogged_build(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("ending unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) + { + RelFileNodeBackend rnode; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + /* Make the relation look permanent again */ + reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; + + /* Remove local copy */ + rnode = reln->smgr_rnode; + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + rnode.node.spcNode, + rnode.node.dbNode, + rnode.node.relNode, + forknum); + + forget_cached_relsize(rnode.node, forknum); + mdclose(reln, forknum); + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rnode, forknum, true); + } + } + + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +} + +static void +AtEOXact_neon(XactEvent event, void *arg) +{ + switch (event) + { + case XACT_EVENT_ABORT: + case XACT_EVENT_PARALLEL_ABORT: + + /* + * Forget about any build we might have had in progress. The local + * file will be unlinked by smgrDoPendingDeletes() + */ + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + break; + + case XACT_EVENT_COMMIT: + case XACT_EVENT_PARALLEL_COMMIT: + case XACT_EVENT_PREPARE: + case XACT_EVENT_PRE_COMMIT: + case XACT_EVENT_PARALLEL_PRE_COMMIT: + case XACT_EVENT_PRE_PREPARE: + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("unlogged index build was not properly finished")))); + } + break; + } +} + +static const struct f_smgr neon_smgr = +{ + .smgr_init = neon_init, + .smgr_shutdown = NULL, + .smgr_open = neon_open, + .smgr_close = neon_close, + .smgr_create = neon_create, + .smgr_exists = neon_exists, + .smgr_unlink = neon_unlink, + .smgr_extend = neon_extend, + .smgr_prefetch = neon_prefetch, + .smgr_read = neon_read, + .smgr_write = neon_write, + .smgr_writeback = neon_writeback, + .smgr_nblocks = neon_nblocks, + .smgr_truncate = neon_truncate, + .smgr_immedsync = neon_immedsync, + + .smgr_start_unlogged_build = neon_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = neon_end_unlogged_build, +}; + +const f_smgr * +smgr_neon(BackendId backend, RelFileNode rnode) +{ + + /* Don't use page server for temp relations */ + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &neon_smgr; +} + +void +smgr_init_neon(void) +{ + RegisterXactCallback(AtEOXact_neon, NULL); + + smgr_init_standard(); + neon_init(); +} diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c new file mode 100644 index 0000000000..d4262c730a --- /dev/null +++ b/pgxn/neon/relsize_cache.c @@ -0,0 +1,195 @@ +/*------------------------------------------------------------------------- + * + * relsize_cache.c + * Relation size cache for better zentih performance. 
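+ *
+ * The capacity is controlled by the neon.relsize_hash_size GUC defined
+ * below.  As a rough sizing example, the default of 64 * 1024 entries at
+ * about 20 bytes apiece is 64 * 1024 * 20 bytes, i.e. roughly 1.25 MB of
+ * shared memory (plus dynahash overhead); setting the GUC to 0 disables
+ * the cache.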
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/relsize_cache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "catalog/pg_tablespace_d.h" +#include "utils/dynahash.h" +#include "utils/guc.h" + +#if PG_VERSION_NUM >= 150000 +#include "miscadmin.h" +#endif + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; +} RelTag; + +typedef struct +{ + RelTag tag; + BlockNumber size; +} RelSizeEntry; + +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void relsize_shmem_request(void); +#endif + +/* + * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * which seems reasonable. + */ +#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) + +static void +neon_smgr_shmem_startup(void) +{ + static HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("neon_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); +} + +bool +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) +{ + bool found = false; + + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + *size = entry->size; + found = true; + } + LWLockRelease(relsize_lock); + } + return found; +} + +void +set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + bool found; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); + if (!found || entry->size < size) + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + LWLockRelease(relsize_lock); + } +} + +void +relsize_hash_init(void) +{ + DefineCustomIntVariable("neon.relsize_hash_size", + "Sets the maximum number of cached relation sizes for neon", + NULL, + &relsize_hash_size, + DEFAULT_RELSIZE_HASH_SIZE, + 0, + INT_MAX, + 
PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (relsize_hash_size > 0) + { +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = relsize_shmem_request; +#else + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); +#endif + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = neon_smgr_shmem_startup; + } +} + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in neon_smgr_shmem_startup(). + */ +static void +relsize_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); +} +#endif diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c new file mode 100644 index 0000000000..c24142dca1 --- /dev/null +++ b/pgxn/neon/walproposer.c @@ -0,0 +1,2547 @@ +/*------------------------------------------------------------------------- + * + * walproposer.c + * + * Proposer/leader part of the total order broadcast protocol between postgres + * and WAL safekeepers. + * + * We have two ways of launching WalProposer: + * + * 1. As a background worker which will run physical WalSender with + * am_wal_proposer flag set to true. WalSender in turn would handle WAL + * reading part and call WalProposer when ready to scatter WAL. + * + * 2. As a standalone utility by running `postgres --sync-safekeepers`. That + * is needed to create LSN from which it is safe to start postgres. More + * specifically it addresses following problems: + * + * a) Chicken-or-the-egg problem: compute postgres needs data directory + * with non-rel files that are downloaded from pageserver by calling + * basebackup@LSN. This LSN is not arbitrary, it must include all + * previously committed transactions and defined through consensus + * voting, which happens... in walproposer, a part of compute node. + * + * b) Just warranting such LSN is not enough, we must also actually commit + * it and make sure there is a safekeeper who knows this LSN is + * committed so WAL before it can be streamed to pageserver -- otherwise + * basebackup will hang waiting for WAL. Advancing commit_lsn without + * playing consensus game is impossible, so speculative 'let's just poll + * safekeepers, learn start LSN of future epoch and run basebackup' + * won't work. 
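+ *
+ * An example configuration (illustrative values; the GUC names are taken
+ * from the definitions and error messages in this extension):
+ *
+ *    shared_preload_libraries = 'neon'
+ *    neon.safekeepers = 'sk1:5454,sk2:5454,sk3:5454'
+ *    neon.timeline_id = '<32-character hex timeline id>'
+ *    neon.tenant_id   = '<32-character hex tenant id>'
+ *
+ * With an empty neon.safekeepers list the walproposer background worker is
+ * simply not registered, see WalProposerRegister().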
+ * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include "access/xact.h" +#include "access/xlogdefs.h" +#include "access/xlogutils.h" +#include "access/xloginsert.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif +#include "storage/fd.h" +#include "storage/latch.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "libpq/pqformat.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timestamp.h" + +#include "neon.h" +#include "walproposer.h" +#include "walproposer_utils.h" + +static bool syncSafekeepers = false; + +char *wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +int wal_acceptor_connection_timeout; +bool am_wal_proposer; + +char *neon_timeline_walproposer = NULL; +char *neon_tenant_walproposer = NULL; + +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + +static int n_safekeepers = 0; +static int quorum = 0; +static Safekeeper safekeeper[MAX_SAFEKEEPERS]; +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to* + * safekeepers */ +static ProposerGreeting greetRequest; +static VoteRequest voteRequest; /* Vote request for safekeeper */ +static WaitEventSet *waitEvents; +static AppendResponse quorumFeedback; +/* + * Minimal LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). + */ +static XLogRecPtr truncateLsn; + +/* + * Term of the proposer. We want our term to be highest and unique, + * so we collect terms from safekeepers quorum, choose max and +1. + * After that our term is fixed and must not change. If we observe + * that some safekeeper has higher term, it means that we have another + * running compute, so we must stop immediately. 
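+ *
+ * A worked example: if the safekeepers we collected terms from report
+ * terms {2, 5}, we pick propTerm = Max(2, 5) + 1 = 6 and campaign with it;
+ * seeing any term above 6 later means there is another running compute and
+ * this walproposer must stop.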
+ */ +static term_t propTerm; +static TermHistory propTermHistory; /* term history of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ +static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; + +static WalproposerShmemState * walprop_shared; + +/* Prototypes for private functions */ +static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStart(void); +static void WalProposerLoop(void); +static void InitEventSet(void); +static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); +static void ShutdownConnection(Safekeeper *sk); +static void ResetConnection(Safekeeper *sk); +static long TimeToReconnect(TimestampTz now); +static void ReconnectSafekeepers(void); +static void AdvancePollState(Safekeeper *sk, uint32 events); +static void HandleConnectionEvent(Safekeeper *sk); +static void SendStartWALPush(Safekeeper *sk); +static void RecvStartWALPushResult(Safekeeper *sk); +static void SendProposerGreeting(Safekeeper *sk); +static void RecvAcceptorGreeting(Safekeeper *sk); +static void SendVoteRequest(Safekeeper *sk); +static void RecvVoteResponse(Safekeeper *sk); +static void HandleElectedProposer(void); +static term_t GetHighestTerm(TermHistory * th); +static term_t GetEpoch(Safekeeper *sk); +static void DetermineEpochStartLsn(void); +static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void SendProposerElected(Safekeeper *sk); +static void WalProposerStartStreaming(XLogRecPtr startpos); +static void StartStreaming(Safekeeper *sk); +static void SendMessageToNode(Safekeeper *sk); +static void BroadcastAppendRequest(void); +static void HandleActiveState(Safekeeper *sk, uint32 events); +static bool SendAppendRequests(Safekeeper *sk); +static bool RecvAppendResponses(Safekeeper *sk); +static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); +static XLogRecPtr CalculateMinFlushLsn(void); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); +static void HandleSafekeeperResponse(void); +static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg); +static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); +static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); +static bool AsyncFlush(Safekeeper *sk); + +static void nwp_shmem_startup_hook(void); +static void nwp_register_gucs(void); +static void nwp_prepare_shmem(void); +static uint64 backpressure_lag_impl(void); +static bool backpressure_throttling_impl(void); + +static process_interrupts_callback_t PrevProcessInterruptsCallback; +static shmem_startup_hook_type prev_shmem_startup_hook_type; +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void walproposer_shmem_request(void); +#endif + +void +pg_init_walproposer(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + nwp_register_gucs(); + + nwp_prepare_shmem(); + + delay_backend_us = &backpressure_lag_impl; + PrevProcessInterruptsCallback = ProcessInterruptsCallback; + ProcessInterruptsCallback = 
backpressure_throttling_impl; + + WalProposerRegister(); +} + +/* + * Entry point for `postgres --sync-safekeepers`. + */ +void +WalProposerSync(int argc, char *argv[]) +{ + struct stat stat_buf; + + syncSafekeepers = true; +#if PG_VERSION_NUM < 150000 + ThisTimeLineID = 1; +#endif + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + + WalProposerInit(0, 0); + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); +} + +static void +nwp_register_gucs(void) +{ + DefineCustomStringVariable( + "neon.safekeepers", + "List of Neon WAL acceptors (host:port)", + NULL, /* long_desc */ + &wal_acceptors_list, /* valueAddr */ + "", /* bootValue */ + PGC_POSTMASTER, + GUC_LIST_INPUT, /* extensions can't use* + * GUC_LIST_QUOTE */ + NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_reconnect_timeout", + "Timeout for reconnecting to offline wal acceptor.", + NULL, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, /* default, min, max */ + PGC_SIGHUP, /* context */ + GUC_UNIT_MS, /* flags */ + NULL, NULL, NULL); + + DefineCustomIntVariable( + "neon.safekeeper_connect_timeout", + "Timeout for connection establishement and it's maintenance against safekeeper", + NULL, + &wal_acceptor_connection_timeout, + 5000, 0, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MS, + NULL, NULL, NULL); +} + +/* shmem handling */ + +static void +nwp_prepare_shmem(void) +{ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = walproposer_shmem_request; +#else + RequestAddinShmemSpace(WalproposerShmemSize()); +#endif + prev_shmem_startup_hook_type = shmem_startup_hook; + shmem_startup_hook = nwp_shmem_startup_hook; +} + +#if PG_VERSION_NUM >= 150000 +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in nwp_shmem_startup_hook(). + */ +static void +walproposer_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(WalproposerShmemSize()); +} +#endif + +static void +nwp_shmem_startup_hook(void) +{ + if (prev_shmem_startup_hook_type) + prev_shmem_startup_hook_type(); + + WalproposerShmemInit(); +} + +/* + * WAL proposer bgworker entry point. + */ +void +WalProposerMain(Datum main_arg) +{ +#if PG_VERSION_NUM >= 150000 + TimeLineID tli; +#endif + + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + +#if PG_VERSION_NUM >= 150000 + /* FIXME pass proper tli to WalProposerInit ? 
*/ + GetXLogReplayRecPtr(&tli); + WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier()); +#else + GetXLogReplayRecPtr(&ThisTimeLineID); + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); +#endif + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + InitProcessPhase2(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); + } + + WalProposerStart(); +} + +/* + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. + */ +void +WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) +{ + Assert(startpos == availableLsn && endpos >= availableLsn); + availableLsn = endpos; + BroadcastAppendRequest(); +} + +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. + */ +void +WalProposerPoll(void) +{ + while (true) + { + Safekeeper *sk; + int rc; + WaitEvent event; + TimestampTz now = GetCurrentTimestamp(); + + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + sk = (Safekeeper *) event.user_data; + + /* + * If the event contains something that one of our safekeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + AdvancePollState(sk, event.events); + + /* + * If the timeout expired, attempt to reconnect to any safekeepers + * that we dropped + */ + ReconnectSafekeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) + { + ResetLatch(MyLatch); + break; + } + + now = GetCurrentTimestamp(); + if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */ + { + TimestampTz now; + + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (availableLsn != InvalidXLogRecPtr) + { + BroadcastAppendRequest(); + } + + /* + * Abandon connection attempts which take too long. + */ + now = GetCurrentTimestamp(); + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + + if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, + wal_acceptor_connection_timeout)) + { + elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms", + sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); + ShutdownConnection(sk); + } + } + } + } +} + +/* + * Register a background worker proposing WAL to wal acceptors. 
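+ *
+ * For scale, the quorum used by the protocol is computed in WalProposerInit()
+ * below as n_safekeepers / 2 + 1, e.g.:
+ *
+ *    1 safekeeper  -> quorum 1
+ *    3 safekeepers -> quorum 2
+ *    5 safekeepers -> quorum 3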
+ */ +void +WalProposerRegister(void) +{ + BackgroundWorker bgw; + + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static void +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) +{ + char *host; + char *sep; + char *port; + + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) + { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) + { + elog(FATAL, "Too many safekeepers"); + } + safekeeper[n_safekeepers].host = host; + safekeeper[n_safekeepers].port = port; + safekeeper[n_safekeepers].state = SS_OFFLINE; + safekeeper[n_safekeepers].conn = NULL; + + /* + * Set conninfo to empty. We'll fill it out once later, in + * `ResetConnection` as needed + */ + safekeeper[n_safekeepers].conninfo[0] = '\0'; + initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + if (safekeeper[n_safekeepers].xlogreader == NULL) + elog(FATAL, "Failed to allocate xlog reader"); + safekeeper[n_safekeepers].flushWrite = false; + safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; + n_safekeepers += 1; + } + if (n_safekeepers < 1) + { + elog(FATAL, "Safekeepers addresses are not specified"); + } + quorum = n_safekeepers / 2 + 1; + + /* Fill the greeting package */ + greetRequest.tag = 'g'; + greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + greetRequest.pgVersion = PG_VERSION_NUM; + pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); + greetRequest.systemId = systemId; + if (!neon_timeline_walproposer) + elog(FATAL, "neon.timeline_id is not provided"); + if (*neon_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.timeline_id, neon_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", neon_timeline_walproposer); + if (!neon_tenant_walproposer) + elog(FATAL, "neon.tenant_id is not provided"); + if (*neon_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.tenant_id, neon_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", neon_tenant_walproposer); + +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timeline id */ + greetRequest.timeline = 1; +#else + greetRequest.timeline = ThisTimeLineID; +#endif + greetRequest.walSegSize = wal_segment_size; + + InitEventSet(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections to all safekeeper nodes */ + for (int i = 0; i < n_safekeepers; i++) + { + ResetConnection(&safekeeper[i]); + } + + WalProposerLoop(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +/* Initializes 
the internal event set, provided that it is currently null */ +static void +InitEventSet(void) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} + +/* + * Updates the events we're already waiting on for the safekeeper, setting it to + * the provided `events` + * + * This function is called any time the safekeeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and calls to IO helper functions. + */ +static void +UpdateEventSet(Safekeeper *sk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(sk->eventPos != -1); + + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(Safekeeper *to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any safekeeper events */ + InitEventSet(); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < n_safekeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + Safekeeper *sk = &safekeeper[i]; + + sk->eventPos = -1; + + if (sk == to_remove) + continue; + + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) + { + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + } + } +} + +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(Safekeeper *sk) +{ + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->streamingAt = InvalidXLogRecPtr; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); +} + +/* + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. + */ +static void +ResetConnection(Safekeeper *sk) +{ + pgsocket sock; /* socket of the new connection */ + + if (sk->state != SS_OFFLINE) + { + ShutdownConnection(sk); + } + + /* + * Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. 
+ */ + if (sk->conninfo[0] == '\0') + { + int written = 0; + + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + sk->host, sk->port, neon_timeline_walproposer, neon_tenant_walproposer); + + /* + * currently connection string is not that long, but once we pass + * something like jwt we might overflow the buffer, + */ + + /* + * so it is better to be defensive and check that everything aligns + * well + */ + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + } + + sk->conn = walprop_connect_start((char *) &sk->conninfo); + + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ + if (!sk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + { + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + sk->conninfo, walprop_error_message(sk->conn)); + + /* + * Even though the connection failed, we still need to clean up the + * object + */ + walprop_finish(sk->conn); + sk->conn = NULL; + return; + } + + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. For the first + * iteration of the loop, we're expected to wait until the socket becomes + * writable. + * + * The wording of the documentation is a little ambiguous; thankfully + * there's an example in the postgres source itself showing this behavior. + * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + */ + elog(LOG, "connecting with node %s:%s", sk->host, sk->port); + + sk->state = SS_CONNECTING_WRITE; + sk->latestMsgReceivedAt = GetCurrentTimestamp(); + + sock = walprop_socket(sk->conn); + sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); + return; +} + +/* + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). 
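+ *
+ * Worked example (illustrative numbers): with
+ * wal_acceptor_reconnect_timeout = 1000 (ms) and 250 ms elapsed since
+ * last_reconnect_attempt, passed = 250000 us, till_reconnect = 750000 us,
+ * and the function returns 750 (ms).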
+ */ +static long +TimeToReconnect(TimestampTz now) +{ + TimestampTz passed; + TimestampTz till_reconnect; + + if (wal_acceptor_reconnect_timeout <= 0) + return -1; + + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); +} + +/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ +static void +ReconnectSafekeepers(void) +{ + TimestampTz now = GetCurrentTimestamp(); + + if (TimeToReconnect(now) == 0) + { + last_reconnect_attempt = now; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_OFFLINE) + ResetConnection(&safekeeper[i]); + } + } +} + +/* + * Performs the logic for advancing the state machine of the specified safekeeper, + * given that a certain set of events has occured. + */ +static void +AdvancePollState(Safekeeper *sk, uint32 events) +{ + /* + * Sanity check. We assume further down that the operations don't block + * because the socket is ready. + */ + AssertEventsOkForState(events, sk); + + /* Execute the code corresponding to the current state */ + switch (sk->state) + { + /* + * safekeepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ + + /* + * Both connecting states run the same logic. The only difference + * is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + HandleConnectionEvent(sk); + break; + + /* + * Waiting for a successful CopyBoth response. + */ + case SS_WAIT_EXEC_RESULT: + RecvStartWALPushResult(sk); + break; + + /* + * Finish handshake comms: receive information about the + * safekeeper. + */ + case SS_HANDSHAKE_RECV: + RecvAcceptorGreeting(sk); + break; + + /* + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see how + * nodes are transferred from SS_VOTING to sending actual vote + * requests. + */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* Read the safekeeper response for our candidate */ + case SS_WAIT_VERDICT: + RecvVoteResponse(sk); + break; + + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: + + /* + * AsyncFlush ensures we only move on to SS_ACTIVE once the flush + * completes. If we still have more to do, we'll wait until the + * next poll comes along. + */ + if (!AsyncFlush(sk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(sk); + break; + + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return; + + /* + * Active state is used for streaming WAL and receiving feedback. 
+ */ + case SS_ACTIVE: + HandleActiveState(sk, events); + break; + } +} + +static void +HandleConnectionEvent(Safekeeper *sk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; + + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); + sk->latestMsgReceivedAt = GetCurrentTimestamp(); + /* + * We have to pick some event to update event set. We'll + * eventually need the socket to be readable, so we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, continue doing that + */ + case WP_CONN_POLLING_READING: + sk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + sk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + + /* + * If connecting failed, we don't want to restart the connection + * because that might run us into a loop. Instead, shut it down -- + * it'll naturally restart at a slower interval on calls to + * ReconnectSafekeepers. + */ + ShutdownConnection(sk); + return; + } + + /* + * Because PQconnectPoll can change the socket, we have to un-register the + * old event and re-register an event on the new socket. + */ + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); + + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(sk); +} + +/* + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. + */ +static void +SendStartWALPush(Safekeeper *sk) +{ + if (!walprop_send_query(sk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + } + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); +} + +static void +RecvStartWALPushResult(Safekeeper *sk) +{ + switch (walprop_get_query_result(sk->conn)) + { + /* + * Successful result, move on to starting the handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: + + SendProposerGreeting(sk); + break; + + /* + * Needs repeated calls to finish. Wait until the socket is + * readable + */ + case WP_EXEC_NEEDS_INPUT: + + /* + * SS_WAIT_EXEC_RESULT is always reached through an event, so we + * don't need to update the event set + */ + break; + + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return; + + /* + * Unexpected result -- funamdentally an error, but we want to + * produce a custom message, rather than a generic "something went + * wrong" + */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +} + +/* + * Start handshake: first of all send information about the + * safekeeper. 
After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ +static void +SendProposerGreeting(Safekeeper *sk) +{ + /* + * On failure, logging & resetting the connection is handled. We just need + * to handle the control flow. + */ + BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); +} + +static void +RecvAcceptorGreeting(Safekeeper *sk) +{ + /* + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other work + * until later. + */ + sk->greetResponse.apm.tag = 'g'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) + return; + + /* Protocol is all good, move to voting. */ + sk->state = SS_VOTING; + + ++n_connected; + if (n_connected <= quorum) + { + /* We're still collecting terms from the majority. */ + propTerm = Max(sk->greetResponse.term, propTerm); + + /* Quorum is acquried, prepare the vote request. */ + if (n_connected == quorum) + { + propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); + } + } + else if (sk->greetResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, propTerm); + } + + /* + * Check if we have quorum. If there aren't enough safekeepers, wait and + * do nothing. We'll eventually get a task when the election starts. + * + * If we do have quorum, we can start an election. + */ + if (n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the connection + * closed. + */ + UpdateEventSet(sk, WL_SOCKET_READABLE); + } + else + { + /* + * Now send voting request to the cohort and wait responses + */ + for (int j = 0; j < n_safekeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the safekeeper is + * participating in voting, but hasn't sent anything yet. + */ + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); + } + } +} + +static void +SendVoteRequest(Safekeeper *sk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(Safekeeper *sk) +{ + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only if either it + * already lives in strictly higher term (concurrent compute spotted) or + * we are not elected yet and thus need the vote. 
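+ *
+ * For example (illustrative): once quorum = 2 votes have already been
+ * collected, a late rejection that still carries term == propTerm does not
+ * bail out and we simply proceed to SendProposerElected; a rejection with
+ * a strictly higher term, or any rejection before quorum is reached, is
+ * fatal.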
+ */ + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, propTerm); + } + Assert(sk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + sk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(sk); + } + else + { + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. + */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to download WAL + * needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE also, + * because that state is used only for quorum waiting. + */ + + if (syncSafekeepers) + { + /* + * Send empty message to enforce receiving feedback even from nodes + * who are fully recovered; this is required to learn they switched + * epoch which finishes sync-safeekepers who doesn't generate any real + * new records. Will go away once we switch to async acks. + */ + BroadcastAppendRequest(); + + /* keep polling until all safekeepers are synced */ + return; + } + + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ +} + +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory * th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(Safekeeper *sk) +{ + return GetHighestTerm(&sk->voteResponse.termHistory); +} + +/* If LSN points to the page header, skip it */ +static XLogRecPtr +SkipXLogPageHeader(XLogRecPtr lsn) +{ + if (XLogSegmentOffset(lsn, wal_segment_size) == 0) + { + lsn += SizeOfXLogLongPHD; + } + else if (lsn % XLOG_BLCKSZ == 0) + { + lsn += SizeOfXLogShortPHD; + } + return lsn; +} + +/* + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). 
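+ *
+ * Worked example (illustrative values): with three voters reporting
+ * (epoch, flushLsn) = (2, 0/5000), (3, 0/4000) and (3, 0/4800), the third
+ * one wins (highest epoch first, then highest flushLsn within that epoch),
+ * so it becomes the donor and propEpochStartLsn = 0/4800; truncateLsn is
+ * simply the maximum of the reported truncateLsns.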
+ */ +static void +DetermineEpochStartLsn(void) +{ + TermHistory *dth; + + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + { + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } + } + } + + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. Start streaming then from the basebackup LSN. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = GetRedoStartLsn(); + } + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. + */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. + */ + availableLsn = propEpochStartLsn; + + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &safekeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + safekeeper[donor].host, safekeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn)); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN + * since which we are going to write according to the consensus. If not, + * we must bail out, as clog and other non rel data is inconsistent. + */ + if (!syncSafekeepers) + { + /* + * Basebackup LSN always points to the beginning of the record (not + * the page), as StartupXLOG most probably wants it this way. + * Safekeepers don't skip header as they need continious stream of + * data, so correct LSN for comparison. 
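+ *
+ * E.g. (illustrative) if propEpochStartLsn happens to land exactly on a
+ * segment or page boundary, SkipXLogPageHeader() advances it past the
+ * page header before the comparison, since GetRedoStartLsn() already
+ * points past that header to the first record.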
+ */ + if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; + * plain restart of walproposer not intervened by concurrent + * compute (who could generate WAL) is ok. + */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } +} + +/* + * Receive WAL from most advanced safekeeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", + safekeeper[donor].host, safekeeper[donor].port, neon_timeline_walproposer, neon_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + safekeeper[donor].host, safekeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) + { + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); + if (rec_end_lsn >= endpos) + break; + } + } + ereport(LOG, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); + walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; + } + + return true; +} + +/* + * Determine for sk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets sk->startStreamingAt. 
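+ *
+ * Worked example (illustrative values): proposer term history
+ * [(1, 0/100), (3, 0/500)] (its own term 3 is the last entry) against
+ * safekeeper history [(1, 0/100), (2, 0/400)]: the last common entry is
+ * term 1, the following entries diverge, so streaming starts at
+ * Min(0/500, 0/400) = 0/400 and the safekeeper truncates anything it has
+ * beyond that point.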
+ */ +static void +SendProposerElected(Safekeeper *sk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history + * and proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning + * then. + */ + th = &sk->voteResponse.termHistory; + + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) + { + if (propTermHistory.entries[i].term != th->entries[i].term) + break; + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + sk->startStreamingAt = propTermHistory.entries[0].lsn; + + if (sk->startStreamingAt < truncateLsn) + { + /* + * There's a gap between the WAL starting point and a truncateLsn, + * which can't appear in a normal working cluster. That gap means + * that all safekeepers reported that they have persisted WAL up + * to the truncateLsn before, but now current safekeeper tells + * otherwise. + * + * Also we have a special condition here, which is empty + * safekeeper with no history. In combination with a gap, that can + * happen when we introduce a new safekeeper to the cluster. This + * is a rare case, which is triggered manually for now, and should + * be treated with care. + */ + + /* + * truncateLsn will not change without ack from current + * safekeeper, and it's aligned to the WAL record, so we can + * safely start streaming from this point. + */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + sk->startStreamingAt = sk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : sk->voteResponse.flushLsn); + + sk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = sk->startStreamingAt; + msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; + + lastCommonTerm = i >= 0 ? 
propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); + } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); + + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(sk); +} + +/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = greetRequest.timeline; + cmd.startpoint = startpos; + StartProposerReplication(&cmd); +} + +/* + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets + * correct event set. + */ +static void +StartStreaming(Safekeeper *sk) +{ + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed exactly + * once for a connection. + */ + sk->state = SS_ACTIVE; + sk->streamingAt = sk->startStreamingAt; + + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk); +} + +/* + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. + * + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. + */ +static void +SendMessageToNode(Safekeeper *sk) +{ + Assert(sk->state == SS_ACTIVE); + + /* + * Note: we always send everything to the safekeeper until WOULDBLOCK or + * nothing left to send + */ + HandleActiveState(sk, WL_SOCKET_WRITEABLE); +} + +/* + * Broadcast new message to all caught-up safekeepers + */ +static void +BroadcastAppendRequest() +{ + for (int i = 0; i < n_safekeepers; i++) + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); +} + +static void +PrepareAppendRequest(AppendRequestHeader * req, XLogRecPtr beginLsn, XLogRecPtr endLsn) +{ + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; +} + +/* + * Process all events happened in SS_ACTIVE state, update event set after that. + */ +static void +HandleActiveState(Safekeeper *sk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(sk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(sk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * LSN comparison checks if we have pending unsent messages. 
This check + * isn't necessary now, because we always send append messages immediately + * after arrival. But it's good to have it here in case we change this + * behavior in the future. + */ + if (sk->streamingAt != availableLsn || sk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(sk, newEvents); +} + +/* + * Send WAL messages starting from sk->streamingAt until the end or non-writable + * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(Safekeeper *sk) +{ + XLogRecPtr endLsn; + AppendRequestHeader *req; + PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; + + if (sk->flushWrite) + { + if (!AsyncFlush(sk)) + + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. + */ + return sk->state == SS_ACTIVE; + + /* Event set will be updated in the end of HandleActiveState */ + sk->flushWrite = false; + } + + while (sk->streamingAt != availableLsn || !sentAnything) + { + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) + { + endLsn = availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, +#if PG_VERSION_NUM >= 150000 + /* FIXME don't use hardcoded timeline_id here */ + 1, +#else + ThisTimeLineID, +#endif + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event* set. + */ + sk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. 
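+ *
+ * Worked example (illustrative values): with three safekeepers reporting
+ * flushLsn 0/5000, 0/3000 and 0/4000 (all past propEpochStartLsn) and
+ * quorum = 2, GetAcknowledgedByQuorumWALPosition() yields the second
+ * highest value, 0/4000; if that advances past lastSentCommitLsn, the new
+ * commit LSN is broadcast to all active safekeepers.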
+ * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(Safekeeper *sk) +{ + XLogRecPtr minQuorumLsn; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any necessary error + * handling or state setting is taken care of. We can leave any other + * work until later. + */ + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->appendResponse)) + break; + + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + + if (sk->appendResponse.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, propTerm); + } + + readAnything = true; + } + + if (!readAnything) + return sk->state == SS_ACTIVE; + + HandleSafekeeperResponse(); + + /* + * Also send the new commit lsn to all the safekeepers. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); + /* read value length */ + + /* + * Skip 
unknown keys to support backward compatibile protocol + * changes + */ + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. + */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = n_safekeepers > 0 + ? safekeeper[0].appendResponse.flushLsn + : InvalidXLogRecPtr; + + for (int i = 1; i < n_safekeepers; i++) + { + lsn = Min(lsn, safekeeper[i].appendResponse.flushLsn); + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? 
safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback * rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); +} + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestNeonFeedback(ReplicationFeedback * rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + /* Get ReplicationFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. 
*/ + quorumFeedback.flushLsn, + + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. + */ + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. This + * prevents surprising violation of truncateLsn <= commitLsn invariant + * which might occur because 1) truncateLsn can be advanced immediately + * once chunk is broadcast to all safekeepers, and commitLsn generally + * can't be advanced based on feedback from safekeeper who is still in the + * previous epoch (similar to 'leader can't commit entries from previous + * term' in Raft); 2) chunks we read from WAL and send are plain sheets of + * bytes, but safekeepers ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note that + * slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. 
+ */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + sk->latestMsgReceivedAt = GetCurrentTimestamp(); + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. 
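+ *
+ * In this file that currently means the fixed-size handshake and vote
+ * request messages (see SendProposerGreeting and SendVoteRequest); bulk
+ * WAL traffic goes through AsyncWrite / AsyncFlush instead.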
+ */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. + */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. + */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* Check if we need to suspend inserts because of lagging replication. 
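+ *
+ * Worked example (illustrative numbers): with max_replication_write_lag =
+ * 500 (MB) and the local flush LSN 600 MB ahead of the write position
+ * reported back by the pageserver, the lag function returns the excess
+ * (100 MB, in bytes); a return value of 0 means no throttling is needed.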
*/ +static uint64 +backpressure_lag_impl(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; +#if PG_VERSION_NUM >= 150000 + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); +#else + XLogRecPtr myFlushLsn = GetFlushRecPtr(); +#endif + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); +#define MB ((XLogRecPtr)1024 * 1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag * MB); + } + + if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag * MB); + } + + if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag * MB); + } + } + return 0; +} + +#define BACK_PRESSURE_DELAY 10000L // 0.01 sec + +static bool +backpressure_throttling_impl(void) +{ + int64 lag; + TimestampTz start, + stop; + bool retry = PrevProcessInterruptsCallback + ? PrevProcessInterruptsCallback() + : false; + + /* Don't throttle read only transactions and wal sender. */ + if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + return retry; + + /* Calculate replicas lag */ + lag = backpressure_lag_impl(); + if (lag == 0) + return retry; + + /* Suspend writers until replicas catch up */ + set_ps_display("backpressure throttling"); + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + start = GetCurrentTimestamp(); + pg_usleep(BACK_PRESSURE_DELAY); + stop = GetCurrentTimestamp(); + pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + return true; +} + +uint64 +BackpressureThrottlingTime(void) +{ + return pg_atomic_read_u64(&walprop_shared->backpressureThrottlingTime); +} diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h new file mode 100644 index 0000000000..3c4f080353 --- /dev/null +++ b/pgxn/neon/walproposer.h @@ -0,0 +1,509 @@ +#ifndef __NEON_WALPROPOSER_H__ +#define __NEON_WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single* WAL + * message */ +#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* + * message header */ +#define XLOG_HDR_END_POS (1 + 8) /* offset of end position in wal sender* + * message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char *wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int 
wal_acceptor_connection_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *neon_timeline_walproposer; +extern char *neon_tenant_walproposer; + +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until further + * notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. After receiving, moves + * to SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from + * the node. After receiving, moves to SS_IDLE, if the quorum isn't + * reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. 
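+ *
+ * The exchange, in order: ProposerGreeting ('g') answered by
+ * AcceptorGreeting, VoteRequest answered by VoteResponse ('v'),
+ * ProposerElected ('e'), and then a stream of AppendRequest /
+ * AppendResponse messages ('a').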
+ */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 timeline_id[16]; /* Neon timeline id */ + uint8 tenant_id[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. + */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. */ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse +{ + AcceptorProposerMessage apm; + term_t term; + uint64 voteGiven; + + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for* + * recovery of some safekeeper */ + TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; + +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. + */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; +} ProposerElected; + +/* + * Header of request with WAL message sent from proposer to safekeeper. + */ +typedef struct AppendRequestHeader +{ + uint64 tag; + term_t term; /* term of the proposer */ + + /* + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. 
+ */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + + /* + * minimal LSN which may be needed for recovery of some safekeeper (end + * lsn + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + +typedef struct ReplicationFeedback +{ + /* current size of the timeline on pageserver */ + uint64 currentClusterSize; + /* standby_status_update fields that safekeeper received from pageserver */ + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ReplicationFeedback; + +typedef struct WalproposerShmemState +{ + slock_t mutex; + ReplicationFeedback feedback; + term_t mineLastElectedTerm; + pg_atomic_uint64 backpressureThrottlingTime; +} WalproposerShmemState; + +/* + * Report safekeeper state to proposer + */ +typedef struct AppendResponse +{ + AcceptorProposerMessage apm; + + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. + */ + term_t term; + /* TODO: add comment */ + XLogRecPtr flushLsn; + /* Safekeeper reports back his awareness about which WAL is committed, as */ + /* this is a criterion for walproposer --sync mode exit */ + XLogRecPtr commitLsn; + HotStandbyFeedback hs; + /* Feedback recieved from pageserver includes standby_status_update fields */ + /* and custom neon feedback. */ + /* This part of the message is extensible. */ + ReplicationFeedback rf; +} AppendResponse; + +/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* Other fields are fixed part */ +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) + +/* + * Descriptor of safekeeper + */ +typedef struct Safekeeper +{ + char const *host; + char const *port; + char conninfo[MAXCONNINFO]; /* connection info for* + * connecting/reconnecting */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_ACTIVE; not before. + */ + WalProposerConn *conn; + + /* + * Temporary buffer for the message being sent to the safekeeper. + */ + StringInfoData outbuf; + + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState *xlogreader; + + /* + * Streaming will start here; must be record boundary. + */ + XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush,* + * to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. 
Equal to -1 if* + * no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz latestMsgReceivedAt; /* when latest msg is received */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ +} Safekeeper; + +extern PGDLLIMPORT void WalProposerMain(Datum main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback * rf); +extern void StartProposerReplication(StartReplicationCmd *cmd); + +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback * rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); + +/* libpqwalproposer hooks & helper type */ + +/* Re-exported PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that they + * not be relied upon (except for displaying to the user). We don't need + * that extra functionality, so we collect them into a single tag here. 
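These re-exported enums mirror libpq's nonblocking connect and query protocol. A minimal sketch of driving a connection attempt with the walprop_* wrappers declared below, assuming the caller waits on walprop_socket() readiness between polls (the helper name is hypothetical):

static bool
sketch_connect(char *conninfo, WalProposerConn **conn)
{
    WalProposerConnectPollStatusType st;

    *conn = walprop_connect_start(conninfo);
    if (walprop_status(*conn) == WP_CONNECTION_BAD)
        return false;

    do
    {
        /* a real caller waits for walprop_socket(*conn) to become ready here */
        st = walprop_connect_poll(*conn);
    } while (st == WP_CONN_POLLING_READING || st == WP_CONN_POLLING_WRITING);

    return st == WP_CONN_POLLING_OK;
}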
+ */ + WP_CONNECTION_IN_PROGRESS, +} WalProposerConnStatusType; + +/* Re-exported PQerrorMessage */ +extern char *walprop_error_message(WalProposerConn *conn); + +/* Re-exported PQstatus */ +extern WalProposerConnStatusType walprop_status(WalProposerConn *conn); + +/* Re-exported PQconnectStart */ +extern WalProposerConn * walprop_connect_start(char *conninfo); + +/* Re-exported PQconectPoll */ +extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn); + +/* Blocking wrapper around PQsendQuery */ +extern bool walprop_send_query(WalProposerConn *conn, char *query); + +/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ +extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn); + +/* Re-exported PQsocket */ +extern pgsocket walprop_socket(WalProposerConn *conn); + +/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ +extern int walprop_flush(WalProposerConn *conn); + +/* Re-exported PQfinish */ +extern void walprop_finish(WalProposerConn *conn); + +/* + * Ergonomic wrapper around PGgetCopyData + * + * Reads a CopyData block from a safekeeper, setting *amount to the number + * of bytes returned. + * + * This function is allowed to assume certain properties specific to the + * protocol with the safekeepers, so it should not be used as-is for any + * other purpose. + * + * Note: If possible, using is generally preferred, because it + * performs a bit of extra checking work that's always required and is normally + * somewhat verbose. + */ +extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount); + +/* + * Ergonomic wrapper around PQputCopyData + PQflush + * + * Starts to write a CopyData block to a safekeeper. + * + * For information on the meaning of return codes, refer to PGAsyncWriteResult. + */ +extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size); + +/* + * Blocking equivalent to walprop_async_write_fn + * + * Returns 'true' if successful, 'false' on failure. + */ +extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size); + +extern uint64 BackpressureThrottlingTime(void); + +#endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c new file mode 100644 index 0000000000..e1dcaa081d --- /dev/null +++ b/pgxn/neon/walproposer_utils.c @@ -0,0 +1,1191 @@ +#include "postgres.h" + +#include "access/timeline.h" +#include "access/xlogutils.h" +#include "common/logging.h" +#include "common/ip.h" +#include "funcapi.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "walproposer_utils.h" +#include "replication/walsender_private.h" + +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/ps_status.h" + +#include "libpq-fe.h" +#include +#include + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#endif + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + +/* START cloned file-local variables and functions from walsender.c */ + +/* + * xlogreader used for replication. 
Note that a WAL sender doing physical + * replication does not need xlogreader to read WAL, but it needs one to + * keep a state of its work. + */ +static XLogReaderState *xlogreader = NULL; + +/* + * These variables keep track of the state of the timeline we're currently + * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, + * the timeline is not the latest timeline on this server, and the server's + * history forked off from that timeline at sendTimeLineValidUpto. + */ +static TimeLineID sendTimeLine = 0; +static TimeLineID sendTimeLineNextTLI = 0; +static bool sendTimeLineIsHistoric = false; +static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; + +/* + * Timestamp of last ProcessRepliesIfAny() that saw a reply from the + * standby. Set to 0 if wal_sender_timeout doesn't need to be active. + */ +static TimestampTz last_reply_timestamp = 0; + +/* Have we sent a heartbeat message asking for reply, since last reply? */ +static bool waiting_for_ping_response = false; + +static bool streamingDoneSending; +static bool streamingDoneReceiving; + +/* Are we there yet? */ +static bool WalSndCaughtUp = false; + +/* Flags set by signal handlers for later service in main loop */ +static volatile sig_atomic_t got_STOPPING = false; + +/* + * How far have we sent WAL already? This is also advertised in + * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) + */ +static XLogRecPtr sentPtr = InvalidXLogRecPtr; + +/* + * This is set while we are streaming. When not set + * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, + * the main loop is responsible for checking got_STOPPING and terminating when + * it's set (after streaming any remaining WAL). + */ +static volatile sig_atomic_t replication_active = false; + +typedef void (*WalSndSendDataCallback) (void); +static void WalSndLoop(WalSndSendDataCallback send_data); +static void XLogSendPhysical(void); +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); +#else +static XLogRecPtr GetStandbyFlushRecPtr(void); +#endif + +static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p); + +/* END cloned file-level variables and functions from walsender.c */ + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* Returns a human-readable string corresonding to the SafekeeperState + * + * The string should not be freed. 
+ * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +char * +FormatSafekeeperState(SafekeeperState state) +{ + char *return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +void +AssertEventsOkForState(uint32 events, Safekeeper *sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); + + /* + * The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. (b) if we are expecting something, there's + * overlap (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more + * clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* + * To give a descriptive message in the case of failure, we use elog + * and then an assertion that's guaranteed to fail. + */ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) +{ + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* + * Idle states use read-readiness as a sign that the connection + * has been disconnected. + */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. Active state + * does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We + * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. 
*/ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. */ +char * +FormatEvents(uint32 events) +{ + static char return_str[8]; + + /* Helper variable to check if there's extra bits */ + uint32 all_flags = WL_LATCH_SET + | WL_SOCKET_READABLE + | WL_SOCKET_WRITEABLE + | WL_TIMEOUT + | WL_POSTMASTER_DEATH + | WL_EXIT_ON_PM_DEATH + | WL_SOCKET_CONNECTED; + + /* + * The formatting here isn't supposed to be *particularly* useful -- it's + * just to give an sense of what events have been triggered without + * needing to remember your powers of two. + */ + + return_str[0] = (events & WL_LATCH_SET) ? 'L' : '_'; + return_str[1] = (events & WL_SOCKET_READABLE) ? 'R' : '_'; + return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; + return_str[3] = (events & WL_TIMEOUT) ? 'T' : '_'; + return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; + return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; + return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; + + if (events & (~all_flags)) + { + elog(WARNING, "Event formatting found unexpected component %d", + events & (~all_flags)); + return_str[6] = '*'; + return_str[7] = '\0'; + } + else + return_str[6] = '\0'; + + return (char *) &return_str; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. + */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. 
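A small usage sketch, not taken from the patch: the proposer's hex timeline and tenant ids decode into the 16-byte arrays carried in ProposerGreeting. The wrapper name and error wording here are illustrative only.

static void
sketch_parse_timeline_id(uint8 *timeline_id /* 16 bytes */ )
{
    if (strlen(neon_timeline_walproposer) != 32 ||
        !HexDecodeString(timeline_id, neon_timeline_walproposer, 16))
        elog(FATAL, "could not parse neon timeline id");
}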
+ */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} + +/* + * Write XLOG data to disk. + */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { +#if PG_VERSION_NUM >= 150000 + /* FIXME Is it ok to use hardcoded value here? */ + TimeLineID tli = 1; +#else + bool use_existent = true; +#endif + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); +#if PG_VERSION_NUM >= 150000 + walpropFile = XLogFileInit(walpropSegNo, tli); + walpropFileTLI = tli; +#else + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; +#endif + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. 
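The pq_*_le helpers above exist because the safekeeper protocol carries integers in native little-endian order rather than the network byte order used by the regular pq_sendint64/pq_getmsgint64. A self-contained round-trip sketch, purely for illustration and not part of the patch:

static void
sketch_le_roundtrip(void)
{
    StringInfoData buf;
    uint64      value;

    initStringInfo(&buf);
    pq_sendint64_le(&buf, UINT64CONST(424242));

    buf.cursor = 0;             /* rewind and read the same bytes back */
    value = pq_getmsgint64_le(&buf);
    Assert(value == UINT64CONST(424242));

    pfree(buf.data);
}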
+ */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} + +/* START of cloned functions from walsender.c */ + +/* + * Handle START_REPLICATION command. + * + * At the moment, this never returns, but an ereport(ERROR) will take us back + * to the main loop. + */ +void +StartProposerReplication(StartReplicationCmd *cmd) +{ + XLogRecPtr FlushPtr; + TimeLineID currTLI; + +#if PG_VERSION_NUM < 150000 + if (ThisTimeLineID == 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); +#endif + + /* create xlogreader for physical replication */ + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.segment_open = WalSndSegmentOpen, + .segment_close = wal_segment_close), + NULL); + + if (!xlogreader) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* + * We assume here that we're logging enough information in the WAL for + * log-shipping, since this is checked in PostmasterMain(). + * + * NOTE: wal_level can only change at shutdown, so in most cases it is + * difficult for there to be WAL data that we can still see that was + * written at wal_level='minimal'. + */ + + if (cmd->slotname) + { + ReplicationSlotAcquire(cmd->slotname, true); + if (SlotIsLogical(MyReplicationSlot)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot use a logical replication slot for physical replication"))); + + /* + * We don't need to verify the slot's restart_lsn here; instead we + * rely on the caller requesting the starting point to use. If the + * WAL segment doesn't exist, we'll fail later. + */ + } + + /* + * Select the timeline. If it was given explicitly by the client, use + * that. Otherwise use the timeline of the last replayed record, which is + * kept in ThisTimeLineID. + * + * Neon doesn't currently use PG Timelines, but it may in the future, so + * we keep this code around to lighten the load for when we need it. + */ +#if PG_VERSION_NUM >= 150000 + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(&currTLI); + } + else + FlushPtr = GetFlushRecPtr(&currTLI); +#else + if (am_cascading_walsender) + { + /* this also updates ThisTimeLineID */ + FlushPtr = GetStandbyFlushRecPtr(); + } + else + FlushPtr = GetFlushRecPtr(); + + currTLI = ThisTimeLineID; +#endif + + + if (cmd->timeline != 0) + { + XLogRecPtr switchpoint; + + sendTimeLine = cmd->timeline; + if (sendTimeLine == currTLI) + { + sendTimeLineIsHistoric = false; + sendTimeLineValidUpto = InvalidXLogRecPtr; + } + else + { + List *timeLineHistory; + + sendTimeLineIsHistoric = true; + + /* + * Check that the timeline the client requested exists, and the + * requested start location is on that timeline. + */ + timeLineHistory = readTimeLineHistory(currTLI); + switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, + &sendTimeLineNextTLI); + list_free_deep(timeLineHistory); + + /* + * Found the requested timeline in the history. Check that + * requested startpoint is on that timeline in our history. + * + * This is quite loose on purpose. 
We only check that we didn't + * fork off the requested timeline before the switchpoint. We + * don't check that we switched *to* it before the requested + * starting point. This is because the client can legitimately + * request to start replication from the beginning of the WAL + * segment that contains switchpoint, but on the new timeline, so + * that it doesn't end up with a partial segment. If you ask for + * too old a starting point, you'll get an error later when we + * fail to find the requested WAL segment in pg_wal. + * + * XXX: we could be more strict here and only allow a startpoint + * that's older than the switchpoint, if it's still in the same + * WAL segment. + */ + if (!XLogRecPtrIsInvalid(switchpoint) && + switchpoint < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint)))); + } + sendTimeLineValidUpto = switchpoint; + } + } + else + { + sendTimeLine = currTLI; + sendTimeLineValidUpto = InvalidXLogRecPtr; + sendTimeLineIsHistoric = false; + } + + streamingDoneSending = streamingDoneReceiving = false; + + /* If there is nothing to stream, don't even enter COPY mode */ + if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) + { + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); + + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) + { + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); + } + + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; + + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); + + SyncRepInitConfig(); + + /* Main loop of walsender */ + replication_active = true; + + WalSndLoop(XLogSendPhysical); + + replication_active = false; + if (got_STOPPING) + proc_exit(0); + WalSndSetState(WALSNDSTATE_STARTUP); + + Assert(streamingDoneSending && streamingDoneReceiving); + } + + if (cmd->slotname) + ReplicationSlotRelease(); + + /* + * Copy is finished now. Send a single-row result set indicating the next + * timeline. + */ + if (sendTimeLineIsHistoric) + { + char startpos_str[8 + 1 + 8 + 1]; + DestReceiver *dest; + TupOutputState *tstate; + TupleDesc tupdesc; + Datum values[2]; + bool nulls[2]; + + snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + LSN_FORMAT_ARGS(sendTimeLineValidUpto)); + + dest = CreateDestReceiver(DestRemoteSimple); + MemSet(nulls, false, sizeof(nulls)); + + /* + * Need a tuple descriptor representing two columns. int8 may seem + * like a surprising data type for this, but in theory int4 would not + * be wide enough for this, as TimeLineID is unsigned. 
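Concretely: TimeLineID is a uint32, so a timeline id such as 3000000000 exceeds PG_INT32_MAX (2147483647) and would not fit in int4, while int8 together with the Int64GetDatum cast used below covers the full unsigned range.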
+ */ + tupdesc = CreateTemplateTupleDesc(2); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", + INT8OID, -1, 0); + TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", + TEXTOID, -1, 0); + + /* prepare for projection of tuple */ + tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); + + values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); + values[1] = CStringGetTextDatum(startpos_str); + + /* send it to dest */ + do_tup_output(tstate, values, nulls); + + end_tup_output(tstate); + } + + /* Send CommandComplete message */ + EndReplicationCommand("START_STREAMING"); +} + +#if PG_VERSION_NUM >= 150000 +static XLogRecPtr +GetStandbyFlushRecPtr(TimeLineID *tli) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. + */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + *tli = replayTLI; + + result = replayPtr; + if (receiveTLI == replayTLI && receivePtr > replayPtr) + result = receivePtr; + + return result; +} +#else +/* + * Returns the latest point in WAL that has been safely flushed to disk, and + * can be sent to the standby. This should only be called when in recovery, + * ie. we're streaming to a cascaded standby. + * + * As a side-effect, ThisTimeLineID is updated to the TLI of the last + * replayed WAL record. + */ +static XLogRecPtr +GetStandbyFlushRecPtr(void) +{ + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr receivePtr; + TimeLineID receiveTLI; + XLogRecPtr result; + + /* + * We can safely send what's already been replayed. Also, if walreceiver + * is streaming WAL from the same timeline, we can send anything that it + * has streamed, but hasn't been replayed yet. + */ + + receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); + replayPtr = GetXLogReplayRecPtr(&replayTLI); + + ThisTimeLineID = replayTLI; + + result = replayPtr; + if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) + result = receivePtr; + + return result; +} +#endif + + + +/* XLogReaderRoutine->segment_open callback */ +static void +WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + char path[MAXPGPATH]; + + /*------- + * When reading from a historic timeline, and there is a timeline switch + * within this segment, read from the WAL segment belonging to the new + * timeline. + * + * For example, imagine that this server is currently on timeline 5, and + * we're streaming timeline 4. The switch from timeline 4 to 5 happened at + * 0/13002088. In pg_wal, we have these files: + * + * ... + * 000000040000000000000012 + * 000000040000000000000013 + * 000000050000000000000013 + * 000000050000000000000014 + * ... + * + * In this situation, when requested to send the WAL from segment 0x13, on + * timeline 4, we read the WAL from file 000000050000000000000013. Archive + * recovery prefers files from newer timelines, so if the segment was + * restored from the archive on this server, the file belonging to the old + * timeline, 000000040000000000000013, might not exist. Their contents are + * equal up to the switchpoint, because at a timeline switch, the used + * portion of the old segment is copied to the new file. 
------- + */ + *tli_p = sendTimeLine; + if (sendTimeLineIsHistoric) + { + XLogSegNo endSegNo; + + XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); + if (nextSegNo == endSegNo) + *tli_p = sendTimeLineNextTLI; + } + + XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; + + /* + * If the file is not found, assume it's because the standby asked for a + * too old WAL segment that has already been removed or recycled. + */ + if (errno == ENOENT) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + xlogfname))); + } + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); +} + + +/* Main loop of walsender process that streams the WAL over Copy messages. */ +static void +WalSndLoop(WalSndSendDataCallback send_data) +{ + /* + * Initialize the last reply timestamp. That enables timeout processing + * from hereon. + */ + last_reply_timestamp = GetCurrentTimestamp(); + waiting_for_ping_response = false; + + /* + * Loop until we reach the end of this timeline or the client requests to + * stop streaming. + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Process any requests or signals received recently */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + SyncRepInitConfig(); + } + + /* always true */ + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + } +} + +/* + * Send out the WAL in its normal physical/stored form. + * + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, + * but not yet sent to the client, and buffer it in the libpq output + * buffer. + * + * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, + * otherwise WalSndCaughtUp is set to false. + */ +static void +XLogSendPhysical(void) +{ + XLogRecPtr SendRqstPtr; + XLogRecPtr startptr; + XLogRecPtr endptr; + Size nbytes PG_USED_FOR_ASSERTS_ONLY; + TimeLineID currTLI; + + /* If requested switch the WAL sender to the stopping state. */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + if (streamingDoneSending) + { + WalSndCaughtUp = true; + return; + } + + /* Figure out how far we can safely send the WAL. */ + if (sendTimeLineIsHistoric) + { + /* + * Streaming an old timeline that's in this server's history, but is + * not the one we're currently inserting or replaying. It can be + * streamed up to the point where we switched off that timeline. + */ + SendRqstPtr = sendTimeLineValidUpto; + } + else if (am_cascading_walsender) + { + /* + * Streaming the latest timeline on a standby. + * + * Attempt to send all WAL that has already been replayed, so that we + * know it's valid. If we're receiving WAL through streaming + * replication, it's also OK to send any WAL that has been received + * but not replayed. + * + * The timeline we're recovering from can change, or we can be + * promoted. In either case, the current timeline becomes historic. 
We + * need to detect that so that we don't try to stream past the point + * where we switched to another timeline. We check for promotion or + * timeline switch after calculating FlushPtr, to avoid a race + * condition: if the timeline becomes historic just after we checked + * that it was still current, it's still be OK to stream it up to the + * FlushPtr that was calculated before it became historic. + */ + bool becameHistoric = false; +#if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); +#else + SendRqstPtr = GetStandbyFlushRecPtr(); + currTLI = ThisTimeLineID; +#endif + if (!RecoveryInProgress()) + { + /* + * We have been promoted. RecoveryInProgress() updated + * ThisTimeLineID to the new current timeline. + */ + am_cascading_walsender = false; + becameHistoric = true; + } + else + { + /* + * Still a cascading standby. But is the timeline we're sending + * still the one recovery is recovering from? currTLI was updated + * by the GetStandbyFlushRecPtr() call above. + */ + if (sendTimeLine != currTLI) + becameHistoric = true; + } + + if (becameHistoric) + { + /* + * The timeline we were sending has become historic. Read the + * timeline history file of the new timeline to see where exactly + * we forked off from the timeline we were sending. + */ + List *history; + + history = readTimeLineHistory(currTLI); + sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); + + Assert(sendTimeLine < sendTimeLineNextTLI); + list_free_deep(history); + + sendTimeLineIsHistoric = true; + + SendRqstPtr = sendTimeLineValidUpto; + } + } + else + { + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. + */ +#if PG_VERSION_NUM >= 150000 + SendRqstPtr = GetFlushRecPtr(NULL); +#else + SendRqstPtr = GetFlushRecPtr(); +#endif + } + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close to together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. 
+ */ + LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); + + /* + * If this is a historic timeline and we've reached the point where we + * forked to the next timeline, stop streaming. + * + * Note: We might already have sent WAL > sendTimeLineValidUpto. The + * startup process will normally replay all WAL that has been received + * from the primary, before promoting, but if the WAL streaming is + * terminated at a WAL page boundary, the valid portion of the timeline + * might end in the middle of a WAL record. We might've already sent the + * first half of that partial WAL record to the cascading standby, so that + * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't + * replay the partial WAL record either, so it can still follow our + * timeline switch. + */ + if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) + { + /* close the current file. */ + if (xlogreader->seg.ws_file >= 0) + wal_segment_close(xlogreader); + + /* Send CopyDone */ + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + + WalSndCaughtUp = true; + + elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + LSN_FORMAT_ARGS(sendTimeLineValidUpto), + LSN_FORMAT_ARGS(sentPtr)); + return; + } + + /* Do we have any work to do? */ + Assert(sentPtr <= SendRqstPtr); + if (SendRqstPtr <= sentPtr) + { + WalSndCaughtUp = true; + return; + } + + /* + * Figure out how much to send in one message. If there's no more than + * MAX_SEND_SIZE bytes to send, send everything. Otherwise send + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. + * + * The rounding is not only for performance reasons. Walreceiver relies on + * the fact that we never split a WAL record across two messages. Since a + * long WAL record is split at page boundary into continuation records, + * page boundary is always a safe cut-off point. We also assume that + * SendRqstPtr never points to the middle of a WAL record. + */ + startptr = sentPtr; + endptr = startptr; + endptr += MAX_SEND_SIZE; + + /* if we went beyond SendRqstPtr, back off */ + if (SendRqstPtr <= endptr) + { + endptr = SendRqstPtr; + if (sendTimeLineIsHistoric) + WalSndCaughtUp = false; + else + WalSndCaughtUp = true; + } + else + { + /* round down to page boundary. 
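A worked instance of the rounding below, assuming the default XLOG_BLCKSZ of 8192 (0x2000): with endptr = 0x12345, endptr % XLOG_BLCKSZ = 0x345, so endptr becomes 0x12000, the last complete WAL page boundary at or before the original value.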
*/ + endptr -= (endptr % XLOG_BLCKSZ); + WalSndCaughtUp = false; + } + + nbytes = endptr - startptr; + Assert(nbytes <= MAX_SEND_SIZE); + + /* always true */ + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } + else + { + /* code removed for brevity */ + } + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(sentPtr)); + set_ps_display(activitymsg); + } +} diff --git a/pgxn/neon/walproposer_utils.h b/pgxn/neon/walproposer_utils.h new file mode 100644 index 0000000000..aa5df5fa43 --- /dev/null +++ b/pgxn/neon/walproposer_utils.h @@ -0,0 +1,19 @@ +#ifndef __NEON_WALPROPOSER_UTILS_H__ +#define __NEON_WALPROPOSER_UTILS_H__ + +#include "walproposer.h" + +int CompareLsn(const void *a, const void *b); +char *FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper *sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char *FormatEvents(uint32 events); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); + +#endif /* __NEON_WALPROPOSER_UTILS_H__ */ diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile new file mode 100644 index 0000000000..9c774ec185 --- /dev/null +++ b/pgxn/neon_test_utils/Makefile @@ -0,0 +1,15 @@ +# pgxs/neon_test_utils/Makefile + + +MODULE_big = neon_test_utils +OBJS = \ + $(WIN32RES) \ + neontest.o + +EXTENSION = neon_test_utils +DATA = neon_test_utils--1.0.sql +PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.0.sql new file mode 100644 index 0000000000..402981a9a6 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. 
\quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control new file mode 100644 index 0000000000..94e6720503 --- /dev/null +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c new file mode 100644 index 0000000000..e0cea4177b --- /dev/null +++ b/pgxn/neon_test_utils/neontest.c @@ -0,0 +1,302 @@ +/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "../neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in neon module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static neon_read_at_lsn_type neon_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. + */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&neon_read_at_lsn, neon_read_at_lsn_type); + neon_read_at_lsn_ptr = (neon_read_at_lsn_type) + load_external_function("$libdir/neon", "neon_read_at_lsn", + true, NULL); +} + +#define neon_read_at_lsn neon_read_at_lsn_ptr + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. 
+ */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. + */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_neon_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. + */ + save_neon_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from the + * buffer cache if no one else is holding a pin on it. + */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_neon_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2)}; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + + /* Initialize buffer to copy to */ + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + PG_RETURN_BYTEA_P(raw_page); + } +} + +/* + * Directly calls XLogFlush(lsn) to flush WAL buffers. + */ +Datum +neon_xlogflush(PG_FUNCTION_ARGS) +{ + XLogRecPtr lsn = PG_GETARG_LSN(0); + + XLogFlush(lsn); + PG_RETURN_VOID(); +} diff --git a/pgxn/neon_walredo/Makefile b/pgxn/neon_walredo/Makefile new file mode 100644 index 0000000000..495527c89b --- /dev/null +++ b/pgxn/neon_walredo/Makefile @@ -0,0 +1,22 @@ +# pgxs/neon_walredo/Makefile + +MODULE_big = neon_walredo +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o \ + walredoproc.o \ + +# This really should be guarded by $(with_libseccomp), but I couldn't +# make that work with pgxs. So we always compile it, but its contents +# are wrapped in #ifdef HAVE_LIBSECCOMP instead. +OBJS += seccomp.o + +PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver" + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +ifeq ($(with_libseccomp),yes) +SHLIB_LINK += -lseccomp +endif diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c new file mode 100644 index 0000000000..2219543628 --- /dev/null +++ b/pgxn/neon_walredo/inmem_smgr.c @@ -0,0 +1,313 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process. It has no persistent storage, the pages that are written out + * are kept in a small number of in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. 
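For a sense of scale, with the standard 8 kB BLCKSZ the constants defined below give MAX_PAGES * BLCKSZ = 64 * 8192 = 512 kB of overflow storage, and warnings start being logged once WARN_PAGES (32 pages, 256 kB) are in use.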
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogutils.h" +#endif + +#include "inmem_smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} + + +/* neon wal-redo storage manager functionality */ +static void inmem_init(void); +static void inmem_open(SMgrRelation reln); +static void inmem_close(SMgrRelation reln, ForkNumber forknum); +static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +static bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +static void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +static void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +static void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +static void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* + * inmem_init() -- Initialize private state + */ +static void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +static bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on neon storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +static void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. + */ +static void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). 
Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +static void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +static void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +static void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +static bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + */ +static void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +static void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +static void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in + * rare cases is OK. But if we find that we're using more than + * WARN_PAGES, print a warning so that we get alerted and get to + * investigate why we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } + else + { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +static BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). During + * recovery, at least before reaching consistency, the size of a relation + * could be arbitrarily small, if it was truncated after the record being + * replayed, or arbitrarily large if it was extended afterwards. But one + * place where it's called is in XLogReadBufferExtended(): it extends the + * relation, if it's smaller than the requested page. That's a waste of + * time in the WAL redo process. Pretend that all relations are maximally + * sized to avoid it. 
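+ *
+ * Roughly, XLogReadBufferExtended() does the following (paraphrased from
+ * xlogutils.c, not copied verbatim):
+ *
+ *     lastblock = smgrnblocks(smgr, forknum);
+ *     if (blkno < lastblock)
+ *         buffer = ReadBufferWithoutRelcache(...);     (normal read)
+ *     else
+ *         ... extend the relation one block at a time ...
+ *
+ * Returning MaxBlockNumber makes the first branch win, so the expensive
+ * extension loop is never taken in the WAL redo process.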
+ */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +static void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +static void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/pgxn/neon_walredo/inmem_smgr.h b/pgxn/neon_walredo/inmem_smgr.h new file mode 100644 index 0000000000..af7c3fe6cc --- /dev/null +++ b/pgxn/neon_walredo/inmem_smgr.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef INMEM_SMGR_H +#define INMEM_SMGR_H + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); + +#endif /* INMEM_SMGR_H */ diff --git a/pgxn/neon_walredo/neon_seccomp.h b/pgxn/neon_walredo/neon_seccomp.h new file mode 100644 index 0000000000..ea92d38a77 --- /dev/null +++ b/pgxn/neon_walredo/neon_seccomp.h @@ -0,0 +1,22 @@ +#ifndef NEON_SECCOMP_H +#define NEON_SECCOMP_H + +#include + +typedef struct { + int psr_syscall; /* syscall number */ + uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ +} PgSeccompRule; + +#define PG_SCMP(syscall, action) \ + (PgSeccompRule) { \ + .psr_syscall = SCMP_SYS(syscall), \ + .psr_action = (action), \ + } + +#define PG_SCMP_ALLOW(syscall) \ + PG_SCMP(syscall, SCMP_ACT_ALLOW) + +extern void seccomp_load_rules(PgSeccompRule *syscalls, int count); + +#endif /* NEON_SECCOMP_H */ diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c new file mode 100644 index 0000000000..5d5ba549ef --- /dev/null +++ b/pgxn/neon_walredo/seccomp.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * seccomp.c + * Secure Computing BPF API wrapper. + * + * Pageserver delegates complex WAL decoding duties to postgres, + * which means that the latter might fall victim to carefully designed + * malicious WAL records and start doing harmful things to the system. + * To prevent this, it has been decided to limit possible interactions + * with the outside world using the Secure Computing BPF mode. + * + * We use this mode to disable all syscalls not in the allowlist. This + * approach has its pros & cons: + * + * - We have to carefully handpick and maintain the set of syscalls + * required for the WAL redo process. Core dumps help with that. 
+ * The method of trial and error seems to work reasonably well, + * but it would be nice to find a proper way to "prove" that + * the set in question is both necessary and sufficient. + * + * - Once we enter the seccomp bpf mode, it's impossible to lift those + * restrictions (otherwise, what kind of "protection" would that be?). + * Thus, we have to either enable extra syscalls for the clean shutdown, + * or exit the process immediately via _exit() instead of proc_exit(). + * + * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom + * facility to deal with the forbidden syscalls? If we'd like to embed + * a startup security test, we should go with the latter; In that + * case, which one of the following options is preferable? + * + * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. + * Provide a common signal handler with a static switch to override + * its behavior for the test case. This would undermine the whole + * purpose of such protection, so we'd have to go further and remap + * the memory backing the switch as readonly, then ban mprotect(). + * Ugly and fragile, to say the least. + * + * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. + * Provide 2 different signal handlers: one for a test case, + * another for the main processing loop. Install the first one, + * enable seccomp, perform the test, switch to the second one, + * finally ban sigaction(), presto! + * + * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the + * test, then ban it altogether with another filter. The downside + * of this solution is that we don't actually check that + * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. + * + * Either approach seems to require two eBPF filter programs, + * which is unfortunate: the man page tells this is uncommon. + * Maybe I (@funbringer) am missing something, though; I encourage + * any reader to get familiar with it and scrutinize my conclusions. + * + * TODOs and ideas in no particular order: + * + * - Do something about mmap() in musl's malloc(). + * Definitely not a priority if we don't care about musl. + * + * - See if we can untangle PG's shutdown sequence (involving unlink()): + * + * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. + * * Investigate chroot() or mount namespaces for better FS isolation. + * * (Per Heikki) Simply call _exit(), no big deal. + * * Come up with a better idea? + * + * - Make use of seccomp's argument inspection (for what?). + * Unfortunately, it views all syscall arguments as scalars, + * so it won't work for e.g. string comparison in unlink(). + * + * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). + * + * - Test against various linux distros & glibc versions. + * I suspect that certain libc functions might involve slightly + * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. + * + * - Test on any arch other than amd64 to see if it works there. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +/* + * I couldn't find a good way to do a conditional OBJS += seccomp.o in + * the Makefile, so this file is compiled even when seccomp is disabled, + * it's just empty in that case. 
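+ *
+ * The public entry point is seccomp_load_rules(), declared in
+ * neon_seccomp.h. A minimal, purely illustrative caller would look like
+ * this (the real allowlist lives in enter_seccomp_mode() in walredoproc.c):
+ *
+ *     PgSeccompRule rules[] = {
+ *         PG_SCMP_ALLOW(read),
+ *         PG_SCMP_ALLOW(write),
+ *         PG_SCMP_ALLOW(exit_group),
+ *     };
+ *     seccomp_load_rules(rules, lengthof(rules));
+ *
+ * After this call, any syscall outside the list raises SIGSYS and the
+ * process exits via the deny handler below.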
+ */ +#ifdef HAVE_LIBSECCOMP + +#include +#include + +#include "miscadmin.h" + +#include "neon_seccomp.h" + +static void die(int code, const char *str); + +static bool seccomp_test_sighandler_done = false; +static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); +static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); + +static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); + +void +seccomp_load_rules(PgSeccompRule *rules, int count) +{ + struct sigaction action = { .sa_flags = SA_SIGINFO }; + PgSeccompRule rule; + long fd; + + /* + * Install a test signal handler. + * XXX: pqsignal() is too restrictive for our purposes, + * since we'd like to examine the contents of siginfo_t. + */ + action.sa_sigaction = seccomp_test_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install test SIGSYS handler"))); + + /* + * First, check that open of a well-known file works. + * XXX: We use raw syscall() to call the very open(). + */ + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: signal handler test flag was set unexpectedly"))); + if (fd < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); + close((int) fd); + + /* Set a trap on open() to test seccomp bpf */ + rule = PG_SCMP(open, SCMP_ACT_TRAP); + if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not load test trap"))); + + /* Finally, check that open() now raises SIGSYS */ + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (!seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: SIGSYS handler doesn't seem to work"))); + + /* Now that everything seems to work, install a proper handler */ + action.sa_sigaction = seccomp_deny_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install SIGSYS handler"))); + + /* If this succeeds, any syscall not in the list will crash the process */ + if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not enter seccomp mode"))); +} + +/* + * Enter seccomp mode with a BPF filter that will only allow + * certain syscalls to proceed. + */ +static int +do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) +{ + scmp_filter_ctx ctx; + int rc = -1; + + /* Create a context with a default action for syscalls not in the list */ + if ((ctx = seccomp_init(def_action)) == NULL) + goto cleanup; + + for (int i = 0; i < count; i++) + { + PgSeccompRule *rule = &rules[i]; + if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) + goto cleanup; + } + + /* Try building & loading the program into the kernel */ + if ((rc = seccomp_load(ctx)) != 0) + goto cleanup; + +cleanup: + /* + * We don't need the context anymore regardless of the result, + * since either we failed or the eBPF program has already been + * loaded into the linux kernel. 
+ */ + seccomp_release(ctx); + return rc; +} + +static void +die(int code, const char *str) +{ + /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ + ssize_t _unused pg_attribute_unused(); + /* Best effort write to stderr */ + _unused = write(fileno(stderr), str, strlen(str)); + + /* XXX: we don't want to run any atexit callbacks */ + _exit(code); +} + +static void +seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ +#define DIE_PREFIX "seccomp test signal handler: " + + /* Check that this signal handler is used only for a single test case */ + if (seccomp_test_sighandler_done) + die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); + seccomp_test_sighandler_done = true; + + if (signum != SIGSYS) + die(1, DIE_PREFIX "bad signal number\n"); + + /* TODO: maybe somehow extract the hardcoded syscall number */ + if (info->si_syscall != SCMP_SYS(open)) + die(1, DIE_PREFIX "bad syscall number\n"); + +#undef DIE_PREFIX +} + +static void +seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ + /* + * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() + * to resolve the syscall's name, since it calls strdup() + * under the hood (wtf!). + */ + char buffer[128]; + (void)snprintf(buffer, lengthof(buffer), + "---------------------------------------\n" + "seccomp: bad syscall %d\n" + "---------------------------------------\n", + info->si_syscall); + + /* + * Instead of silently crashing the process with + * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, + * we'd like to receive a real SIGSYS to print the + * message and *then* immediately exit. + */ + die(1, buffer); +} + +#endif /* HAVE_LIBSECCOMP */ diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c new file mode 100644 index 0000000000..ffbfca5a40 --- /dev/null +++ b/pgxn/neon_walredo/walredoproc.c @@ -0,0 +1,847 @@ +/*------------------------------------------------------------------------- + * + * walredoproc.c + * Entry point for WAL redo helper + * + * + * This file contains an alternative main() function for the 'postgres' + * binary. In the special mode, we go into a special mode that's similar + * to the single user mode. We don't launch postmaster or any auxiliary + * processes. Instead, we wait for command from 'stdin', and respond to + * 'stdout'. + * + * The protocol through stdin/stdout is loosely based on the libpq protocol. + * The process accepts messages through stdin, and each message has the format: + * + * char msgtype; + * int32 length; // length of message including 'length' but excluding + * // 'msgtype', in network byte order + * + * + * There are three message types: + * + * BeginRedoForBlock ('B'): Prepare for WAL replay for given block + * PushPage ('P'): Copy a page image (in the payload) to buffer cache + * ApplyRecord ('A'): Apply a WAL record (in the payload) + * GetPage ('G'): Return a page image from buffer cache. + * + * Currently, you only get a response to GetPage requests; the response is + * simply a 8k page, without any headers. Errors are logged to stderr. + * + * FIXME: + * - this currently requires a valid PGDATA, and creates a lock file there + * like a normal postmaster. There's no fundamental reason for that, though. + * - should have EndRedoForBlock, and flush page cache, to allow using this + * mechanism for more than one block without restarting the process. 
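+ *
+ * For illustration only (the requesting side is not part of this file), a
+ * GetPage request would be framed like this; all integers are in network
+ * byte order, and 'length' counts itself but not the message type byte:
+ *
+ *     char   msgtype = 'G';
+ *     int32  length  = 4 + 1 + 4 * 4;   (length + forknum + four int32s)
+ *     uint8  forknum;
+ *     int32  spcNode, dbNode, relNode;
+ *     int32  blockNumber;
+ *
+ * The response to 'G' is exactly BLCKSZ bytes of raw page data on stdout.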
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_SELECT_H +#include +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include +#include +#endif + +#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) +#define MALLOC_NO_MMAP +#include +#endif + +#ifndef HAVE_GETRUSAGE +#include "rusagestub.h" +#endif + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif +#include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" + +#include "inmem_smgr.h" + +#ifdef HAVE_LIBSECCOMP +#include "neon_seccomp.h" +#endif + +PG_MODULE_MAGIC; + +static int ReadRedoCommand(StringInfo inBuf); +static void BeginRedoForBlock(StringInfo input_message); +static void PushPage(StringInfo input_message); +static void ApplyRecord(StringInfo input_message); +static void apply_error_callback(void *arg); +static bool redo_block_filter(XLogReaderState *record, uint8 block_id); +static void GetPage(StringInfo input_message); +static ssize_t buffered_read(void *buf, size_t count); + +static BufferTag target_redo_tag; + +static XLogReaderState *reader_state; + +#define TRACE DEBUG5 + +#ifdef HAVE_LIBSECCOMP +static void +enter_seccomp_mode(void) +{ + PgSeccompRule syscalls[] = + { + /* Hard requirements */ + PG_SCMP_ALLOW(exit_group), + PG_SCMP_ALLOW(pselect6), + PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(select), + PG_SCMP_ALLOW(write), + + /* Memory allocation */ + PG_SCMP_ALLOW(brk), +#ifndef MALLOC_NO_MMAP + /* TODO: musl doesn't have mallopt */ + PG_SCMP_ALLOW(mmap), + PG_SCMP_ALLOW(munmap), +#endif + /* + * getpid() is called on assertion failure, in ExceptionalCondition. + * It's not really needed, but seems pointless to hide it either. The + * system call unlikely to expose a kernel vulnerability, and the PID + * is stored in MyProcPid anyway. + */ + PG_SCMP_ALLOW(getpid), + + /* Enable those for a proper shutdown. + PG_SCMP_ALLOW(munmap), + PG_SCMP_ALLOW(shmctl), + PG_SCMP_ALLOW(shmdt), + PG_SCMP_ALLOW(unlink), // shm_unlink + */ + }; + +#ifdef MALLOC_NO_MMAP + /* Ask glibc not to use mmap() */ + mallopt(M_MMAP_MAX, 0); +#endif + + seccomp_load_rules(syscalls, lengthof(syscalls)); +} +#endif /* HAVE_LIBSECCOMP */ + +/* + * Entry point for the WAL redo process. + * + * Performs similar initialization as PostgresMain does for normal + * backend processes. Some initialization was done in CallExtMain + * already. + */ +void +WalRedoMain(int argc, char *argv[]) +{ + int firstchar; + StringInfoData input_message; +#ifdef HAVE_LIBSECCOMP + bool enable_seccomp; +#endif + + am_wal_redo_postgres = true; + + /* + * WAL redo does not need a large number of buffers. And speed of + * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * buffers. 
So let's keep it small (default value is 1024) + */ + num_temp_buffers = 4; + + /* + * install the simple in-memory smgr + */ + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + + /* + * Validate we have been given a reasonable-looking DataDir and change into it. + */ + checkDataDir(); + ChangeToDataDir(); + + /* + * Create lockfile for data directory. + */ + CreateDataDirLockFile(false); + + /* read control file (error checking and contains config ) */ + LocalProcessControlFile(false); + + /* + * process any libraries that should be preloaded at postmaster start + */ + process_shared_preload_libraries(); + + /* Initialize MaxBackends (if under postmaster, was done already) */ + InitializeMaxBackends(); + +#if PG_VERSION_NUM >= 150000 + /* + * Give preloaded libraries a chance to request additional shared memory. + */ + process_shmem_requests(); + + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + + /* + * Now that modules have been loaded, we can process any custom resource + * managers specified in the wal_consistency_checking GUC. + */ + InitializeWalConsistencyChecking(); +#endif + + CreateSharedMemoryAndSemaphores(); + + /* + * Remember stand-alone backend startup time,roughly at the same point + * during startup that postmaster does so. + */ + PgStartTime = GetCurrentTimestamp(); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do + * this before we can use LWLocks. + */ + InitAuxiliaryProcess(); + + SetProcessingMode(NormalProcessing); + + /* Redo routines won't work if we're not "in recovery" */ + InRecovery = true; + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* we need a ResourceOwner to hold buffer pins */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); + + /* Initialize resource managers */ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); + +#ifdef HAVE_LIBSECCOMP + /* We prefer opt-out to opt-in for greater security */ + enable_seccomp = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--disable-seccomp") == 0) + enable_seccomp = false; + + /* + * We deliberately delay the transition to the seccomp mode + * until it's time to enter the main processing loop; + * else we'd have to add a lot more syscalls to the allowlist. + */ + if (enable_seccomp) + enter_seccomp_mode(); +#endif /* HAVE_LIBSECCOMP */ + + /* + * Main processing loop + */ + MemoryContextSwitchTo(MessageContext); + initStringInfo(&input_message); + + for (;;) + { + /* Release memory left over from prior query cycle. 
*/ + resetStringInfo(&input_message); + + set_ps_display("idle"); + + /* + * (3) read a command (loop blocks here) + */ + firstchar = ReadRedoCommand(&input_message); + switch (firstchar) + { + case 'B': /* BeginRedoForBlock */ + BeginRedoForBlock(&input_message); + break; + + case 'P': /* PushPage */ + PushPage(&input_message); + break; + + case 'A': /* ApplyRecord */ + ApplyRecord(&input_message); + break; + + case 'G': /* GetPage */ + GetPage(&input_message); + break; + + /* + * EOF means we're done. Perform normal shutdown. + */ + case EOF: + ereport(LOG, + (errmsg("received EOF on stdin, shutting down"))); + +#ifdef HAVE_LIBSECCOMP + /* + * Skip the shutdown sequence, leaving some garbage behind. + * Hopefully, postgres will clean it up in the next run. + * This way we don't have to enable extra syscalls, which is nice. + * See enter_seccomp_mode() above. + */ + if (enable_seccomp) + _exit(0); +#endif /* HAVE_LIBSECCOMP */ + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid frontend message type %d", + firstchar))); + } + } /* end of input-reading loop */ +} + + +/* Version compatility wrapper for ReadBufferWithoutRelcache */ +static inline Buffer +NeonRedoReadBuffer(RelFileNode rnode, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode) +{ +#if PG_VERSION_NUM >= 150000 + return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + NULL, /* no strategy */ + true); /* WAL redo is only performed on permanent rels */ +#else + return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode, + NULL); /* no strategy */ +#endif +} + + +/* + * Some debug function that may be handy for now. + */ +pg_attribute_unused() +static char * +pprint_buffer(char *data, int len) +{ + StringInfoData s; + + initStringInfo(&s); + appendStringInfo(&s, "\n"); + for (int i = 0; i < len; i++) { + + appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); + if (i % 32 == 31) { + appendStringInfo(&s, "\n"); + } + } + appendStringInfo(&s, "\n"); + + return s.data; +} + +/* ---------------------------------------------------------------- + * routines to obtain user input + * ---------------------------------------------------------------- + */ + +/* + * Read next command from the client. + * + * the string entered by the user is placed in its parameter inBuf, + * and we act like a Q message was received. + * + * EOF is returned if end-of-file input is seen; time to shut down. 
+ * ---------------- + */ +static int +ReadRedoCommand(StringInfo inBuf) +{ + ssize_t ret; + char hdr[1 + sizeof(int32)]; + int qtype; + int32 len; + + /* Read message type and message length */ + ret = buffered_read(hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) + { + if (ret == 0) + return EOF; + else if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message header: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + + qtype = hdr[0]; + memcpy(&len, &hdr[1], sizeof(int32)); + len = pg_ntoh32(len); + + if (len < 4) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + + len -= 4; /* discount length itself */ + + /* Read the message payload */ + enlargeStringInfo(inBuf, len); + ret = buffered_read(inBuf->data, len); + if (ret != len) + { + if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + inBuf->len = len; + inBuf->data[len] = '\0'; + + return qtype; +} + +/* + * Prepare for WAL replay on given block + */ +static void +BeginRedoForBlock(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + SMgrRelation reln; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + wal_redo_buffer = InvalidBuffer; + + INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + + elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", + target_redo_tag.rnode.spcNode, + target_redo_tag.rnode.dbNode, + target_redo_tag.rnode.relNode, + target_redo_tag.forkNum, + target_redo_tag.blockNum); + + reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || + reln->smgr_cached_nblocks[forknum] < blknum + 1) + { + reln->smgr_cached_nblocks[forknum] = blknum + 1; + } +} + +/* + * Receive a page given by the client, and put it into buffer cache. + */ +static void +PushPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + const char *content; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + * 8k page content + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + content = pq_getmsgbytes(input_message, BLCKSZ); + + buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK); + wal_redo_buffer = buf; + page = BufferGetPage(buf); + memcpy(page, content, BLCKSZ); + MarkBufferDirty(buf); /* pro forma */ + UnlockReleaseBuffer(buf); +} + +/* + * Receive a WAL record, and apply it. + * + * All the pages should be loaded into the buffer cache by PushPage calls already. 
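+ *
+ * For illustration, the payload of an 'A' message is simply:
+ *
+ *     int64      lsn;       (end LSN of the record, network byte order)
+ *     XLogRecord record;    (header immediately followed by its data,
+ *                            xl_tot_len bytes in total)
+ *
+ * and, per the note in the code below, the record bytes must start at an
+ * aligned offset within the message buffer.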
+ */ +static void +ApplyRecord(StringInfo input_message) +{ + char *errormsg; + XLogRecPtr lsn; + XLogRecord *record; + int nleft; + ErrorContextCallback errcallback; +#if PG_VERSION_NUM >= 150000 + DecodedXLogRecord *decoded; +#endif + + /* + * message format: + * + * LSN (the *end* of the record) + * record + */ + lsn = pq_getmsgint64(input_message); + + smgrinit(); /* reset inmem smgr state */ + + /* note: the input must be aligned here */ + record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); + + nleft = input_message->len - input_message->cursor; + if (record->xl_tot_len != sizeof(XLogRecord) + nleft) + elog(ERROR, "mismatch between record (%d) and message size (%d)", + record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + + /* Setup error traceback support for ereport() */ + errcallback.callback = apply_error_callback; + errcallback.arg = (void *) reader_state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + XLogBeginRead(reader_state, lsn); + +#if PG_VERSION_NUM >= 150000 + decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + + if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); + else + { + /* Record the location of the next record. */ + decoded->next_lsn = reader_state->NextRecPtr; + + /* + * If it's in the decode buffer, mark the decode buffer space as + * occupied. + */ + if (!decoded->oversized) + { + /* The new decode buffer head must be MAXALIGNed. */ + Assert(decoded->size == MAXALIGN(decoded->size)); + if ((char *) decoded == reader_state->decode_buffer) + reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; + else + reader_state->decode_buffer_tail += decoded->size; + } + + /* Insert it into the queue of decoded records. */ + Assert(reader_state->decode_queue_tail != decoded); + if (reader_state->decode_queue_tail) + reader_state->decode_queue_tail->next = decoded; + reader_state->decode_queue_tail = decoded; + if (!reader_state->decode_queue_head) + reader_state->decode_queue_head = decoded; + + /* + * Update the pointers to the beginning and one-past-the-end of this + * record, again for the benefit of historical code that expected the + * decoder to track this rather than accessing these fields of the record + * itself. + */ + reader_state->record = reader_state->decode_queue_head; + reader_state->ReadRecPtr = reader_state->record->lsn; + reader_state->EndRecPtr = reader_state->record->next_lsn; + } +#else + /* + * In lieu of calling XLogReadRecord, store the record 'decoded_record' + * buffer directly. + */ + reader_state->ReadRecPtr = lsn; + reader_state->decoded_record = record; + if (!DecodeXLogRecord(reader_state, record, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); +#endif + + /* Ignore any other blocks than the ones the caller is interested in */ + redo_read_buffer_filter = redo_block_filter; + + RmgrTable[record->xl_rmid].rm_redo(reader_state); + + /* + * If no base image of the page was provided by PushPage, initialize + * wal_redo_buffer here. The first WAL record must initialize the page + * in that case. 
+ */ + if (BufferIsInvalid(wal_redo_buffer)) + { + wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode, + target_redo_tag.forkNum, + target_redo_tag.blockNum, + RBM_NORMAL); + Assert(!BufferIsInvalid(wal_redo_buffer)); + ReleaseBuffer(wal_redo_buffer); + } + + redo_read_buffer_filter = NULL; + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + elog(TRACE, "applied WAL record with LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); +#if PG_VERSION_NUM >= 150000 + if (decoded && decoded->oversized) + pfree(decoded); +#endif +} + +/* + * Error context callback for errors occurring during ApplyRecord + */ +static void +apply_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + + + +static bool +redo_block_filter(XLogReaderState *record, uint8 block_id) +{ + BufferTag target_tag; + +#if PG_VERSION_NUM >= 150000 + XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); +#else + if (!XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } +#endif + + /* + * Can a WAL redo function ever access a relation other than the one that + * it modifies? I don't see why it would. + */ + if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", + target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + + /* + * If this block isn't one we are currently restoring, then return 'true' + * so that this gets ignored + */ + return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); +} + +/* + * Get a page image back from buffer cache. + * + * After applying some records. 
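+ *
+ * For illustration, a typical exchange to reconstruct one page is:
+ *
+ *     'B'  BeginRedoForBlock   (declare the target rel/fork/block)
+ *     'P'  PushPage            (optional base image of that block)
+ *     'A'  ApplyRecord         (repeated, one per WAL record)
+ *     'G'  GetPage             (returns the BLCKSZ-byte result on stdout)
+ *
+ * Only the 'G' request produces a response.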
+ */ +static void +GetPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + Buffer buf; + Page page; + int tot_written; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ + + buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL); + Assert(buf == wal_redo_buffer); + page = BufferGetPage(buf); + /* single thread, so don't bother locking the page */ + + /* Response: Page content */ + tot_written = 0; + do { + ssize_t rc; + + rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); + + ReleaseBuffer(buf); + DropRelFileNodeAllLocalBuffers(rnode); + wal_redo_buffer = InvalidBuffer; + + elog(TRACE, "Page sent back for block %u", blknum); +} + + +/* Buffer used by buffered_read() */ +static char stdin_buf[16 * 1024]; +static size_t stdin_len = 0; /* # of bytes in buffer */ +static size_t stdin_ptr = 0; /* # of bytes already consumed */ + +/* + * Like read() on stdin, but buffered. + * + * We cannot use libc's buffered fread(), because it uses syscalls that we + * have disabled with seccomp(). Depending on the platform, it can call + * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' + * seems problematic because it allows interrogating files by path name. + * + * The return value is the number of bytes read. On error, -1 is returned, and + * errno is set appropriately. Unlike read(), this fills the buffer completely + * unless an error happens or EOF is reached. 
+ */ +static ssize_t +buffered_read(void *buf, size_t count) +{ + char *dst = buf; + + while (count > 0) + { + size_t nthis; + + if (stdin_ptr == stdin_len) + { + ssize_t ret; + + ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); + if (ret < 0) + { + /* don't do anything here that could set 'errno' */ + return ret; + } + if (ret == 0) + { + /* EOF */ + break; + } + stdin_len = (size_t) ret; + stdin_ptr = 0; + } + nthis = Min(stdin_len - stdin_ptr, count); + + memcpy(dst, &stdin_buf[stdin_ptr], nthis); + + stdin_ptr += nthis; + count -= nthis; + dst += nthis; + } + + return (dst - (char *) buf); +} diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list new file mode 100644 index 0000000000..760f384212 --- /dev/null +++ b/pgxn/typedefs.list @@ -0,0 +1,3776 @@ +ACCESS_ALLOWED_ACE +ACL +ACL_SIZE_INFORMATION +AFFIX +ASN1_INTEGER +ASN1_OBJECT +ASN1_STRING +AV +A_ArrayExpr +A_Const +A_Expr +A_Expr_Kind +A_Indices +A_Indirection +A_Star +AbsoluteTime +AccessMethodInfo +AccessPriv +Acl +AclItem +AclMaskHow +AclMode +AclResult +AcquireSampleRowsFunc +ActionList +ActiveSnapshotElt +AddForeignUpdateTargets_function +AffixNode +AffixNodeData +AfterTriggerEvent +AfterTriggerEventChunk +AfterTriggerEventData +AfterTriggerEventList +AfterTriggerShared +AfterTriggerSharedData +AfterTriggersData +AfterTriggersQueryData +AfterTriggersTableData +AfterTriggersTransData +Agg +AggClauseCosts +AggInfo +AggPath +AggSplit +AggState +AggStatePerAgg +AggStatePerGroup +AggStatePerHash +AggStatePerPhase +AggStatePerTrans +AggStrategy +AggTransInfo +Aggref +AggregateInstrumentation +AlenState +Alias +AllocBlock +AllocChunk +AllocPointer +AllocSet +AllocSetContext +AllocSetFreeList +AllocateDesc +AllocateDescKind +AlterCollationStmt +AlterDatabaseSetStmt +AlterDatabaseStmt +AlterDefaultPrivilegesStmt +AlterDomainStmt +AlterEnumStmt +AlterEventTrigStmt +AlterExtensionContentsStmt +AlterExtensionStmt +AlterFdwStmt +AlterForeignServerStmt +AlterFunctionStmt +AlterObjectDependsStmt +AlterObjectSchemaStmt +AlterOpFamilyStmt +AlterOperatorStmt +AlterOwnerStmt +AlterPolicyStmt +AlterPublicationStmt +AlterRoleSetStmt +AlterRoleStmt +AlterSeqStmt +AlterStatsStmt +AlterSubscriptionStmt +AlterSubscriptionType +AlterSystemStmt +AlterTSConfigType +AlterTSConfigurationStmt +AlterTSDictionaryStmt +AlterTableCmd +AlterTableMoveAllStmt +AlterTableSpaceOptionsStmt +AlterTableStmt +AlterTableType +AlterTableUtilityContext +AlterTypeRecurseParams +AlterTypeStmt +AlterUserMappingStmt +AlteredTableInfo +AlternativeSubPlan +AmcheckOptions +AnalyzeAttrComputeStatsFunc +AnalyzeAttrFetchFunc +AnalyzeForeignTable_function +AnlExprData +AnlIndexData +AnyArrayType +Append +AppendPath +AppendRelInfo +AppendState +ApplyExecutionData +ApplySubXactData +Archive +ArchiveEntryPtrType +ArchiveFormat +ArchiveHandle +ArchiveMode +ArchiveOpts +ArchiverOutput +ArchiverStage +ArrayAnalyzeExtraData +ArrayBuildState +ArrayBuildStateAny +ArrayBuildStateArr +ArrayCoerceExpr +ArrayConstIterState +ArrayExpr +ArrayExprIterState +ArrayIOData +ArrayIterator +ArrayMapState +ArrayMetaState +ArrayParseState +ArraySubWorkspace +ArrayType +AsyncQueueControl +AsyncQueueEntry +AsyncRequest +AttInMetadata +AttStatsSlot +AttoptCacheEntry +AttoptCacheKey +AttrDefInfo +AttrDefault +AttrMap +AttrMissing +AttrNumber +AttributeOpts +AuthRequest +AutoPrewarmSharedState +AutoVacOpts +AutoVacuumShmemStruct +AutoVacuumWorkItem +AutoVacuumWorkItemType +AuxProcType +BF_ctx +BF_key +BF_word +BF_word_signed +BIGNUM +BIO +BIO_METHOD +BITVECP +BMS_Comparison +BMS_Membership +BN_CTX +BOOL 
+BOOLEAN +BOX +BTArrayKeyInfo +BTBuildState +BTCycleId +BTDedupInterval +BTDedupState +BTDedupStateData +BTDeletedPageData +BTIndexStat +BTInsertState +BTInsertStateData +BTLeader +BTMetaPageData +BTOneVacInfo +BTOptions +BTPS_State +BTPageOpaque +BTPageOpaqueData +BTPageStat +BTPageState +BTParallelScanDesc +BTPendingFSM +BTScanInsert +BTScanInsertData +BTScanOpaque +BTScanOpaqueData +BTScanPos +BTScanPosData +BTScanPosItem +BTShared +BTSortArrayContext +BTSpool +BTStack +BTStackData +BTVacInfo +BTVacState +BTVacuumPosting +BTVacuumPostingData +BTWriteState +BUF_MEM +BYTE +BY_HANDLE_FILE_INFORMATION +Backend +BackendId +BackendParameters +BackendState +BackendType +BackgroundWorker +BackgroundWorkerArray +BackgroundWorkerHandle +BackgroundWorkerSlot +Barrier +BaseBackupCmd +BeginDirectModify_function +BeginForeignInsert_function +BeginForeignModify_function +BeginForeignScan_function +BeginSampleScan_function +BernoulliSamplerData +BgWorkerStartTime +BgwHandleStatus +BinaryArithmFunc +BindParamCbData +BipartiteMatchState +BitmapAnd +BitmapAndPath +BitmapAndState +BitmapHeapPath +BitmapHeapScan +BitmapHeapScanState +BitmapIndexScan +BitmapIndexScanState +BitmapOr +BitmapOrPath +BitmapOrState +Bitmapset +BlobInfo +Block +BlockId +BlockIdData +BlockInfoRecord +BlockNumber +BlockSampler +BlockSamplerData +BlockedProcData +BlockedProcsData +BloomBuildState +BloomFilter +BloomMetaPageData +BloomOpaque +BloomOptions +BloomPageOpaque +BloomPageOpaqueData +BloomScanOpaque +BloomScanOpaqueData +BloomSignatureWord +BloomState +BloomTuple +BlowfishContext +BoolAggState +BoolExpr +BoolExprType +BoolTestType +BooleanTest +BpChar +BrinBuildState +BrinDesc +BrinMemTuple +BrinMetaPageData +BrinOpaque +BrinOpcInfo +BrinOptions +BrinRevmap +BrinSpecialSpace +BrinStatsData +BrinTuple +BrinValues +BtreeCheckState +BtreeLevel +Bucket +BufFile +Buffer +BufferAccessStrategy +BufferAccessStrategyType +BufferCachePagesContext +BufferCachePagesRec +BufferDesc +BufferDescPadded +BufferHeapTupleTableSlot +BufferLookupEnt +BufferStrategyControl +BufferTag +BufferUsage +BuildAccumulator +BuiltinScript +BulkInsertState +BulkInsertStateData +CACHESIGN +CAC_state +CCFastEqualFN +CCHashFN +CEOUC_WAIT_MODE +CFuncHashTabEntry +CHAR +CHECKPOINT +CHKVAL +CIRCLE +CMPDAffix +CONTEXT +COP +CRITICAL_SECTION +CRSSnapshotAction +CState +CTECycleClause +CTEMaterialize +CTESearchClause +CV +CachedExpression +CachedPlan +CachedPlanSource +CallContext +CallStmt +CancelRequestPacket +CaseExpr +CaseTestExpr +CaseWhen +Cash +CastInfo +CatCList +CatCTup +CatCache +CatCacheHeader +CatalogId +CatalogIndexState +ChangeVarNodes_context +CheckPoint +CheckPointStmt +CheckpointStatsData +CheckpointerRequest +CheckpointerShmemStruct +Chromosome +CkptSortItem +CkptTsStatus +ClientAuthentication_hook_type +ClientCertMode +ClientCertName +ClientData +ClonePtrType +ClosePortalStmt +ClosePtrType +Clump +ClusterInfo +ClusterParams +ClusterStmt +CmdType +CoalesceExpr +CoerceParamHook +CoerceToDomain +CoerceToDomainValue +CoerceViaIO +CoercionContext +CoercionForm +CoercionPathType +CollAliasData +CollInfo +CollateClause +CollateExpr +CollateStrength +CollectedATSubcmd +CollectedCommand +CollectedCommandType +ColorTrgm +ColorTrgmInfo +ColumnCompareData +ColumnDef +ColumnIOData +ColumnRef +ColumnsHashData +CombinationGenerator +ComboCidEntry +ComboCidEntryData +ComboCidKey +ComboCidKeyData +Command +CommandDest +CommandId +CommandTag +CommandTagBehavior +CommentItem +CommentStmt +CommitTimestampEntry +CommitTimestampShared +CommonEntry +CommonTableExpr 
+CompareScalarsContext +CompiledExprState +CompositeIOData +CompositeTypeStmt +CompoundAffixFlag +CompressionAlgorithm +CompressorState +ComputeXidHorizonsResult +ConditionVariable +ConditionVariableMinimallyPadded +ConditionalStack +ConfigData +ConfigVariable +ConnCacheEntry +ConnCacheKey +ConnParams +ConnStatusType +ConnType +ConnectionStateEnum +ConnsAllowedState +ConsiderSplitContext +Const +ConstrCheck +ConstrType +Constraint +ConstraintCategory +ConstraintInfo +ConstraintsSetStmt +ControlData +ControlFileData +ConvInfo +ConvProcInfo +ConversionLocation +ConvertRowtypeExpr +CookedConstraint +CopyDest +CopyFormatOptions +CopyFromState +CopyFromStateData +CopyInsertMethod +CopyMultiInsertBuffer +CopyMultiInsertInfo +CopySource +CopyStmt +CopyToState +CopyToStateData +Cost +CostSelector +Counters +CoverExt +CoverPos +CreateAmStmt +CreateCastStmt +CreateConversionStmt +CreateDomainStmt +CreateEnumStmt +CreateEventTrigStmt +CreateExtensionStmt +CreateFdwStmt +CreateForeignServerStmt +CreateForeignTableStmt +CreateFunctionStmt +CreateOpClassItem +CreateOpClassStmt +CreateOpFamilyStmt +CreatePLangStmt +CreatePolicyStmt +CreatePublicationStmt +CreateRangeStmt +CreateReplicationSlotCmd +CreateRoleStmt +CreateSchemaStmt +CreateSchemaStmtContext +CreateSeqStmt +CreateStatsStmt +CreateStmt +CreateStmtContext +CreateSubscriptionStmt +CreateTableAsStmt +CreateTableSpaceStmt +CreateTransformStmt +CreateTrigStmt +CreateUserMappingStmt +CreatedbStmt +CredHandle +CteItem +CteScan +CteScanState +CteState +CtlCommand +CtxtHandle +CurrentOfExpr +CustomExecMethods +CustomOutPtrType +CustomPath +CustomScan +CustomScanMethods +CustomScanState +CycleCtr +DBState +DCHCacheEntry +DEADLOCK_INFO +DECountItem +DH +DIR +DNSServiceErrorType +DNSServiceRef +DR_copy +DR_intorel +DR_printtup +DR_sqlfunction +DR_transientrel +DSA +DWORD +DataDumperPtr +DataPageDeleteStack +DatabaseInfo +DateADT +Datum +DatumTupleFields +DbInfo +DbInfoArr +DeClonePtrType +DeadLockState +DeallocateStmt +DeclareCursorStmt +DecodedBkpBlock +DecodingOutputState +DefElem +DefElemAction +DefaultACLInfo +DefineStmt +DeleteStmt +DependencyGenerator +DependencyGeneratorData +DependencyType +DestReceiver +DictISpell +DictInt +DictSimple +DictSnowball +DictSubState +DictSyn +DictThesaurus +DimensionInfo +DirectoryMethodData +DirectoryMethodFile +DisableTimeoutParams +DiscardMode +DiscardStmt +DistanceValue +DistinctExpr +DoStmt +DocRepresentation +DomainConstraintCache +DomainConstraintRef +DomainConstraintState +DomainConstraintType +DomainIOData +DropBehavior +DropOwnedStmt +DropReplicationSlotCmd +DropRoleStmt +DropStmt +DropSubscriptionStmt +DropTableSpaceStmt +DropUserMappingStmt +DropdbStmt +DumpComponents +DumpId +DumpOptions +DumpSignalInformation +DumpableObject +DumpableObjectType +DynamicFileList +DynamicZoneAbbrev +EC_KEY +EDGE +ENGINE +EOM_flatten_into_method +EOM_get_flat_size_method +EPQState +EPlan +EState +EVP_CIPHER +EVP_CIPHER_CTX +EVP_MD +EVP_MD_CTX +EVP_PKEY +EachState +Edge +EditableObjectType +ElementsState +EnableTimeoutParams +EndBlobPtrType +EndBlobsPtrType +EndDataPtrType +EndDirectModify_function +EndForeignInsert_function +EndForeignModify_function +EndForeignScan_function +EndSampleScan_function +EnumItem +EolType +EphemeralNameRelationType +EphemeralNamedRelation +EphemeralNamedRelationData +EphemeralNamedRelationMetadata +EphemeralNamedRelationMetadataData +EquivalenceClass +EquivalenceMember +ErrorContextCallback +ErrorData +EstimateDSMForeignScan_function +EstimationInfo +EventTriggerCacheEntry 
+EventTriggerCacheItem +EventTriggerCacheStateType +EventTriggerData +EventTriggerEvent +EventTriggerInfo +EventTriggerQueryState +ExceptionLabelMap +ExceptionMap +ExclusiveBackupState +ExecAuxRowMark +ExecEvalBoolSubroutine +ExecEvalSubroutine +ExecForeignBatchInsert_function +ExecForeignDelete_function +ExecForeignInsert_function +ExecForeignTruncate_function +ExecForeignUpdate_function +ExecParallelEstimateContext +ExecParallelInitializeDSMContext +ExecPhraseData +ExecProcNodeMtd +ExecRowMark +ExecScanAccessMtd +ExecScanRecheckMtd +ExecStatus +ExecStatusType +ExecuteStmt +ExecutorCheckPerms_hook_type +ExecutorEnd_hook_type +ExecutorFinish_hook_type +ExecutorRun_hook_type +ExecutorStart_hook_type +ExpandedArrayHeader +ExpandedObjectHeader +ExpandedObjectMethods +ExpandedRange +ExpandedRecordFieldInfo +ExpandedRecordHeader +ExplainDirectModify_function +ExplainForeignModify_function +ExplainForeignScan_function +ExplainFormat +ExplainOneQuery_hook_type +ExplainState +ExplainStmt +ExplainWorkersState +ExportedSnapshot +Expr +ExprContext +ExprContextCallbackFunction +ExprContext_CB +ExprDoneCond +ExprEvalOp +ExprEvalOpLookup +ExprEvalRowtypeCache +ExprEvalStep +ExprState +ExprStateEvalFunc +ExtensibleNode +ExtensibleNodeEntry +ExtensibleNodeMethods +ExtensionControlFile +ExtensionInfo +ExtensionMemberId +ExtensionVersionInfo +FDWCollateState +FD_SET +FILE +FILETIME +FILE_INFORMATION_CLASS +FILE_STANDARD_INFORMATION +FSMAddress +FSMPage +FSMPageData +FakeRelCacheEntry +FakeRelCacheEntryData +FastPathStrongRelationLockData +FdwInfo +FdwRoutine +FetchDirection +FetchStmt +FieldSelect +FieldStore +File +FileFdwExecutionState +FileFdwPlanState +FileNameMap +FileTag +FinalPathExtraData +FindColsContext +FindSplitData +FindSplitStrat +FixedParallelExecutorState +FixedParallelState +FixedParamState +FlagMode +FlushPosition +FmgrBuiltin +FmgrHookEventType +FmgrInfo +ForBothCellState +ForBothState +ForEachState +ForFiveState +ForFourState +ForThreeState +ForeignAsyncConfigureWait_function +ForeignAsyncNotify_function +ForeignAsyncRequest_function +ForeignDataWrapper +ForeignKeyCacheInfo +ForeignKeyOptInfo +ForeignPath +ForeignScan +ForeignScanState +ForeignServer +ForeignServerInfo +ForeignTable +ForeignTruncateInfo +ForkNumber +FormData_pg_aggregate +FormData_pg_am +FormData_pg_amop +FormData_pg_amproc +FormData_pg_attrdef +FormData_pg_attribute +FormData_pg_auth_members +FormData_pg_authid +FormData_pg_cast +FormData_pg_class +FormData_pg_collation +FormData_pg_constraint +FormData_pg_conversion +FormData_pg_database +FormData_pg_default_acl +FormData_pg_depend +FormData_pg_enum +FormData_pg_event_trigger +FormData_pg_extension +FormData_pg_foreign_data_wrapper +FormData_pg_foreign_server +FormData_pg_foreign_table +FormData_pg_index +FormData_pg_inherits +FormData_pg_language +FormData_pg_largeobject +FormData_pg_largeobject_metadata +FormData_pg_namespace +FormData_pg_opclass +FormData_pg_operator +FormData_pg_opfamily +FormData_pg_partitioned_table +FormData_pg_policy +FormData_pg_proc +FormData_pg_publication +FormData_pg_publication_rel +FormData_pg_range +FormData_pg_replication_origin +FormData_pg_rewrite +FormData_pg_sequence +FormData_pg_sequence_data +FormData_pg_shdepend +FormData_pg_statistic +FormData_pg_statistic_ext +FormData_pg_subscription +FormData_pg_subscription_rel +FormData_pg_tablespace +FormData_pg_transform +FormData_pg_trigger +FormData_pg_ts_config +FormData_pg_ts_config_map +FormData_pg_ts_dict +FormData_pg_ts_parser +FormData_pg_ts_template +FormData_pg_type 
+FormData_pg_user_mapping +Form_pg_aggregate +Form_pg_am +Form_pg_amop +Form_pg_amproc +Form_pg_attrdef +Form_pg_attribute +Form_pg_auth_members +Form_pg_authid +Form_pg_cast +Form_pg_class +Form_pg_collation +Form_pg_constraint +Form_pg_conversion +Form_pg_database +Form_pg_default_acl +Form_pg_depend +Form_pg_enum +Form_pg_event_trigger +Form_pg_extension +Form_pg_foreign_data_wrapper +Form_pg_foreign_server +Form_pg_foreign_table +Form_pg_index +Form_pg_inherits +Form_pg_language +Form_pg_largeobject +Form_pg_largeobject_metadata +Form_pg_namespace +Form_pg_opclass +Form_pg_operator +Form_pg_opfamily +Form_pg_partitioned_table +Form_pg_policy +Form_pg_proc +Form_pg_publication +Form_pg_publication_rel +Form_pg_range +Form_pg_replication_origin +Form_pg_rewrite +Form_pg_sequence +Form_pg_sequence_data +Form_pg_shdepend +Form_pg_statistic +Form_pg_statistic_ext +Form_pg_subscription +Form_pg_subscription_rel +Form_pg_tablespace +Form_pg_transform +Form_pg_trigger +Form_pg_ts_config +Form_pg_ts_config_map +Form_pg_ts_dict +Form_pg_ts_parser +Form_pg_ts_template +Form_pg_type +Form_pg_user_mapping +FormatNode +FreeBlockNumberArray +FreeListData +FreePageBtree +FreePageBtreeHeader +FreePageBtreeInternalKey +FreePageBtreeLeafKey +FreePageBtreeSearchResult +FreePageManager +FreePageSpanLeader +FromCharDateMode +FromExpr +FullTransactionId +FuncCall +FuncCallContext +FuncCandidateList +FuncDetailCode +FuncExpr +FuncInfo +FuncLookupError +FunctionCallInfo +FunctionCallInfoBaseData +FunctionParameter +FunctionParameterMode +FunctionScan +FunctionScanPerFuncState +FunctionScanState +FuzzyAttrMatchState +GBT_NUMKEY +GBT_NUMKEY_R +GBT_VARKEY +GBT_VARKEY_R +GENERAL_NAME +GISTBuildBuffers +GISTBuildState +GISTDeletedPageContents +GISTENTRY +GISTInsertStack +GISTInsertState +GISTIntArrayBigOptions +GISTIntArrayOptions +GISTNodeBuffer +GISTNodeBufferPage +GISTPageOpaque +GISTPageOpaqueData +GISTPageSplitInfo +GISTSTATE +GISTScanOpaque +GISTScanOpaqueData +GISTSearchHeapItem +GISTSearchItem +GISTTYPE +GIST_SPLITVEC +GMReaderTupleBuffer +GV +Gather +GatherMerge +GatherMergePath +GatherMergeState +GatherPath +GatherState +Gene +GeneratePruningStepsContext +GenerationBlock +GenerationChunk +GenerationContext +GenerationPointer +GenericCosts +GenericXLogState +GeqoPrivateData +GetForeignJoinPaths_function +GetForeignModifyBatchSize_function +GetForeignPaths_function +GetForeignPlan_function +GetForeignRelSize_function +GetForeignRowMarkType_function +GetForeignUpperPaths_function +GetState +GiSTOptions +GinBtree +GinBtreeData +GinBtreeDataLeafInsertData +GinBtreeEntryInsertData +GinBtreeStack +GinBuildState +GinChkVal +GinEntries +GinEntryAccumulator +GinIndexStat +GinMetaPageData +GinNullCategory +GinOptions +GinPageOpaque +GinPageOpaqueData +GinPlaceToPageRC +GinPostingList +GinQualCounts +GinScanEntry +GinScanKey +GinScanOpaque +GinScanOpaqueData +GinState +GinStatsData +GinTernaryValue +GinTupleCollector +GinVacuumState +GistBuildMode +GistEntryVector +GistHstoreOptions +GistInetKey +GistNSN +GistOptBufferingMode +GistSortedBuildPageState +GistSplitUnion +GistSplitVector +GistTsVectorOptions +GistVacState +GlobalTransaction +GlobalVisState +GrantRoleStmt +GrantStmt +GrantTargetType +Group +GroupClause +GroupPath +GroupPathExtraData +GroupResultPath +GroupState +GroupVarInfo +GroupingFunc +GroupingSet +GroupingSetData +GroupingSetKind +GroupingSetsPath +GucAction +GucBoolAssignHook +GucBoolCheckHook +GucContext +GucEnumAssignHook +GucEnumCheckHook +GucIntAssignHook +GucIntCheckHook +GucRealAssignHook 
+GucRealCheckHook +GucShowHook +GucSource +GucStack +GucStackState +GucStringAssignHook +GucStringCheckHook +HANDLE +HASHACTION +HASHBUCKET +HASHCTL +HASHELEMENT +HASHHDR +HASHSEGMENT +HASH_SEQ_STATUS +HCRYPTPROV +HE +HEntry +HIST_ENTRY +HKEY +HLOCAL +HMAC_CTX +HMODULE +HOldEntry +HRESULT +HSParser +HSpool +HStore +HTAB +HTSV_Result +HV +Hash +HashAggBatch +HashAggSpill +HashAllocFunc +HashBuildState +HashCompareFunc +HashCopyFunc +HashIndexStat +HashInstrumentation +HashJoin +HashJoinState +HashJoinTable +HashJoinTuple +HashMemoryChunk +HashMetaPage +HashMetaPageData +HashOptions +HashPageOpaque +HashPageOpaqueData +HashPageStat +HashPath +HashScanOpaque +HashScanOpaqueData +HashScanPosData +HashScanPosItem +HashSkewBucket +HashState +HashTapeInfo +HashValueFunc +HbaLine +HbaToken +HeadlineJsonState +HeadlineParsedText +HeadlineWordEntry +HeapCheckContext +HeapScanDesc +HeapTuple +HeapTupleData +HeapTupleFields +HeapTupleForceOption +HeapTupleHeader +HeapTupleHeaderData +HeapTupleTableSlot +HistControl +HotStandbyState +I32 +ICU_Convert_Func +ID +INFIX +INT128 +INTERFACE_INFO +IOFuncSelector +IO_STATUS_BLOCK +IPCompareMethod +ITEM +IV +IdentLine +IdentifierLookup +IdentifySystemCmd +IfStackElem +ImportForeignSchemaStmt +ImportForeignSchemaType +ImportForeignSchema_function +ImportQual +InProgressEnt +IncludeWal +InclusionOpaque +IncrementVarSublevelsUp_context +IncrementalSort +IncrementalSortExecutionStatus +IncrementalSortGroupInfo +IncrementalSortInfo +IncrementalSortPath +IncrementalSortState +Index +IndexAMProperty +IndexAmRoutine +IndexArrayKeyInfo +IndexAttachInfo +IndexAttrBitmapKind +IndexBuildCallback +IndexBuildResult +IndexBulkDeleteCallback +IndexBulkDeleteResult +IndexClause +IndexClauseSet +IndexDeleteCounts +IndexDeletePrefetchState +IndexElem +IndexFetchHeapData +IndexFetchTableData +IndexInfo +IndexList +IndexOnlyScan +IndexOnlyScanState +IndexOptInfo +IndexOrderByDistance +IndexPath +IndexRuntimeKeyInfo +IndexScan +IndexScanDesc +IndexScanState +IndexStateFlagsAction +IndexStmt +IndexTuple +IndexTupleData +IndexUniqueCheck +IndexVacuumInfo +IndxInfo +InferClause +InferenceElem +InfoItem +InhInfo +InheritableSocket +InitSampleScan_function +InitializeDSMForeignScan_function +InitializeWorkerForeignScan_function +InlineCodeBlock +InsertStmt +Instrumentation +Int128AggState +Int8TransTypeData +IntRBTreeNode +IntegerSet +InternalDefaultACL +InternalGrant +Interval +IntoClause +InvalidationChunk +InvalidationListHeader +IpcMemoryId +IpcMemoryKey +IpcMemoryState +IpcSemaphoreId +IpcSemaphoreKey +IsForeignPathAsyncCapable_function +IsForeignRelUpdatable_function +IsForeignScanParallelSafe_function +IsoConnInfo +IspellDict +Item +ItemId +ItemIdData +ItemPointer +ItemPointerData +IterateDirectModify_function +IterateForeignScan_function +IterateJsonStringValuesState +JEntry +JHashState +JOBOBJECTINFOCLASS +JOBOBJECT_BASIC_LIMIT_INFORMATION +JOBOBJECT_BASIC_UI_RESTRICTIONS +JOBOBJECT_SECURITY_LIMIT_INFORMATION +JitContext +JitInstrumentation +JitProviderCallbacks +JitProviderCompileExprCB +JitProviderInit +JitProviderReleaseContextCB +JitProviderResetAfterErrorCB +Join +JoinCostWorkspace +JoinExpr +JoinHashEntry +JoinPath +JoinPathExtraData +JoinState +JoinType +JsObject +JsValue +JsonAggState +JsonBaseObjectInfo +JsonHashEntry +JsonIterateStringValuesAction +JsonLexContext +JsonLikeRegexContext +JsonManifestFileField +JsonManifestParseContext +JsonManifestParseState +JsonManifestSemanticState +JsonManifestWALRangeField +JsonParseContext +JsonParseErrorType +JsonPath 
+JsonPathBool +JsonPathExecContext +JsonPathExecResult +JsonPathGinAddPathItemFunc +JsonPathGinContext +JsonPathGinExtractNodesFunc +JsonPathGinNode +JsonPathGinNodeType +JsonPathGinPath +JsonPathGinPathItem +JsonPathItem +JsonPathItemType +JsonPathKeyword +JsonPathParseItem +JsonPathParseResult +JsonPathPredicateCallback +JsonPathString +JsonSemAction +JsonTokenType +JsonTransformStringValuesAction +JsonTypeCategory +JsonValueList +JsonValueListIterator +Jsonb +JsonbAggState +JsonbContainer +JsonbInState +JsonbIterState +JsonbIterator +JsonbIteratorToken +JsonbPair +JsonbParseState +JsonbSubWorkspace +JsonbTypeCategory +JsonbValue +JumbleState +JunkFilter +KeyArray +KeySuffix +KeyWord +LARGE_INTEGER +LDAP +LDAPMessage +LDAPURLDesc +LDAP_TIMEVAL +LINE +LLVMAttributeRef +LLVMBasicBlockRef +LLVMBuilderRef +LLVMIntPredicate +LLVMJitContext +LLVMJitHandle +LLVMMemoryBufferRef +LLVMModuleRef +LLVMOrcJITStackRef +LLVMOrcModuleHandle +LLVMOrcTargetAddress +LLVMPassManagerBuilderRef +LLVMPassManagerRef +LLVMSharedModuleRef +LLVMTargetMachineRef +LLVMTargetRef +LLVMTypeRef +LLVMValueRef +LOCALLOCK +LOCALLOCKOWNER +LOCALLOCKTAG +LOCALPREDICATELOCK +LOCK +LOCKMASK +LOCKMETHODID +LOCKMODE +LOCKTAG +LONG +LONG_PTR +LOOP +LPBYTE +LPCTSTR +LPCWSTR +LPDWORD +LPSECURITY_ATTRIBUTES +LPSERVICE_STATUS +LPSTR +LPTHREAD_START_ROUTINE +LPTSTR +LPVOID +LPWSTR +LSEG +LUID +LVDeadTuples +LVPagePruneState +LVParallelState +LVRelState +LVSavedErrInfo +LVShared +LVSharedIndStats +LWLock +LWLockHandle +LWLockMode +LWLockPadded +LabelProvider +LagTracker +LargeObjectDesc +LastAttnumInfo +Latch +LerpFunc +LexDescr +LexemeEntry +LexemeHashKey +LexemeInfo +LexemeKey +LexizeData +LibraryInfo +Limit +LimitOption +LimitPath +LimitState +LimitStateCond +List +ListCell +ListDictionary +ListParsedLex +ListenAction +ListenActionKind +ListenStmt +LoadStmt +LocalBufferLookupEnt +LocalPgBackendStatus +LocalTransactionId +LocationIndex +LocationLen +LockAcquireResult +LockClauseStrength +LockData +LockInfoData +LockInstanceData +LockMethod +LockMethodData +LockRelId +LockRows +LockRowsPath +LockRowsState +LockStmt +LockTagType +LockTupleMode +LockViewRecurse_context +LockWaitPolicy +LockingClause +LogOpts +LogStmtLevel +LogicalDecodeBeginCB +LogicalDecodeBeginPrepareCB +LogicalDecodeChangeCB +LogicalDecodeCommitCB +LogicalDecodeCommitPreparedCB +LogicalDecodeFilterByOriginCB +LogicalDecodeFilterPrepareCB +LogicalDecodeMessageCB +LogicalDecodePrepareCB +LogicalDecodeRollbackPreparedCB +LogicalDecodeShutdownCB +LogicalDecodeStartupCB +LogicalDecodeStreamAbortCB +LogicalDecodeStreamChangeCB +LogicalDecodeStreamCommitCB +LogicalDecodeStreamMessageCB +LogicalDecodeStreamPrepareCB +LogicalDecodeStreamStartCB +LogicalDecodeStreamStopCB +LogicalDecodeStreamTruncateCB +LogicalDecodeTruncateCB +LogicalDecodingContext +LogicalErrorCallbackState +LogicalOutputPluginInit +LogicalOutputPluginWriterPrepareWrite +LogicalOutputPluginWriterUpdateProgress +LogicalOutputPluginWriterWrite +LogicalRepBeginData +LogicalRepCommitData +LogicalRepCtxStruct +LogicalRepMsgType +LogicalRepPartMapEntry +LogicalRepRelId +LogicalRepRelMapEntry +LogicalRepRelation +LogicalRepTupleData +LogicalRepTyp +LogicalRepWorker +LogicalRewriteMappingData +LogicalTape +LogicalTapeSet +LtreeGistOptions +LtreeSignature +MAGIC +MBuf +MCVItem +MCVList +MEMORY_BASIC_INFORMATION +MINIDUMPWRITEDUMP +MINIDUMP_TYPE +MJEvalResult +MTTargetRelLookup +MVDependencies +MVDependency +MVNDistinct +MVNDistinctItem +Material +MaterialPath +MaterialState +MdfdVec +Memoize +MemoizeEntry 
+MemoizeInstrumentation +MemoizeKey +MemoizePath +MemoizeState +MemoizeTuple +MemoryContext +MemoryContextCallback +MemoryContextCallbackFunction +MemoryContextCounters +MemoryContextData +MemoryContextMethods +MemoryStatsPrintFunc +MergeAppend +MergeAppendPath +MergeAppendState +MergeJoin +MergeJoinClause +MergeJoinState +MergePath +MergeScanSelCache +MetaCommand +MinMaxAggInfo +MinMaxAggPath +MinMaxExpr +MinMaxMultiOptions +MinMaxOp +MinimalTuple +MinimalTupleData +MinimalTupleTableSlot +MinmaxMultiOpaque +MinmaxOpaque +ModifyTable +ModifyTablePath +ModifyTableState +MorphOpaque +MsgType +MultiAssignRef +MultiSortSupport +MultiSortSupportData +MultiXactId +MultiXactMember +MultiXactOffset +MultiXactStateData +MultiXactStatus +MultirangeIOData +MultirangeParseState +MultirangeType +NDBOX +NODE +NTSTATUS +NUMCacheEntry +NUMDesc +NUMProc +NV +Name +NameData +NameHashEntry +NamedArgExpr +NamedLWLockTranche +NamedLWLockTrancheRequest +NamedTuplestoreScan +NamedTuplestoreScanState +NamespaceInfo +NestLoop +NestLoopParam +NestLoopState +NestPath +NewColumnValue +NewConstraint +NextSampleBlock_function +NextSampleTuple_function +NextValueExpr +Node +NodeTag +NonEmptyRange +Notification +NotificationHash +NotificationList +NotifyStmt +Nsrt +NullIfExpr +NullTest +NullTestType +NullableDatum +Numeric +NumericAggState +NumericDigit +NumericSortSupport +NumericSumAccum +NumericVar +OM_uint32 +OP +OSAPerGroupState +OSAPerQueryState +OSInfo +OSSLCipher +OSSLDigest +OVERLAPPED +ObjectAccessDrop +ObjectAccessNamespaceSearch +ObjectAccessPostAlter +ObjectAccessPostCreate +ObjectAccessType +ObjectAddress +ObjectAddressAndFlags +ObjectAddressExtra +ObjectAddressStack +ObjectAddresses +ObjectClass +ObjectPropertyType +ObjectType +ObjectWithArgs +Offset +OffsetNumber +OffsetVarNodes_context +Oid +OidOptions +OkeysState +OldSnapshotControlData +OldSnapshotTimeMapping +OldToNewMapping +OldToNewMappingData +OnCommitAction +OnCommitItem +OnConflictAction +OnConflictClause +OnConflictExpr +OnConflictSetState +OpBtreeInterpretation +OpClassCacheEnt +OpExpr +OpFamilyMember +OpFamilyOpFuncGroup +OpclassInfo +Operator +OperatorElement +OpfamilyInfo +OprCacheEntry +OprCacheKey +OprInfo +OprProofCacheEntry +OprProofCacheKey +OutputContext +OutputPluginCallbacks +OutputPluginOptions +OutputPluginOutputType +OverrideSearchPath +OverrideStackEntry +OverridingKind +PACE_HEADER +PACL +PATH +PBOOL +PCtxtHandle +PFN +PFN_NTQUERYINFORMATIONFILE +PGAlignedBlock +PGAlignedXLogBlock +PGAsyncStatusType +PGCALL2 +PGChecksummablePage +PGContextVisibility +PGEvent +PGEventConnDestroy +PGEventConnReset +PGEventId +PGEventProc +PGEventRegister +PGEventResultCopy +PGEventResultCreate +PGEventResultDestroy +PGFInfoFunction +PGFileType +PGFunction +PGLZ_HistEntry +PGLZ_Strategy +PGMessageField +PGModuleMagicFunction +PGNoticeHooks +PGOutputData +PGPROC +PGP_CFB +PGP_Context +PGP_MPI +PGP_PubKey +PGP_S2K +PGPing +PGQueryClass +PGRUsage +PGSemaphore +PGSemaphoreData +PGShmemHeader +PGTargetServerType +PGTernaryBool +PGTransactionStatusType +PGVerbosity +PG_Locale_Strategy +PG_Lock_Status +PG_init_t +PGcancel +PGcmdQueueEntry +PGconn +PGdataValue +PGlobjfuncs +PGnotify +PGpipelineStatus +PGresAttDesc +PGresAttValue +PGresParamDesc +PGresult +PGresult_data +PHANDLE +PIO_STATUS_BLOCK +PLAINTREE +PLAssignStmt +PLUID_AND_ATTRIBUTES +PLcword +PLpgSQL_case_when +PLpgSQL_condition +PLpgSQL_datum +PLpgSQL_datum_type +PLpgSQL_diag_item +PLpgSQL_exception +PLpgSQL_exception_block +PLpgSQL_execstate +PLpgSQL_expr +PLpgSQL_func_hashkey +PLpgSQL_function 
+PLpgSQL_getdiag_kind +PLpgSQL_if_elsif +PLpgSQL_label_type +PLpgSQL_nsitem +PLpgSQL_nsitem_type +PLpgSQL_plugin +PLpgSQL_promise_type +PLpgSQL_raise_option +PLpgSQL_raise_option_type +PLpgSQL_rec +PLpgSQL_recfield +PLpgSQL_resolve_option +PLpgSQL_row +PLpgSQL_stmt +PLpgSQL_stmt_assert +PLpgSQL_stmt_assign +PLpgSQL_stmt_block +PLpgSQL_stmt_call +PLpgSQL_stmt_case +PLpgSQL_stmt_close +PLpgSQL_stmt_commit +PLpgSQL_stmt_dynexecute +PLpgSQL_stmt_dynfors +PLpgSQL_stmt_execsql +PLpgSQL_stmt_exit +PLpgSQL_stmt_fetch +PLpgSQL_stmt_forc +PLpgSQL_stmt_foreach_a +PLpgSQL_stmt_fori +PLpgSQL_stmt_forq +PLpgSQL_stmt_fors +PLpgSQL_stmt_getdiag +PLpgSQL_stmt_if +PLpgSQL_stmt_loop +PLpgSQL_stmt_open +PLpgSQL_stmt_perform +PLpgSQL_stmt_raise +PLpgSQL_stmt_return +PLpgSQL_stmt_return_next +PLpgSQL_stmt_return_query +PLpgSQL_stmt_rollback +PLpgSQL_stmt_type +PLpgSQL_stmt_while +PLpgSQL_trigtype +PLpgSQL_type +PLpgSQL_type_type +PLpgSQL_var +PLpgSQL_variable +PLwdatum +PLword +PLyArrayToOb +PLyCursorObject +PLyDatumToOb +PLyDatumToObFunc +PLyExceptionEntry +PLyExecutionContext +PLyObToArray +PLyObToDatum +PLyObToDatumFunc +PLyObToDomain +PLyObToScalar +PLyObToTransform +PLyObToTuple +PLyObject_AsString_t +PLyPlanObject +PLyProcedure +PLyProcedureEntry +PLyProcedureKey +PLyResultObject +PLySRFState +PLySavedArgs +PLyScalarToOb +PLySubtransactionData +PLySubtransactionObject +PLyTransformToOb +PLyTupleToOb +PLyUnicode_FromStringAndSize_t +PLy_elog_impl_t +PMINIDUMP_CALLBACK_INFORMATION +PMINIDUMP_EXCEPTION_INFORMATION +PMINIDUMP_USER_STREAM_INFORMATION +PMSignalData +PMSignalReason +PMState +POLYGON +PQArgBlock +PQEnvironmentOption +PQExpBuffer +PQExpBufferData +PQcommMethods +PQconninfoOption +PQnoticeProcessor +PQnoticeReceiver +PQprintOpt +PQsslKeyPassHook_OpenSSL_type +PREDICATELOCK +PREDICATELOCKTAG +PREDICATELOCKTARGET +PREDICATELOCKTARGETTAG +PROCESS_INFORMATION +PROCLOCK +PROCLOCKTAG +PROC_HDR +PROC_QUEUE +PSID +PSID_AND_ATTRIBUTES +PSQL_COMP_CASE +PSQL_ECHO +PSQL_ECHO_HIDDEN +PSQL_ERROR_ROLLBACK +PTEntryArray +PTIterationArray +PTOKEN_PRIVILEGES +PTOKEN_USER +PUTENVPROC +PVOID +PX_Alias +PX_Cipher +PX_Combo +PX_HMAC +PX_MD +Page +PageData +PageGistNSN +PageHeader +PageHeaderData +PageXLogRecPtr +PagetableEntry +Pairs +ParallelAppendState +ParallelBitmapHeapState +ParallelBlockTableScanDesc +ParallelBlockTableScanWorker +ParallelBlockTableScanWorkerData +ParallelCompletionPtr +ParallelContext +ParallelExecutorInfo +ParallelHashGrowth +ParallelHashJoinBatch +ParallelHashJoinBatchAccessor +ParallelHashJoinState +ParallelIndexScanDesc +ParallelReadyList +ParallelSlot +ParallelSlotArray +ParallelSlotResultHandler +ParallelState +ParallelTableScanDesc +ParallelTableScanDescData +ParallelWorkerContext +ParallelWorkerInfo +Param +ParamCompileHook +ParamExecData +ParamExternData +ParamFetchHook +ParamKind +ParamListInfo +ParamPathInfo +ParamRef +ParamsErrorCbData +ParentMapEntry +ParseCallbackState +ParseExprKind +ParseNamespaceColumn +ParseNamespaceItem +ParseParamRefHook +ParseState +ParsedLex +ParsedScript +ParsedText +ParsedWord +ParserSetupHook +ParserState +PartClauseInfo +PartClauseMatchStatus +PartClauseTarget +PartitionBoundInfo +PartitionBoundInfoData +PartitionBoundSpec +PartitionCmd +PartitionDesc +PartitionDescData +PartitionDirectory +PartitionDirectoryEntry +PartitionDispatch +PartitionElem +PartitionHashBound +PartitionKey +PartitionListValue +PartitionMap +PartitionPruneCombineOp +PartitionPruneContext +PartitionPruneInfo +PartitionPruneState +PartitionPruneStep +PartitionPruneStepCombine 
+PartitionPruneStepOp +PartitionPruningData +PartitionRangeBound +PartitionRangeDatum +PartitionRangeDatumKind +PartitionScheme +PartitionSpec +PartitionTupleRouting +PartitionedRelPruneInfo +PartitionedRelPruningData +PartitionwiseAggregateType +PasswordType +Path +PathClauseUsage +PathCostComparison +PathHashStack +PathKey +PathKeysComparison +PathTarget +PatternInfo +PatternInfoArray +Pattern_Prefix_Status +Pattern_Type +PendingFsyncEntry +PendingRelDelete +PendingRelSync +PendingUnlinkEntry +PendingWriteback +PerlInterpreter +Perl_check_t +Perl_ppaddr_t +Permutation +PermutationStep +PermutationStepBlocker +PermutationStepBlockerType +PgArchData +PgBackendGSSStatus +PgBackendSSLStatus +PgBackendStatus +PgBenchExpr +PgBenchExprLink +PgBenchExprList +PgBenchExprType +PgBenchFunction +PgBenchValue +PgBenchValueType +PgChecksumMode +PgFdwAnalyzeState +PgFdwConnState +PgFdwDirectModifyState +PgFdwModifyState +PgFdwOption +PgFdwPathExtraData +PgFdwRelationInfo +PgFdwScanState +PgIfAddrCallback +PgStat_ArchiverStats +PgStat_BackendFunctionEntry +PgStat_Counter +PgStat_FunctionCallUsage +PgStat_FunctionCounts +PgStat_FunctionEntry +PgStat_GlobalStats +PgStat_Msg +PgStat_MsgAnalyze +PgStat_MsgAnlAncestors +PgStat_MsgArchiver +PgStat_MsgAutovacStart +PgStat_MsgBgWriter +PgStat_MsgChecksumFailure +PgStat_MsgConnect +PgStat_MsgDeadlock +PgStat_MsgDisconnect +PgStat_MsgDropdb +PgStat_MsgDummy +PgStat_MsgFuncpurge +PgStat_MsgFuncstat +PgStat_MsgHdr +PgStat_MsgInquiry +PgStat_MsgRecoveryConflict +PgStat_MsgReplSlot +PgStat_MsgResetcounter +PgStat_MsgResetreplslotcounter +PgStat_MsgResetsharedcounter +PgStat_MsgResetsinglecounter +PgStat_MsgResetslrucounter +PgStat_MsgSLRU +PgStat_MsgTabpurge +PgStat_MsgTabstat +PgStat_MsgTempFile +PgStat_MsgVacuum +PgStat_MsgWal +PgStat_SLRUStats +PgStat_Shared_Reset_Target +PgStat_Single_Reset_Type +PgStat_StatDBEntry +PgStat_StatFuncEntry +PgStat_StatReplSlotEntry +PgStat_StatTabEntry +PgStat_SubXactStatus +PgStat_TableCounts +PgStat_TableEntry +PgStat_TableStatus +PgStat_TableXactStatus +PgStat_WalStats +PgXmlErrorContext +PgXmlStrictness +Pg_finfo_record +Pg_magic_struct +PipeProtoChunk +PipeProtoHeader +PlaceHolderInfo +PlaceHolderVar +Plan +PlanDirectModify_function +PlanForeignModify_function +PlanInvalItem +PlanRowMark +PlanState +PlannedStmt +PlannerGlobal +PlannerInfo +PlannerParamItem +Point +Pointer +PolicyInfo +PolyNumAggState +Pool +PopulateArrayContext +PopulateArrayState +PopulateRecordCache +PopulateRecordsetState +Port +Portal +PortalHashEnt +PortalStatus +PortalStrategy +PostParseColumnRefHook +PostgresPollingStatusType +PostingItem +PostponedQual +PreParseColumnRefHook +PredClass +PredIterInfo +PredIterInfoData +PredXactList +PredXactListElement +PredicateLockData +PredicateLockTargetType +PrefetchBufferResult +PrepParallelRestorePtrType +PrepareStmt +PreparedStatement +PresortedKeyData +PrewarmType +PrintExtraTocPtrType +PrintTocDataPtrType +PrintfArgType +PrintfArgValue +PrintfTarget +PrinttupAttrInfo +PrivTarget +PrivateRefCountEntry +ProcArrayStruct +ProcLangInfo +ProcSignalBarrierType +ProcSignalHeader +ProcSignalReason +ProcSignalSlot +ProcState +ProcWaitStatus +ProcessUtilityContext +ProcessUtility_hook_type +ProcessingMode +ProgressCommandType +ProjectSet +ProjectSetPath +ProjectSetState +ProjectionInfo +ProjectionPath +ProtocolVersion +PrsStorage +PruneState +PruneStepResult +PsqlScanCallbacks +PsqlScanQuoteType +PsqlScanResult +PsqlScanState +PsqlScanStateData +PsqlSettings +Publication +PublicationActions +PublicationInfo 
+PublicationPartOpt +PublicationRelInfo +PullFilter +PullFilterOps +PushFilter +PushFilterOps +PushFunction +PyCFunction +PyCodeObject +PyMappingMethods +PyMethodDef +PyModuleDef +PyObject +PySequenceMethods +PyTypeObject +Py_ssize_t +QPRS_STATE +QTN2QTState +QTNode +QUERYTYPE +QUERY_SECURITY_CONTEXT_TOKEN_FN +QualCost +QualItem +Query +QueryCompletion +QueryDesc +QueryEnvironment +QueryInfo +QueryItem +QueryItemType +QueryMode +QueryOperand +QueryOperator +QueryRepresentation +QueryRepresentationOperand +QuerySource +QueueBackendStatus +QueuePosition +QuitSignalReason +RBTNode +RBTOrderControl +RBTree +RBTreeIterator +REPARSE_JUNCTION_DATA_BUFFER +RIX +RI_CompareHashEntry +RI_CompareKey +RI_ConstraintInfo +RI_QueryHashEntry +RI_QueryKey +RTEKind +RWConflict +RWConflictPoolHeader +RandomState +Range +RangeBound +RangeBox +RangeFunction +RangeIOData +RangeQueryClause +RangeSubselect +RangeTableFunc +RangeTableFuncCol +RangeTableSample +RangeTblEntry +RangeTblFunction +RangeTblRef +RangeType +RangeVar +RangeVarGetRelidCallback +Ranges +RawColumnDefault +RawParseMode +RawStmt +ReInitializeDSMForeignScan_function +ReScanForeignScan_function +ReadBufPtrType +ReadBufferMode +ReadBytePtrType +ReadExtraTocPtrType +ReadFunc +ReassignOwnedStmt +RecheckForeignScan_function +RecordCacheEntry +RecordCompareData +RecordIOData +RecoveryLockListsEntry +RecoveryPauseState +RecoveryState +RecoveryTargetTimeLineGoal +RecoveryTargetType +RectBox +RecursionContext +RecursiveUnion +RecursiveUnionPath +RecursiveUnionState +RefetchForeignRow_function +RefreshMatViewStmt +RegProcedure +Regis +RegisNode +RegisteredBgWorker +ReindexErrorInfo +ReindexIndexInfo +ReindexObjectType +ReindexParams +ReindexStmt +ReindexType +RelFileNode +RelFileNodeBackend +RelIdCacheEnt +RelInfo +RelInfoArr +RelMapFile +RelMapping +RelOptInfo +RelOptKind +RelSizeEntry +RelTag +RelToCheck +RelToCluster +RelabelType +Relation +RelationData +RelationInfo +RelationPtr +RelationSyncEntry +RelcacheCallbackFunction +RelfilenodeMapEntry +RelfilenodeMapKey +Relids +RelocationBufferInfo +RelptrFreePageBtree +RelptrFreePageManager +RelptrFreePageSpanLeader +RenameStmt +ReopenPtrType +ReorderBuffer +ReorderBufferApplyChangeCB +ReorderBufferApplyTruncateCB +ReorderBufferBeginCB +ReorderBufferChange +ReorderBufferCommitCB +ReorderBufferCommitPreparedCB +ReorderBufferDiskChange +ReorderBufferIterTXNEntry +ReorderBufferIterTXNState +ReorderBufferMessageCB +ReorderBufferPrepareCB +ReorderBufferRollbackPreparedCB +ReorderBufferStreamAbortCB +ReorderBufferStreamChangeCB +ReorderBufferStreamCommitCB +ReorderBufferStreamMessageCB +ReorderBufferStreamPrepareCB +ReorderBufferStreamStartCB +ReorderBufferStreamStopCB +ReorderBufferStreamTruncateCB +ReorderBufferTXN +ReorderBufferTXNByIdEnt +ReorderBufferToastEnt +ReorderBufferTupleBuf +ReorderBufferTupleCidEnt +ReorderBufferTupleCidKey +ReorderTuple +RepOriginId +ReparameterizeForeignPathByChild_function +ReplaceVarsFromTargetList_context +ReplaceVarsNoMatchOption +ReplicaIdentityStmt +ReplicationKind +ReplicationSlot +ReplicationSlotCtlData +ReplicationSlotOnDisk +ReplicationSlotPersistency +ReplicationSlotPersistentData +ReplicationState +ReplicationStateCtl +ReplicationStateOnDisk +ResTarget +ReservoirState +ReservoirStateData +ResourceArray +ResourceOwner +ResourceReleaseCallback +ResourceReleaseCallbackItem +ResourceReleasePhase +RestoreOptions +RestorePass +RestrictInfo +Result +ResultRelInfo +ResultState +ReturnSetInfo +ReturnStmt +RevmapContents +RewriteMappingDataEntry +RewriteMappingFile +RewriteRule 
+RewriteState +RmgrData +RmgrDescData +RmgrId +RmgrIds +RoleSpec +RoleSpecType +RoleStmtType +RollupData +RowCompareExpr +RowCompareType +RowExpr +RowIdentityVarInfo +RowMarkClause +RowMarkType +RowSecurityDesc +RowSecurityPolicy +RuleInfo +RuleLock +RuleStmt +RunningTransactions +RunningTransactionsData +SC_HANDLE +SECURITY_ATTRIBUTES +SECURITY_STATUS +SEG +SERIALIZABLEXACT +SERIALIZABLEXID +SERIALIZABLEXIDTAG +SERVICE_STATUS +SERVICE_STATUS_HANDLE +SERVICE_TABLE_ENTRY +SHM_QUEUE +SID_AND_ATTRIBUTES +SID_IDENTIFIER_AUTHORITY +SID_NAME_USE +SISeg +SIZE_T +SMgrRelation +SMgrRelationData +SMgrSortArray +SOCKADDR +SOCKET +SPELL +SPICallbackArg +SPIExecuteOptions +SPIParseOpenOptions +SPIPlanPtr +SPIPrepareOptions +SPITupleTable +SPLITCOST +SPNode +SPNodeData +SPPageDesc +SQLCmd +SQLDropObject +SQLFunctionCache +SQLFunctionCachePtr +SQLFunctionParseInfo +SQLFunctionParseInfoPtr +SQLValueFunction +SQLValueFunctionOp +SSL +SSLExtensionInfoContext +SSL_CTX +STARTUPINFO +STRLEN +SV +SYNCHRONIZATION_BARRIER +SampleScan +SampleScanGetSampleSize_function +SampleScanState +SamplerRandomState +ScalarArrayOpExpr +ScalarArrayOpExprHashEntry +ScalarArrayOpExprHashTable +ScalarIOData +ScalarItem +ScalarMCVItem +Scan +ScanDirection +ScanKey +ScanKeyData +ScanKeywordHashFunc +ScanKeywordList +ScanState +ScanTypeControl +ScannerCallbackState +SchemaQuery +SecBuffer +SecBufferDesc +SecLabelItem +SecLabelStmt +SeenRelsEntry +SelectLimit +SelectStmt +Selectivity +SemTPadded +SemiAntiJoinFactors +SeqScan +SeqScanState +SeqTable +SeqTableData +SerCommitSeqNo +SerialControl +SerializableXactHandle +SerializedActiveRelMaps +SerializedRanges +SerializedReindexState +SerializedSnapshotData +SerializedTransactionState +Session +SessionBackupState +SessionEndType +SetConstraintState +SetConstraintStateData +SetConstraintTriggerData +SetExprState +SetFunctionReturnMode +SetOp +SetOpCmd +SetOpPath +SetOpState +SetOpStatePerGroup +SetOpStrategy +SetOperation +SetOperationStmt +SetQuantifier +SetToDefault +SetupWorkerPtrType +ShDependObjectInfo +SharedAggInfo +SharedBitmapState +SharedDependencyObjectType +SharedDependencyType +SharedExecutorInstrumentation +SharedFileSet +SharedHashInfo +SharedIncrementalSortInfo +SharedInvalCatalogMsg +SharedInvalCatcacheMsg +SharedInvalRelcacheMsg +SharedInvalRelmapMsg +SharedInvalSmgrMsg +SharedInvalSnapshotMsg +SharedInvalidationMessage +SharedJitInstrumentation +SharedMemoizeInfo +SharedRecordTableEntry +SharedRecordTableKey +SharedRecordTypmodRegistry +SharedSortInfo +SharedTuplestore +SharedTuplestoreAccessor +SharedTuplestoreChunk +SharedTuplestoreParticipant +SharedTypmodTableEntry +Sharedsort +ShellTypeInfo +ShippableCacheEntry +ShippableCacheKey +ShmemIndexEnt +ShutdownForeignScan_function +ShutdownInformation +ShutdownMode +SignTSVector +SimpleActionList +SimpleActionListCell +SimpleEcontextStackEntry +SimpleOidList +SimpleOidListCell +SimplePtrList +SimplePtrListCell +SimpleStats +SimpleStringList +SimpleStringListCell +SingleBoundSortItem +Size +SkipPages +SlabBlock +SlabChunk +SlabContext +SlabSlot +SlotErrCallbackArg +SlotNumber +SlruCtl +SlruCtlData +SlruErrorCause +SlruPageStatus +SlruScanCallback +SlruShared +SlruSharedData +SlruWriteAll +SlruWriteAllData +SnapBuild +SnapBuildOnDisk +SnapBuildState +Snapshot +SnapshotData +SnapshotType +SockAddr +Sort +SortBy +SortByDir +SortByNulls +SortCoordinate +SortGroupClause +SortItem +SortPath +SortShimExtra +SortState +SortSupport +SortSupportData +SortTuple +SortTupleComparator +SortedPoint +SpGistBuildState +SpGistCache 
+SpGistDeadTuple +SpGistDeadTupleData +SpGistInnerTuple +SpGistInnerTupleData +SpGistLUPCache +SpGistLastUsedPage +SpGistLeafTuple +SpGistLeafTupleData +SpGistMetaPageData +SpGistNodeTuple +SpGistNodeTupleData +SpGistOptions +SpGistPageOpaque +SpGistPageOpaqueData +SpGistScanOpaque +SpGistScanOpaqueData +SpGistSearchItem +SpGistState +SpGistTypeDesc +SpecialJoinInfo +SpinDelayStatus +SplitInterval +SplitLR +SplitPoint +SplitTextOutputData +SplitVar +SplitedPageLayout +StackElem +StartBlobPtrType +StartBlobsPtrType +StartDataPtrType +StartReplicationCmd +StartupStatusEnum +StatEntry +StatExtEntry +StatMsgType +StateFileChunk +StatisticExtInfo +Stats +StatsBuildData +StatsData +StatsElem +StatsExtInfo +StdAnalyzeData +StdRdOptIndexCleanup +StdRdOptions +Step +StopList +StrategyNumber +StreamCtl +StreamXidHash +StringInfo +StringInfoData +StripnullState +SubLink +SubLinkType +SubPlan +SubPlanState +SubRemoveRels +SubTransactionId +SubXactCallback +SubXactCallbackItem +SubXactEvent +SubXactInfo +SubqueryScan +SubqueryScanPath +SubqueryScanState +SubscriptExecSetup +SubscriptExecSteps +SubscriptRoutines +SubscriptTransform +SubscriptingRef +SubscriptingRefState +Subscription +SubscriptionInfo +SubscriptionRelState +SupportRequestCost +SupportRequestIndexCondition +SupportRequestRows +SupportRequestSelectivity +SupportRequestSimplify +Syn +SyncOps +SyncRepConfigData +SyncRepStandbyData +SyncRequestHandler +SyncRequestType +SysFKRelationship +SysScanDesc +SyscacheCallbackFunction +SystemRowsSamplerData +SystemSamplerData +SystemTimeSamplerData +TAR_MEMBER +TBMIterateResult +TBMIteratingState +TBMIterator +TBMSharedIterator +TBMSharedIteratorState +TBMStatus +TBlockState +TIDBitmap +TM_FailureData +TM_IndexDelete +TM_IndexDeleteOp +TM_IndexStatus +TM_Result +TOKEN_DEFAULT_DACL +TOKEN_INFORMATION_CLASS +TOKEN_PRIVILEGES +TOKEN_USER +TParser +TParserCharTest +TParserPosition +TParserSpecial +TParserState +TParserStateAction +TParserStateActionItem +TQueueDestReceiver +TRGM +TSAnyCacheEntry +TSConfigCacheEntry +TSConfigInfo +TSDictInfo +TSDictionaryCacheEntry +TSExecuteCallback +TSLexeme +TSParserCacheEntry +TSParserInfo +TSQuery +TSQueryData +TSQueryParserState +TSQuerySign +TSReadPointer +TSTemplateInfo +TSTernaryValue +TSTokenTypeStorage +TSVector +TSVectorBuildState +TSVectorData +TSVectorParseState +TSVectorStat +TState +TStoreState +TXNEntryFile +TYPCATEGORY +T_Action +T_WorkerStatus +TabStatHashEntry +TabStatusArray +TableAmRoutine +TableAttachInfo +TableDataInfo +TableFunc +TableFuncRoutine +TableFuncScan +TableFuncScanState +TableInfo +TableLikeClause +TableSampleClause +TableScanDesc +TableScanDescData +TableSpaceCacheEntry +TableSpaceOpts +TablespaceList +TablespaceListCell +TapeBlockTrailer +TapeShare +TarMethodData +TarMethodFile +TargetEntry +TclExceptionNameMap +Tcl_DString +Tcl_FileProc +Tcl_HashEntry +Tcl_HashTable +Tcl_Interp +Tcl_NotifierProcs +Tcl_Obj +Tcl_Time +TempNamespaceStatus +TestDecodingData +TestDecodingTxnData +TestSpec +TextFreq +TextPositionState +TheLexeme +TheSubstitute +TidExpr +TidExprType +TidHashKey +TidOpExpr +TidPath +TidRangePath +TidRangeScan +TidRangeScanState +TidScan +TidScanState +TimeADT +TimeLineHistoryCmd +TimeLineHistoryEntry +TimeLineID +TimeOffset +TimeStamp +TimeTzADT +TimeZoneAbbrevTable +TimeoutId +TimeoutType +Timestamp +TimestampTz +TmFromChar +TmToChar +ToastAttrInfo +ToastCompressionId +ToastTupleContext +ToastedAttribute +TocEntry +TokenAuxData +TokenizedLine +TrackItem +TransInvalidationInfo +TransState +TransactionId +TransactionState 
+TransactionStateData +TransactionStmt +TransactionStmtKind +TransformInfo +TransformJsonStringValuesState +TransitionCaptureState +TrgmArc +TrgmArcInfo +TrgmBound +TrgmColor +TrgmColorInfo +TrgmGistOptions +TrgmNFA +TrgmPackArcInfo +TrgmPackedArc +TrgmPackedGraph +TrgmPackedState +TrgmPrefix +TrgmState +TrgmStateKey +TrieChar +Trigger +TriggerData +TriggerDesc +TriggerEvent +TriggerFlags +TriggerInfo +TriggerTransition +TruncateStmt +TsmRoutine +TupOutputState +TupSortStatus +TupStoreStatus +TupleConstr +TupleConversionMap +TupleDesc +TupleHashEntry +TupleHashEntryData +TupleHashIterator +TupleHashTable +TupleQueueReader +TupleTableSlot +TupleTableSlotOps +TuplesortInstrumentation +TuplesortMethod +TuplesortSpaceType +Tuplesortstate +Tuplestorestate +TwoPhaseCallback +TwoPhaseFileHeader +TwoPhaseLockRecord +TwoPhasePgStatRecord +TwoPhasePredicateLockRecord +TwoPhasePredicateRecord +TwoPhasePredicateRecordType +TwoPhasePredicateXactRecord +TwoPhaseRecordOnDisk +TwoPhaseRmgrId +TwoPhaseStateData +Type +TypeCacheEntry +TypeCacheEnumData +TypeCast +TypeCat +TypeFuncClass +TypeInfo +TypeName +U +U32 +U8 +UChar +UCharIterator +UColAttribute +UColAttributeValue +UCollator +UConverter +UErrorCode +UINT +ULARGE_INTEGER +ULONG +ULONG_PTR +UV +UVersionInfo +UnicodeNormalizationForm +UnicodeNormalizationQC +Unique +UniquePath +UniquePathMethod +UniqueState +UnlistenStmt +UnpackTarState +UnresolvedTup +UnresolvedTupData +UpdateStmt +UpperRelationKind +UpperUniquePath +UserAuth +UserMapping +UserOpts +VacAttrStats +VacAttrStatsP +VacErrPhase +VacOptValue +VacuumParams +VacuumRelation +VacuumStmt +ValidateIndexState +Value +ValuesScan +ValuesScanState +Var +VarBit +VarChar +VarParamState +VarString +VarStringSortSupport +Variable +VariableAssignHook +VariableCache +VariableCacheData +VariableSetKind +VariableSetStmt +VariableShowStmt +VariableSpace +VariableStatData +VariableSubstituteHook +VersionedQuery +Vfd +ViewCheckOption +ViewOptCheckOption +ViewOptions +ViewStmt +VirtualTransactionId +VirtualTupleTableSlot +VolatileFunctionStatus +Vsrt +WAIT_ORDER +WALAvailability +WALInsertLock +WALInsertLockPadded +WALOpenSegment +WALReadError +WALSegmentCloseCB +WALSegmentContext +WALSegmentOpenCB +WCHAR +WCOKind +WFW_WaitOption +WIDGET +WORD +WORKSTATE +WSABUF +WSADATA +WSANETWORKEVENTS +WSAPROTOCOL_INFO +WaitEvent +WaitEventActivity +WaitEventClient +WaitEventIO +WaitEventIPC +WaitEventSet +WaitEventTimeout +WaitPMResult +WalCloseMethod +WalLevel +Safekeeper +WalMessage +WalRcvData +WalRcvExecResult +WalRcvExecStatus +WalRcvState +WalRcvStreamOptions +WalReceiverConn +WalReceiverFunctionsType +WalSnd +WalSndCtlData +WalSndSendDataCallback +WalSndState +WalTimeSample +WalUsage +WalWriteMethod +Walfile +WindowAgg +WindowAggPath +WindowAggState +WindowClause +WindowClauseSortData +WindowDef +WindowFunc +WindowFuncExprState +WindowFuncLists +WindowObject +WindowObjectData +WindowStatePerAgg +WindowStatePerAggData +WindowStatePerFunc +WithCheckOption +WithClause +WordEntry +WordEntryIN +WordEntryPos +WordEntryPosVector +WordEntryPosVector1 +WorkTableScan +WorkTableScanState +WorkerInfo +WorkerInfoData +WorkerInstrumentation +WorkerJobDumpPtrType +WorkerJobRestorePtrType +Working_State +WriteBufPtrType +WriteBytePtrType +WriteDataCallback +WriteDataPtrType +WriteExtraTocPtrType +WriteFunc +WriteManifestState +WriteTarState +WritebackContext +X509 +X509_EXTENSION +X509_NAME +X509_NAME_ENTRY +X509_STORE +X509_STORE_CTX +XLTW_Oper +XLogCtlData +XLogCtlInsert +XLogDumpConfig +XLogDumpPrivate +XLogDumpStats 
+XLogLongPageHeader +XLogLongPageHeaderData +XLogPageHeader +XLogPageHeaderData +XLogPageReadCB +XLogPageReadPrivate +XLogReaderRoutine +XLogReaderState +XLogRecData +XLogRecPtr +XLogRecord +XLogRecordBlockCompressHeader +XLogRecordBlockHeader +XLogRecordBlockImageHeader +XLogRecordBuffer +XLogRedoAction +XLogSegNo +XLogSource +XLogwrtResult +XLogwrtRqst +XPVIV +XPVMG +XactCallback +XactCallbackItem +XactEvent +XactLockTableWaitInfo +XidBoundsViolation +XidCacheStatus +XidCommitStatus +XidStatus +XmlExpr +XmlExprOp +XmlOptionType +XmlSerialize +XmlTableBuilderData +YYLTYPE +YYSTYPE +YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse +_SPI_connection +_SPI_plan +__AssignProcessToJobObject +__CreateJobObject +__CreateRestrictedToken +__IsProcessInJob +__QueryInformationJobObject +__SetInformationJobObject +__time64_t +_dev_t +_ino_t +_resultmap +_stringlist +acquireLocksOnSubLinks_context +adjust_appendrel_attrs_context +aff_regex_struct +allocfunc +amadjustmembers_function +ambeginscan_function +ambuild_function +ambuildempty_function +ambuildphasename_function +ambulkdelete_function +amcanreturn_function +amcostestimate_function +amendscan_function +amestimateparallelscan_function +amgetbitmap_function +amgettuple_function +aminitparallelscan_function +aminsert_function +ammarkpos_function +amoptions_function +amparallelrescan_function +amproperty_function +amrescan_function +amrestrpos_function +amvacuumcleanup_function +amvalidate_function +array_iter +array_unnest_fctx +assign_collations_context +autovac_table +av_relation +avl_dbase +avl_node +avl_tree +avw_dbase +backslashResult +backup_manifest_info +backup_manifest_option +base_yy_extra_type +basebackup_options +bgworker_main_type +binaryheap +binaryheap_comparator +bitmapword +bits16 +bits32 +bits8 +bloom_filter +brin_column_state +brin_serialize_callback_type +bytea +cached_re_str +cashKEY +cfp +check_agg_arguments_context +check_function_callback +check_network_data +check_object_relabel_type +check_password_hook_type +check_ungrouped_columns_context +chr +clock_t +cmpEntriesArg +cmpfunc +codes_t +coercion +collation_cache_entry +color +colormaprange +compare_context +config_var_value +contain_aggs_of_level_context +convert_testexpr_context +copy_data_source_cb +core_YYSTYPE +core_yy_extra_type +core_yyscan_t +corrupt_items +cost_qual_eval_context +cp_hash_func +create_upper_paths_hook_type +createdb_failure_params +crosstab_HashEnt +crosstab_cat_desc +datapagemap_iterator_t +datapagemap_t +dateKEY +datetkn +dce_uuid_t +decimal +deparse_columns +deparse_context +deparse_expr_cxt +deparse_namespace +destructor +dev_t +digit +disassembledLeaf +dlist_head +dlist_iter +dlist_mutable_iter +dlist_node +ds_state +dsa_area +dsa_area_control +dsa_area_pool +dsa_area_span +dsa_handle +dsa_pointer +dsa_pointer_atomic +dsa_segment_header +dsa_segment_index +dsa_segment_map +dshash_compare_function +dshash_hash +dshash_hash_function +dshash_parameters +dshash_partition +dshash_table +dshash_table_control +dshash_table_handle +dshash_table_item +dsm_control_header +dsm_control_item +dsm_handle +dsm_op +dsm_segment +dsm_segment_detach_callback +eLogType +ean13 +eary +ec_matches_callback_type +ec_member_foreign_arg +ec_member_matches_arg +emit_log_hook_type +eval_const_expressions_context +exec_thread_arg +execution_state +explain_get_index_name_hook_type +f_smgr 
+fd_set +fe_scram_state +fe_scram_state_enum +fetch_range_request +file_action_t +file_entry_t +file_type_t +filehash_hash +filehash_iterator +filemap_t +fill_string_relopt +finalize_primnode_context +find_dependent_phvs_context +find_expr_references_context +fix_join_expr_context +fix_scan_expr_context +fix_upper_expr_context +flatten_join_alias_vars_context +float4 +float4KEY +float8 +float8KEY +floating_decimal_32 +floating_decimal_64 +fmAggrefPtr +fmExprContextCallbackFunction +fmNodePtr +fmStringInfo +fmgr_hook_type +foreign_glob_cxt +foreign_loc_cxt +freeaddrinfo_ptr_t +freefunc +fsec_t +gbt_vsrt_arg +gbtree_ninfo +gbtree_vinfo +generate_series_fctx +generate_series_numeric_fctx +generate_series_timestamp_fctx +generate_series_timestamptz_fctx +generate_subscripts_fctx +get_attavgwidth_hook_type +get_index_stats_hook_type +get_relation_info_hook_type +get_relation_stats_hook_type +getaddrinfo_ptr_t +getnameinfo_ptr_t +gid_t +gin_leafpage_items_state +ginxlogCreatePostingTree +ginxlogDeleteListPages +ginxlogDeletePage +ginxlogInsert +ginxlogInsertDataInternal +ginxlogInsertEntry +ginxlogInsertListPage +ginxlogRecompressDataLeaf +ginxlogSplit +ginxlogUpdateMeta +ginxlogVacuumDataLeafPage +gistxlogDelete +gistxlogPage +gistxlogPageDelete +gistxlogPageReuse +gistxlogPageSplit +gistxlogPageUpdate +grouping_sets_data +gseg_picksplit_item +gss_buffer_desc +gss_cred_id_t +gss_ctx_id_t +gss_name_t +gtrgm_consistent_cache +gzFile +hashfunc +hbaPort +heap_page_items_state +help_handler +hlCheck +hstoreCheckKeyLen_t +hstoreCheckValLen_t +hstorePairs_t +hstoreUniquePairs_t +hstoreUpgrade_t +hyperLogLogState +ifState +ilist +import_error_callback_arg +indexed_tlist +inet +inetKEY +inet_struct +init_function +inline_cte_walker_context +inline_error_callback_arg +ino_t +inquiry +instr_time +int128 +int16 +int16KEY +int2vector +int32 +int32KEY +int32_t +int64 +int64KEY +int8 +internalPQconninfoOption +intptr_t +intset_internal_node +intset_leaf_node +intset_node +intvKEY +itemIdCompact +itemIdCompactData +iterator +jmp_buf +join_search_hook_type +json_aelem_action +json_manifest_error_callback +json_manifest_perfile_callback +json_manifest_perwalrange_callback +json_ofield_action +json_scalar_action +json_struct_action +keyEntryData +key_t +lclContext +lclTocEntry +leafSegmentInfo +leaf_item +libpq_source +line_t +lineno_t +list_sort_comparator +local_relopt +local_relopts +local_source +locale_t +locate_agg_of_level_context +locate_var_of_level_context +locate_windowfunc_context +logstreamer_param +lquery +lquery_level +lquery_variant +ltree +ltree_gist +ltree_level +ltxtquery +mXactCacheEnt +mac8KEY +macKEY +macaddr +macaddr8 +macaddr_sortsupport_state +manifest_file +manifest_files_hash +manifest_files_iterator +manifest_wal_range +map_variable_attnos_context +max_parallel_hazard_context +mb2wchar_with_len_converter +mbchar_verifier +mbcharacter_incrementer +mbdisplaylen_converter +mblen_converter +mbstr_verifier +memoize_hash +memoize_iterator +metastring +mix_data_t +mixedStruct +mode_t +movedb_failure_params +mp_digit +mp_int +mp_result +mp_sign +mp_size +mp_small +mp_usmall +mp_word +mpz_t +multirange_bsearch_comparison +mxact +mxtruncinfo +needs_fmgr_hook_type +network_sortsupport_state +nodeitem +normal_rand_fctx +ntile_context +numeric +object_access_hook_type +off_t +oidKEY +oidvector +on_dsm_detach_callback +on_exit_nicely_callback +openssl_tls_init_hook_typ +ossl_EVP_cipher_func +other +output_type +pagetable_hash +pagetable_iterator +pairingheap +pairingheap_comparator +pairingheap_node 
+parallel_worker_main_type +parse_error_callback_arg +parser_context +partition_method_t +pendingPosition +pgParameterStatus +pg_atomic_flag +pg_atomic_uint32 +pg_atomic_uint64 +pg_checksum_context +pg_checksum_raw_context +pg_checksum_type +pg_conn_host +pg_conn_host_type +pg_conv_map +pg_crc32 +pg_crc32c +pg_cryptohash_ctx +pg_cryptohash_type +pg_ctype_cache +pg_enc +pg_enc2gettext +pg_enc2name +pg_encname +pg_funcptr_t +pg_gssinfo +pg_hmac_ctx +pg_int64 +pg_local_to_utf_combined +pg_locale_t +pg_mb_radix_tree +pg_md5_ctx +pg_on_exit_callback +pg_re_flags +pg_saslprep_rc +pg_sha1_ctx +pg_sha224_ctx +pg_sha256_ctx +pg_sha384_ctx +pg_sha512_ctx +pg_snapshot +pg_stack_base_t +pg_time_t +pg_time_usec_t +pg_tz +pg_tz_cache +pg_tzenum +pg_unicode_decompinfo +pg_unicode_decomposition +pg_unicode_norminfo +pg_unicode_normprops +pg_unicode_recompinfo +pg_utf_to_local_combined +pg_uuid_t +pg_wc_probefunc +pg_wchar +pg_wchar_tbl +pgp_armor_headers_state +pgpid_t +pgsocket +pgsql_thing_t +pgssEntry +pgssGlobalStats +pgssHashKey +pgssSharedState +pgssStoreKind +pgssVersion +pgstat_page +pgstattuple_type +pgthreadlock_t +pid_t +pivot_field +planner_hook_type +plperl_array_info +plperl_call_data +plperl_interp_desc +plperl_proc_desc +plperl_proc_key +plperl_proc_ptr +plperl_query_desc +plperl_query_entry +plpgsql_CastHashEntry +plpgsql_CastHashKey +plpgsql_HashEnt +pltcl_call_state +pltcl_interp_desc +pltcl_proc_desc +pltcl_proc_key +pltcl_proc_ptr +pltcl_query_desc +pointer +polymorphic_actuals +pos_trgm +post_parse_analyze_hook_type +postprocess_result_function +pqbool +pqsigfunc +printQueryOpt +printTableContent +printTableFooter +printTableOpt +printTextFormat +printTextLineFormat +printTextLineWrap +printTextRule +printfunc +priv_map +process_file_callback_t +process_sublinks_context +proclist_head +proclist_mutable_iter +proclist_node +promptStatus_t +pthread_barrier_t +pthread_cond_t +pthread_key_t +pthread_mutex_t +pthread_once_t +pthread_t +ptrdiff_t +pull_var_clause_context +pull_varattnos_context +pull_varnos_context +pull_vars_context +pullup_replace_vars_context +pushdown_safety_info +qc_hash_func +qsort_arg_comparator +qsort_comparator +query_pathkeys_callback +radius_attribute +radius_packet +rangeTableEntry_used_context +rank_context +rbt_allocfunc +rbt_combiner +rbt_comparator +rbt_freefunc +reduce_outer_joins_state +reference +regex_arc_t +regex_t +regexp +regexp_matches_ctx +registered_buffer +regmatch_t +regoff_t +regproc +relopt_bool +relopt_enum +relopt_enum_elt_def +relopt_gen +relopt_int +relopt_kind +relopt_parse_elt +relopt_real +relopt_string +relopt_type +relopt_value +relopts_validator +remoteConn +remoteConnHashEnt +remoteDep +rendezvousHashEntry +replace_rte_variables_callback +replace_rte_variables_context +ret_type +rewind_source +rewrite_event +rijndael_ctx +rm_detail_t +role_auth_extra +row_security_policy_hook_type +rsv_callback +saophash_hash +save_buffer +scram_state +scram_state_enum +sem_t +sequence_magic +set_join_pathlist_hook_type +set_rel_pathlist_hook_type +shm_mq +shm_mq_handle +shm_mq_iovec +shm_mq_result +shm_toc +shm_toc_entry +shm_toc_estimator +shmem_startup_hook_type +sig_atomic_t +sigjmp_buf +signedbitmapword +sigset_t +size_t +slist_head +slist_iter +slist_mutable_iter +slist_node +slock_t +socket_set +spgBulkDeleteState +spgChooseIn +spgChooseOut +spgChooseResultType +spgConfigIn +spgConfigOut +spgInnerConsistentIn +spgInnerConsistentOut +spgLeafConsistentIn +spgLeafConsistentOut +spgNodePtr +spgPickSplitIn +spgPickSplitOut +spgVacPendingItem 
+spgxlogAddLeaf +spgxlogAddNode +spgxlogMoveLeafs +spgxlogPickSplit +spgxlogSplitTuple +spgxlogState +spgxlogVacuumLeaf +spgxlogVacuumRedirect +spgxlogVacuumRoot +split_pathtarget_context +split_pathtarget_item +sql_error_callback_arg +sqlparseInfo +sqlparseState +ss_lru_item_t +ss_scan_location_t +ss_scan_locations_t +ssize_t +standard_qp_extra +stemmer_module +stmtCacheEntry +storeInfo +storeRes_func +stream_stop_callback +string +substitute_actual_parameters_context +substitute_actual_srf_parameters_context +substitute_phv_relids_context +svtype +symbol +tablespaceinfo +teSection +temp_tablespaces_extra +test_re_flags +test_regex_ctx +test_shm_mq_header +test_spec +test_start_function +text +timeKEY +time_t +timeout_handler_proc +timeout_params +timerCA +tlist_vinfo +toast_compress_header +transferMode +transfer_thread_arg +trgm +trgm_mb_char +trivalue +tsKEY +ts_parserstate +ts_tokenizer +ts_tokentype +tsearch_readline_state +tuplehash_hash +tuplehash_iterator +type +tzEntry +u1byte +u4byte +u_char +u_int +uchr +uid_t +uint128 +uint16 +uint16_t +uint32 +uint32_t +uint64 +uint64_t +uint8 +uint8_t +uintptr_t +unicodeStyleBorderFormat +unicodeStyleColumnFormat +unicodeStyleFormat +unicodeStyleRowFormat +unicode_linestyle +unit_conversion +unlogged_relation_entry +utf_local_conversion_func +uuidKEY +uuid_rc_t +uuid_sortsupport_state +uuid_t +va_list +vacuumingOptions +validate_string_relopt +varatt_expanded +varattrib_1b +varattrib_1b_e +varattrib_4b +vbits +verifier_context +walrcv_check_conninfo_fn +walrcv_connect_fn +walrcv_create_slot_fn +walrcv_disconnect_fn +walrcv_endstreaming_fn +walrcv_exec_fn +walrcv_get_backend_pid_fn +walrcv_get_conninfo_fn +walrcv_get_senderinfo_fn +walrcv_identify_system_fn +walrcv_readtimelinehistoryfile_fn +walrcv_receive_fn +walrcv_send_fn +walrcv_server_version_fn +walrcv_startstreaming_fn +wchar2mb_with_len_converter +wchar_t +win32_deadchild_waitinfo +wint_t +worker_state +worktable +wrap +xl_brin_createidx +xl_brin_desummarize +xl_brin_insert +xl_brin_revmap_extend +xl_brin_samepage_update +xl_brin_update +xl_btree_dedup +xl_btree_delete +xl_btree_insert +xl_btree_mark_page_halfdead +xl_btree_metadata +xl_btree_newroot +xl_btree_reuse_page +xl_btree_split +xl_btree_unlink_page +xl_btree_update +xl_btree_vacuum +xl_clog_truncate +xl_commit_ts_truncate +xl_dbase_create_rec +xl_dbase_drop_rec +xl_end_of_recovery +xl_hash_add_ovfl_page +xl_hash_delete +xl_hash_init_bitmap_page +xl_hash_init_meta_page +xl_hash_insert +xl_hash_move_page_contents +xl_hash_split_allocate_page +xl_hash_split_complete +xl_hash_squeeze_page +xl_hash_update_meta_page +xl_hash_vacuum_one_page +xl_heap_confirm +xl_heap_delete +xl_heap_freeze_page +xl_heap_freeze_tuple +xl_heap_header +xl_heap_inplace +xl_heap_insert +xl_heap_lock +xl_heap_lock_updated +xl_heap_multi_insert +xl_heap_new_cid +xl_heap_prune +xl_heap_rewrite_mapping +xl_heap_truncate +xl_heap_update +xl_heap_vacuum +xl_heap_visible +xl_invalid_page +xl_invalid_page_key +xl_invalidations +xl_logical_message +xl_multi_insert_tuple +xl_multixact_create +xl_multixact_truncate +xl_overwrite_contrecord +xl_parameter_change +xl_relmap_update +xl_replorigin_drop +xl_replorigin_set +xl_restore_point +xl_running_xacts +xl_seq_rec +xl_smgr_create +xl_smgr_truncate +xl_standby_lock +xl_standby_locks +xl_tblspc_create_rec +xl_tblspc_drop_rec +xl_xact_abort +xl_xact_assignment +xl_xact_commit +xl_xact_dbinfo +xl_xact_invals +xl_xact_origin +xl_xact_parsed_abort +xl_xact_parsed_commit +xl_xact_parsed_prepare +xl_xact_prepare 
+xl_xact_relfilenodes +xl_xact_subxacts +xl_xact_twophase +xl_xact_xinfo +xmlBuffer +xmlBufferPtr +xmlChar +xmlDocPtr +xmlErrorPtr +xmlExternalEntityLoader +xmlGenericErrorFunc +xmlNodePtr +xmlNodeSetPtr +xmlParserCtxtPtr +xmlParserInputPtr +xmlStructuredErrorFunc +xmlTextWriter +xmlTextWriterPtr +xmlXPathCompExprPtr +xmlXPathContextPtr +xmlXPathObjectPtr +xmltype +xpath_workspace +xsltSecurityPrefsPtr +xsltStylesheetPtr +xsltTransformContextPtr +yy_parser +yy_size_t +yyscan_t +z_stream +z_streamp +zic_t diff --git a/poetry.lock b/poetry.lock index fe18ad226c..bc1b57bc64 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,6 +1,6 @@ [[package]] name = "aiopg" -version = "1.3.3" +version = "1.3.4" description = "Postgres integration with asyncio." category = "main" optional = false @@ -11,7 +11,33 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] + +[[package]] +name = "allure-pytest" +version = "2.10.0" +description = "Allure pytest integration" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +allure-python-commons = "2.10.0" +pytest = ">=4.5.0" +six = ">=1.9.0" + +[[package]] +name = "allure-python-commons" +version = "2.10.0" +description = "Common module for integrate allure with python-based frameworks" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +attrs = ">=16.0.0" +pluggy = ">=0.4.0" +six = ">=1.9.0" [[package]] name = "async-timeout" @@ -21,9 +47,6 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = ">=3.6.5", markers = "python_version < \"3.8\""} - [[package]] name = "asyncpg" version = "0.24.0" @@ -32,17 +55,14 @@ category = "main" optional = false python-versions = ">=3.6.0" -[package.dependencies] -typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} - [package.extras] -dev = ["Cython (>=0.29.24,<0.30.0)", "pytest (>=6.0)", "Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] -docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)"] -test = ["pycodestyle (>=2.7.0,<2.8.0)", "flake8 (>=3.9.2,<3.10.0)", "uvloop (>=0.15.3)"] +dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=3.9.2,<3.10.0)", "pycodestyle (>=2.7.0,<2.8.0)", "uvloop (>=0.15.3)"] [[package]] name = "atomicwrites" -version = "1.4.0" +version = "1.4.1" description = "Atomic file writes." 
category = "main" optional = false @@ -57,30 +77,29 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.extras] -dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] -docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] -tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] -tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" -version = "1.42.0" +version = "1.48.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" category = "main" optional = false -python-versions = "*" +python-versions = ">=3.7, <=4.0, !=4.0" [package.dependencies] -boto3 = ">=1.5,<2.0" +boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" -six = ">=1.15,<2.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.20.1,<3.21.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pylint (>=1.7.2,<2.0)", "pyyaml (>=5.4,<6.0)", "mock (>=3.0.5,<4.0.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.17,<2.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "pathlib2 (>=2.3.5)", "pytest (>=4.6.11,<4.7.0)", "pytest (>=6.1.1,<6.2.0)", "black (==20.8b1)"] +dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] [[package]] name = "aws-xray-sdk" -version = "2.9.0" +version = "2.10.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." category = "main" optional = false @@ -88,7 +107,6 @@ python-versions = "*" [package.dependencies] botocore = ">=1.11.3" -future = "*" wrapt = "*" [[package]] @@ -99,383 +117,414 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "black" +version = "22.6.0" +description = "The uncompromising code formatter." 
+category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "boto3" -version = "1.20.40" +version = "1.24.38" description = "The AWS SDK for Python" category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.23.40,<1.24.0" -jmespath = ">=0.7.1,<1.0.0" -s3transfer = ">=0.5.0,<0.6.0" +botocore = ">=1.27.38,<1.28.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.6.0,<0.7.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "boto3-stubs" -version = "1.20.40" -description = "Type annotations for boto3 1.20.40, generated by mypy-boto3-builder 6.3.2" +version = "1.24.58" +description = "Type annotations for boto3 1.24.58 generated with mypy-boto3-builder 7.11.7" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] botocore-stubs = "*" -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} +mypy-boto3-s3 = {version = ">=1.24.0,<1.25.0", optional = true, markers = "extra == \"s3\""} +types-s3transfer = "*" +typing-extensions = ">=4.1.0" [package.extras] -accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.20.0)"] -account = ["mypy-boto3-account (>=1.20.0)"] -acm = ["mypy-boto3-acm (>=1.20.0)"] -acm-pca = ["mypy-boto3-acm-pca (>=1.20.0)"] -alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.20.0)"] -all = ["mypy-boto3-accessanalyzer (>=1.20.0)", "mypy-boto3-account (>=1.20.0)", "mypy-boto3-acm (>=1.20.0)", "mypy-boto3-acm-pca (>=1.20.0)", "mypy-boto3-alexaforbusiness (>=1.20.0)", "mypy-boto3-amp (>=1.20.0)", "mypy-boto3-amplify (>=1.20.0)", "mypy-boto3-amplifybackend (>=1.20.0)", "mypy-boto3-amplifyuibuilder (>=1.20.0)", "mypy-boto3-apigateway (>=1.20.0)", "mypy-boto3-apigatewaymanagementapi (>=1.20.0)", "mypy-boto3-apigatewayv2 (>=1.20.0)", "mypy-boto3-appconfig (>=1.20.0)", "mypy-boto3-appconfigdata (>=1.20.0)", "mypy-boto3-appflow (>=1.20.0)", "mypy-boto3-appintegrations (>=1.20.0)", "mypy-boto3-application-autoscaling (>=1.20.0)", "mypy-boto3-application-insights (>=1.20.0)", "mypy-boto3-applicationcostprofiler (>=1.20.0)", "mypy-boto3-appmesh (>=1.20.0)", "mypy-boto3-apprunner (>=1.20.0)", "mypy-boto3-appstream (>=1.20.0)", "mypy-boto3-appsync (>=1.20.0)", "mypy-boto3-athena (>=1.20.0)", "mypy-boto3-auditmanager (>=1.20.0)", "mypy-boto3-autoscaling (>=1.20.0)", "mypy-boto3-autoscaling-plans (>=1.20.0)", "mypy-boto3-backup (>=1.20.0)", "mypy-boto3-backup-gateway (>=1.20.0)", "mypy-boto3-batch (>=1.20.0)", "mypy-boto3-braket (>=1.20.0)", "mypy-boto3-budgets (>=1.20.0)", "mypy-boto3-ce (>=1.20.0)", "mypy-boto3-chime (>=1.20.0)", "mypy-boto3-chime-sdk-identity (>=1.20.0)", "mypy-boto3-chime-sdk-meetings (>=1.20.0)", "mypy-boto3-chime-sdk-messaging (>=1.20.0)", "mypy-boto3-cloud9 (>=1.20.0)", "mypy-boto3-cloudcontrol (>=1.20.0)", "mypy-boto3-clouddirectory (>=1.20.0)", "mypy-boto3-cloudformation (>=1.20.0)", "mypy-boto3-cloudfront (>=1.20.0)", "mypy-boto3-cloudhsm (>=1.20.0)", "mypy-boto3-cloudhsmv2 (>=1.20.0)", "mypy-boto3-cloudsearch (>=1.20.0)", 
"mypy-boto3-cloudsearchdomain (>=1.20.0)", "mypy-boto3-cloudtrail (>=1.20.0)", "mypy-boto3-cloudwatch (>=1.20.0)", "mypy-boto3-codeartifact (>=1.20.0)", "mypy-boto3-codebuild (>=1.20.0)", "mypy-boto3-codecommit (>=1.20.0)", "mypy-boto3-codedeploy (>=1.20.0)", "mypy-boto3-codeguru-reviewer (>=1.20.0)", "mypy-boto3-codeguruprofiler (>=1.20.0)", "mypy-boto3-codepipeline (>=1.20.0)", "mypy-boto3-codestar (>=1.20.0)", "mypy-boto3-codestar-connections (>=1.20.0)", "mypy-boto3-codestar-notifications (>=1.20.0)", "mypy-boto3-cognito-identity (>=1.20.0)", "mypy-boto3-cognito-idp (>=1.20.0)", "mypy-boto3-cognito-sync (>=1.20.0)", "mypy-boto3-comprehend (>=1.20.0)", "mypy-boto3-comprehendmedical (>=1.20.0)", "mypy-boto3-compute-optimizer (>=1.20.0)", "mypy-boto3-config (>=1.20.0)", "mypy-boto3-connect (>=1.20.0)", "mypy-boto3-connect-contact-lens (>=1.20.0)", "mypy-boto3-connectparticipant (>=1.20.0)", "mypy-boto3-cur (>=1.20.0)", "mypy-boto3-customer-profiles (>=1.20.0)", "mypy-boto3-databrew (>=1.20.0)", "mypy-boto3-dataexchange (>=1.20.0)", "mypy-boto3-datapipeline (>=1.20.0)", "mypy-boto3-datasync (>=1.20.0)", "mypy-boto3-dax (>=1.20.0)", "mypy-boto3-detective (>=1.20.0)", "mypy-boto3-devicefarm (>=1.20.0)", "mypy-boto3-devops-guru (>=1.20.0)", "mypy-boto3-directconnect (>=1.20.0)", "mypy-boto3-discovery (>=1.20.0)", "mypy-boto3-dlm (>=1.20.0)", "mypy-boto3-dms (>=1.20.0)", "mypy-boto3-docdb (>=1.20.0)", "mypy-boto3-drs (>=1.20.0)", "mypy-boto3-ds (>=1.20.0)", "mypy-boto3-dynamodb (>=1.20.0)", "mypy-boto3-dynamodbstreams (>=1.20.0)", "mypy-boto3-ebs (>=1.20.0)", "mypy-boto3-ec2 (>=1.20.0)", "mypy-boto3-ec2-instance-connect (>=1.20.0)", "mypy-boto3-ecr (>=1.20.0)", "mypy-boto3-ecr-public (>=1.20.0)", "mypy-boto3-ecs (>=1.20.0)", "mypy-boto3-efs (>=1.20.0)", "mypy-boto3-eks (>=1.20.0)", "mypy-boto3-elastic-inference (>=1.20.0)", "mypy-boto3-elasticache (>=1.20.0)", "mypy-boto3-elasticbeanstalk (>=1.20.0)", "mypy-boto3-elastictranscoder (>=1.20.0)", "mypy-boto3-elb (>=1.20.0)", "mypy-boto3-elbv2 (>=1.20.0)", "mypy-boto3-emr (>=1.20.0)", "mypy-boto3-emr-containers (>=1.20.0)", "mypy-boto3-es (>=1.20.0)", "mypy-boto3-events (>=1.20.0)", "mypy-boto3-evidently (>=1.20.0)", "mypy-boto3-finspace (>=1.20.0)", "mypy-boto3-finspace-data (>=1.20.0)", "mypy-boto3-firehose (>=1.20.0)", "mypy-boto3-fis (>=1.20.0)", "mypy-boto3-fms (>=1.20.0)", "mypy-boto3-forecast (>=1.20.0)", "mypy-boto3-forecastquery (>=1.20.0)", "mypy-boto3-frauddetector (>=1.20.0)", "mypy-boto3-fsx (>=1.20.0)", "mypy-boto3-gamelift (>=1.20.0)", "mypy-boto3-glacier (>=1.20.0)", "mypy-boto3-globalaccelerator (>=1.20.0)", "mypy-boto3-glue (>=1.20.0)", "mypy-boto3-grafana (>=1.20.0)", "mypy-boto3-greengrass (>=1.20.0)", "mypy-boto3-greengrassv2 (>=1.20.0)", "mypy-boto3-groundstation (>=1.20.0)", "mypy-boto3-guardduty (>=1.20.0)", "mypy-boto3-health (>=1.20.0)", "mypy-boto3-healthlake (>=1.20.0)", "mypy-boto3-honeycode (>=1.20.0)", "mypy-boto3-iam (>=1.20.0)", "mypy-boto3-identitystore (>=1.20.0)", "mypy-boto3-imagebuilder (>=1.20.0)", "mypy-boto3-importexport (>=1.20.0)", "mypy-boto3-inspector (>=1.20.0)", "mypy-boto3-inspector2 (>=1.20.0)", "mypy-boto3-iot (>=1.20.0)", "mypy-boto3-iot-data (>=1.20.0)", "mypy-boto3-iot-jobs-data (>=1.20.0)", "mypy-boto3-iot1click-devices (>=1.20.0)", "mypy-boto3-iot1click-projects (>=1.20.0)", "mypy-boto3-iotanalytics (>=1.20.0)", "mypy-boto3-iotdeviceadvisor (>=1.20.0)", "mypy-boto3-iotevents (>=1.20.0)", "mypy-boto3-iotevents-data (>=1.20.0)", "mypy-boto3-iotfleethub (>=1.20.0)", "mypy-boto3-iotsecuretunneling 
(>=1.20.0)", "mypy-boto3-iotsitewise (>=1.20.0)", "mypy-boto3-iotthingsgraph (>=1.20.0)", "mypy-boto3-iottwinmaker (>=1.20.0)", "mypy-boto3-iotwireless (>=1.20.0)", "mypy-boto3-ivs (>=1.20.0)", "mypy-boto3-kafka (>=1.20.0)", "mypy-boto3-kafkaconnect (>=1.20.0)", "mypy-boto3-kendra (>=1.20.0)", "mypy-boto3-kinesis (>=1.20.0)", "mypy-boto3-kinesis-video-archived-media (>=1.20.0)", "mypy-boto3-kinesis-video-media (>=1.20.0)", "mypy-boto3-kinesis-video-signaling (>=1.20.0)", "mypy-boto3-kinesisanalytics (>=1.20.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.20.0)", "mypy-boto3-kinesisvideo (>=1.20.0)", "mypy-boto3-kms (>=1.20.0)", "mypy-boto3-lakeformation (>=1.20.0)", "mypy-boto3-lambda (>=1.20.0)", "mypy-boto3-lex-models (>=1.20.0)", "mypy-boto3-lex-runtime (>=1.20.0)", "mypy-boto3-lexv2-models (>=1.20.0)", "mypy-boto3-lexv2-runtime (>=1.20.0)", "mypy-boto3-license-manager (>=1.20.0)", "mypy-boto3-lightsail (>=1.20.0)", "mypy-boto3-location (>=1.20.0)", "mypy-boto3-logs (>=1.20.0)", "mypy-boto3-lookoutequipment (>=1.20.0)", "mypy-boto3-lookoutmetrics (>=1.20.0)", "mypy-boto3-lookoutvision (>=1.20.0)", "mypy-boto3-machinelearning (>=1.20.0)", "mypy-boto3-macie (>=1.20.0)", "mypy-boto3-macie2 (>=1.20.0)", "mypy-boto3-managedblockchain (>=1.20.0)", "mypy-boto3-marketplace-catalog (>=1.20.0)", "mypy-boto3-marketplace-entitlement (>=1.20.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.20.0)", "mypy-boto3-mediaconnect (>=1.20.0)", "mypy-boto3-mediaconvert (>=1.20.0)", "mypy-boto3-medialive (>=1.20.0)", "mypy-boto3-mediapackage (>=1.20.0)", "mypy-boto3-mediapackage-vod (>=1.20.0)", "mypy-boto3-mediastore (>=1.20.0)", "mypy-boto3-mediastore-data (>=1.20.0)", "mypy-boto3-mediatailor (>=1.20.0)", "mypy-boto3-memorydb (>=1.20.0)", "mypy-boto3-meteringmarketplace (>=1.20.0)", "mypy-boto3-mgh (>=1.20.0)", "mypy-boto3-mgn (>=1.20.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.20.0)", "mypy-boto3-migrationhub-config (>=1.20.0)", "mypy-boto3-migrationhubstrategy (>=1.20.0)", "mypy-boto3-mobile (>=1.20.0)", "mypy-boto3-mq (>=1.20.0)", "mypy-boto3-mturk (>=1.20.0)", "mypy-boto3-mwaa (>=1.20.0)", "mypy-boto3-neptune (>=1.20.0)", "mypy-boto3-network-firewall (>=1.20.0)", "mypy-boto3-networkmanager (>=1.20.0)", "mypy-boto3-nimble (>=1.20.0)", "mypy-boto3-opensearch (>=1.20.0)", "mypy-boto3-opsworks (>=1.20.0)", "mypy-boto3-opsworkscm (>=1.20.0)", "mypy-boto3-organizations (>=1.20.0)", "mypy-boto3-outposts (>=1.20.0)", "mypy-boto3-panorama (>=1.20.0)", "mypy-boto3-personalize (>=1.20.0)", "mypy-boto3-personalize-events (>=1.20.0)", "mypy-boto3-personalize-runtime (>=1.20.0)", "mypy-boto3-pi (>=1.20.0)", "mypy-boto3-pinpoint (>=1.20.0)", "mypy-boto3-pinpoint-email (>=1.20.0)", "mypy-boto3-pinpoint-sms-voice (>=1.20.0)", "mypy-boto3-polly (>=1.20.0)", "mypy-boto3-pricing (>=1.20.0)", "mypy-boto3-proton (>=1.20.0)", "mypy-boto3-qldb (>=1.20.0)", "mypy-boto3-qldb-session (>=1.20.0)", "mypy-boto3-quicksight (>=1.20.0)", "mypy-boto3-ram (>=1.20.0)", "mypy-boto3-rbin (>=1.20.0)", "mypy-boto3-rds (>=1.20.0)", "mypy-boto3-rds-data (>=1.20.0)", "mypy-boto3-redshift (>=1.20.0)", "mypy-boto3-redshift-data (>=1.20.0)", "mypy-boto3-rekognition (>=1.20.0)", "mypy-boto3-resiliencehub (>=1.20.0)", "mypy-boto3-resource-groups (>=1.20.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.20.0)", "mypy-boto3-robomaker (>=1.20.0)", "mypy-boto3-route53 (>=1.20.0)", "mypy-boto3-route53-recovery-cluster (>=1.20.0)", "mypy-boto3-route53-recovery-control-config (>=1.20.0)", "mypy-boto3-route53-recovery-readiness (>=1.20.0)", 
"mypy-boto3-route53domains (>=1.20.0)", "mypy-boto3-route53resolver (>=1.20.0)", "mypy-boto3-rum (>=1.20.0)", "mypy-boto3-s3 (>=1.20.0)", "mypy-boto3-s3control (>=1.20.0)", "mypy-boto3-s3outposts (>=1.20.0)", "mypy-boto3-sagemaker (>=1.20.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.20.0)", "mypy-boto3-sagemaker-edge (>=1.20.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.20.0)", "mypy-boto3-sagemaker-runtime (>=1.20.0)", "mypy-boto3-savingsplans (>=1.20.0)", "mypy-boto3-schemas (>=1.20.0)", "mypy-boto3-sdb (>=1.20.0)", "mypy-boto3-secretsmanager (>=1.20.0)", "mypy-boto3-securityhub (>=1.20.0)", "mypy-boto3-serverlessrepo (>=1.20.0)", "mypy-boto3-service-quotas (>=1.20.0)", "mypy-boto3-servicecatalog (>=1.20.0)", "mypy-boto3-servicecatalog-appregistry (>=1.20.0)", "mypy-boto3-servicediscovery (>=1.20.0)", "mypy-boto3-ses (>=1.20.0)", "mypy-boto3-sesv2 (>=1.20.0)", "mypy-boto3-shield (>=1.20.0)", "mypy-boto3-signer (>=1.20.0)", "mypy-boto3-sms (>=1.20.0)", "mypy-boto3-sms-voice (>=1.20.0)", "mypy-boto3-snow-device-management (>=1.20.0)", "mypy-boto3-snowball (>=1.20.0)", "mypy-boto3-sns (>=1.20.0)", "mypy-boto3-sqs (>=1.20.0)", "mypy-boto3-ssm (>=1.20.0)", "mypy-boto3-ssm-contacts (>=1.20.0)", "mypy-boto3-ssm-incidents (>=1.20.0)", "mypy-boto3-sso (>=1.20.0)", "mypy-boto3-sso-admin (>=1.20.0)", "mypy-boto3-sso-oidc (>=1.20.0)", "mypy-boto3-stepfunctions (>=1.20.0)", "mypy-boto3-storagegateway (>=1.20.0)", "mypy-boto3-sts (>=1.20.0)", "mypy-boto3-support (>=1.20.0)", "mypy-boto3-swf (>=1.20.0)", "mypy-boto3-synthetics (>=1.20.0)", "mypy-boto3-textract (>=1.20.0)", "mypy-boto3-timestream-query (>=1.20.0)", "mypy-boto3-timestream-write (>=1.20.0)", "mypy-boto3-transcribe (>=1.20.0)", "mypy-boto3-transfer (>=1.20.0)", "mypy-boto3-translate (>=1.20.0)", "mypy-boto3-voice-id (>=1.20.0)", "mypy-boto3-waf (>=1.20.0)", "mypy-boto3-waf-regional (>=1.20.0)", "mypy-boto3-wafv2 (>=1.20.0)", "mypy-boto3-wellarchitected (>=1.20.0)", "mypy-boto3-wisdom (>=1.20.0)", "mypy-boto3-workdocs (>=1.20.0)", "mypy-boto3-worklink (>=1.20.0)", "mypy-boto3-workmail (>=1.20.0)", "mypy-boto3-workmailmessageflow (>=1.20.0)", "mypy-boto3-workspaces (>=1.20.0)", "mypy-boto3-workspaces-web (>=1.20.0)", "mypy-boto3-xray (>=1.20.0)"] -amp = ["mypy-boto3-amp (>=1.20.0)"] -amplify = ["mypy-boto3-amplify (>=1.20.0)"] -amplifybackend = ["mypy-boto3-amplifybackend (>=1.20.0)"] -amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.20.0)"] -apigateway = ["mypy-boto3-apigateway (>=1.20.0)"] -apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.20.0)"] -apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.20.0)"] -appconfig = ["mypy-boto3-appconfig (>=1.20.0)"] -appconfigdata = ["mypy-boto3-appconfigdata (>=1.20.0)"] -appflow = ["mypy-boto3-appflow (>=1.20.0)"] -appintegrations = ["mypy-boto3-appintegrations (>=1.20.0)"] -application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.20.0)"] -application-insights = ["mypy-boto3-application-insights (>=1.20.0)"] -applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.20.0)"] -appmesh = ["mypy-boto3-appmesh (>=1.20.0)"] -apprunner = ["mypy-boto3-apprunner (>=1.20.0)"] -appstream = ["mypy-boto3-appstream (>=1.20.0)"] -appsync = ["mypy-boto3-appsync (>=1.20.0)"] -athena = ["mypy-boto3-athena (>=1.20.0)"] -auditmanager = ["mypy-boto3-auditmanager (>=1.20.0)"] -autoscaling = ["mypy-boto3-autoscaling (>=1.20.0)"] -autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.20.0)"] -backup = ["mypy-boto3-backup (>=1.20.0)"] -backup-gateway = 
["mypy-boto3-backup-gateway (>=1.20.0)"] -batch = ["mypy-boto3-batch (>=1.20.0)"] -braket = ["mypy-boto3-braket (>=1.20.0)"] -budgets = ["mypy-boto3-budgets (>=1.20.0)"] -ce = ["mypy-boto3-ce (>=1.20.0)"] -chime = ["mypy-boto3-chime (>=1.20.0)"] -chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.20.0)"] -chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.20.0)"] -chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.20.0)"] -cloud9 = ["mypy-boto3-cloud9 (>=1.20.0)"] -cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.20.0)"] -clouddirectory = ["mypy-boto3-clouddirectory (>=1.20.0)"] -cloudformation = ["mypy-boto3-cloudformation (>=1.20.0)"] -cloudfront = ["mypy-boto3-cloudfront (>=1.20.0)"] -cloudhsm = ["mypy-boto3-cloudhsm (>=1.20.0)"] -cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.20.0)"] -cloudsearch = ["mypy-boto3-cloudsearch (>=1.20.0)"] -cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.20.0)"] -cloudtrail = ["mypy-boto3-cloudtrail (>=1.20.0)"] -cloudwatch = ["mypy-boto3-cloudwatch (>=1.20.0)"] -codeartifact = ["mypy-boto3-codeartifact (>=1.20.0)"] -codebuild = ["mypy-boto3-codebuild (>=1.20.0)"] -codecommit = ["mypy-boto3-codecommit (>=1.20.0)"] -codedeploy = ["mypy-boto3-codedeploy (>=1.20.0)"] -codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.20.0)"] -codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.20.0)"] -codepipeline = ["mypy-boto3-codepipeline (>=1.20.0)"] -codestar = ["mypy-boto3-codestar (>=1.20.0)"] -codestar-connections = ["mypy-boto3-codestar-connections (>=1.20.0)"] -codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.20.0)"] -cognito-identity = ["mypy-boto3-cognito-identity (>=1.20.0)"] -cognito-idp = ["mypy-boto3-cognito-idp (>=1.20.0)"] -cognito-sync = ["mypy-boto3-cognito-sync (>=1.20.0)"] -comprehend = ["mypy-boto3-comprehend (>=1.20.0)"] -comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.20.0)"] -compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.20.0)"] -config = ["mypy-boto3-config (>=1.20.0)"] -connect = ["mypy-boto3-connect (>=1.20.0)"] -connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.20.0)"] -connectparticipant = ["mypy-boto3-connectparticipant (>=1.20.0)"] -cur = ["mypy-boto3-cur (>=1.20.0)"] -customer-profiles = ["mypy-boto3-customer-profiles (>=1.20.0)"] -databrew = ["mypy-boto3-databrew (>=1.20.0)"] -dataexchange = ["mypy-boto3-dataexchange (>=1.20.0)"] -datapipeline = ["mypy-boto3-datapipeline (>=1.20.0)"] -datasync = ["mypy-boto3-datasync (>=1.20.0)"] -dax = ["mypy-boto3-dax (>=1.20.0)"] -detective = ["mypy-boto3-detective (>=1.20.0)"] -devicefarm = ["mypy-boto3-devicefarm (>=1.20.0)"] -devops-guru = ["mypy-boto3-devops-guru (>=1.20.0)"] -directconnect = ["mypy-boto3-directconnect (>=1.20.0)"] -discovery = ["mypy-boto3-discovery (>=1.20.0)"] -dlm = ["mypy-boto3-dlm (>=1.20.0)"] -dms = ["mypy-boto3-dms (>=1.20.0)"] -docdb = ["mypy-boto3-docdb (>=1.20.0)"] -drs = ["mypy-boto3-drs (>=1.20.0)"] -ds = ["mypy-boto3-ds (>=1.20.0)"] -dynamodb = ["mypy-boto3-dynamodb (>=1.20.0)"] -dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.20.0)"] -ebs = ["mypy-boto3-ebs (>=1.20.0)"] -ec2 = ["mypy-boto3-ec2 (>=1.20.0)"] -ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.20.0)"] -ecr = ["mypy-boto3-ecr (>=1.20.0)"] -ecr-public = ["mypy-boto3-ecr-public (>=1.20.0)"] -ecs = ["mypy-boto3-ecs (>=1.20.0)"] -efs = ["mypy-boto3-efs (>=1.20.0)"] -eks = ["mypy-boto3-eks (>=1.20.0)"] -elastic-inference = ["mypy-boto3-elastic-inference (>=1.20.0)"] -elasticache = ["mypy-boto3-elasticache (>=1.20.0)"] 
-elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.20.0)"] -elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.20.0)"] -elb = ["mypy-boto3-elb (>=1.20.0)"] -elbv2 = ["mypy-boto3-elbv2 (>=1.20.0)"] -emr = ["mypy-boto3-emr (>=1.20.0)"] -emr-containers = ["mypy-boto3-emr-containers (>=1.20.0)"] -es = ["mypy-boto3-es (>=1.20.0)"] -essential = ["mypy-boto3-cloudformation (>=1.20.0)", "mypy-boto3-dynamodb (>=1.20.0)", "mypy-boto3-ec2 (>=1.20.0)", "mypy-boto3-lambda (>=1.20.0)", "mypy-boto3-rds (>=1.20.0)", "mypy-boto3-s3 (>=1.20.0)", "mypy-boto3-sqs (>=1.20.0)"] -events = ["mypy-boto3-events (>=1.20.0)"] -evidently = ["mypy-boto3-evidently (>=1.20.0)"] -finspace = ["mypy-boto3-finspace (>=1.20.0)"] -finspace-data = ["mypy-boto3-finspace-data (>=1.20.0)"] -firehose = ["mypy-boto3-firehose (>=1.20.0)"] -fis = ["mypy-boto3-fis (>=1.20.0)"] -fms = ["mypy-boto3-fms (>=1.20.0)"] -forecast = ["mypy-boto3-forecast (>=1.20.0)"] -forecastquery = ["mypy-boto3-forecastquery (>=1.20.0)"] -frauddetector = ["mypy-boto3-frauddetector (>=1.20.0)"] -fsx = ["mypy-boto3-fsx (>=1.20.0)"] -gamelift = ["mypy-boto3-gamelift (>=1.20.0)"] -glacier = ["mypy-boto3-glacier (>=1.20.0)"] -globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.20.0)"] -glue = ["mypy-boto3-glue (>=1.20.0)"] -grafana = ["mypy-boto3-grafana (>=1.20.0)"] -greengrass = ["mypy-boto3-greengrass (>=1.20.0)"] -greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.20.0)"] -groundstation = ["mypy-boto3-groundstation (>=1.20.0)"] -guardduty = ["mypy-boto3-guardduty (>=1.20.0)"] -health = ["mypy-boto3-health (>=1.20.0)"] -healthlake = ["mypy-boto3-healthlake (>=1.20.0)"] -honeycode = ["mypy-boto3-honeycode (>=1.20.0)"] -iam = ["mypy-boto3-iam (>=1.20.0)"] -identitystore = ["mypy-boto3-identitystore (>=1.20.0)"] -imagebuilder = ["mypy-boto3-imagebuilder (>=1.20.0)"] -importexport = ["mypy-boto3-importexport (>=1.20.0)"] -inspector = ["mypy-boto3-inspector (>=1.20.0)"] -inspector2 = ["mypy-boto3-inspector2 (>=1.20.0)"] -iot = ["mypy-boto3-iot (>=1.20.0)"] -iot-data = ["mypy-boto3-iot-data (>=1.20.0)"] -iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.20.0)"] -iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.20.0)"] -iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.20.0)"] -iotanalytics = ["mypy-boto3-iotanalytics (>=1.20.0)"] -iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.20.0)"] -iotevents = ["mypy-boto3-iotevents (>=1.20.0)"] -iotevents-data = ["mypy-boto3-iotevents-data (>=1.20.0)"] -iotfleethub = ["mypy-boto3-iotfleethub (>=1.20.0)"] -iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.20.0)"] -iotsitewise = ["mypy-boto3-iotsitewise (>=1.20.0)"] -iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.20.0)"] -iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.20.0)"] -iotwireless = ["mypy-boto3-iotwireless (>=1.20.0)"] -ivs = ["mypy-boto3-ivs (>=1.20.0)"] -kafka = ["mypy-boto3-kafka (>=1.20.0)"] -kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.20.0)"] -kendra = ["mypy-boto3-kendra (>=1.20.0)"] -kinesis = ["mypy-boto3-kinesis (>=1.20.0)"] -kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.20.0)"] -kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.20.0)"] -kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.20.0)"] -kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.20.0)"] -kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.20.0)"] -kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.20.0)"] -kms = ["mypy-boto3-kms (>=1.20.0)"] -lakeformation = ["mypy-boto3-lakeformation 
(>=1.20.0)"] -lambda = ["mypy-boto3-lambda (>=1.20.0)"] -lex-models = ["mypy-boto3-lex-models (>=1.20.0)"] -lex-runtime = ["mypy-boto3-lex-runtime (>=1.20.0)"] -lexv2-models = ["mypy-boto3-lexv2-models (>=1.20.0)"] -lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.20.0)"] -license-manager = ["mypy-boto3-license-manager (>=1.20.0)"] -lightsail = ["mypy-boto3-lightsail (>=1.20.0)"] -location = ["mypy-boto3-location (>=1.20.0)"] -logs = ["mypy-boto3-logs (>=1.20.0)"] -lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.20.0)"] -lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.20.0)"] -lookoutvision = ["mypy-boto3-lookoutvision (>=1.20.0)"] -machinelearning = ["mypy-boto3-machinelearning (>=1.20.0)"] -macie = ["mypy-boto3-macie (>=1.20.0)"] -macie2 = ["mypy-boto3-macie2 (>=1.20.0)"] -managedblockchain = ["mypy-boto3-managedblockchain (>=1.20.0)"] -marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.20.0)"] -marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.20.0)"] -marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.20.0)"] -mediaconnect = ["mypy-boto3-mediaconnect (>=1.20.0)"] -mediaconvert = ["mypy-boto3-mediaconvert (>=1.20.0)"] -medialive = ["mypy-boto3-medialive (>=1.20.0)"] -mediapackage = ["mypy-boto3-mediapackage (>=1.20.0)"] -mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.20.0)"] -mediastore = ["mypy-boto3-mediastore (>=1.20.0)"] -mediastore-data = ["mypy-boto3-mediastore-data (>=1.20.0)"] -mediatailor = ["mypy-boto3-mediatailor (>=1.20.0)"] -memorydb = ["mypy-boto3-memorydb (>=1.20.0)"] -meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.20.0)"] -mgh = ["mypy-boto3-mgh (>=1.20.0)"] -mgn = ["mypy-boto3-mgn (>=1.20.0)"] -migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.20.0)"] -migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.20.0)"] -migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.20.0)"] -mobile = ["mypy-boto3-mobile (>=1.20.0)"] -mq = ["mypy-boto3-mq (>=1.20.0)"] -mturk = ["mypy-boto3-mturk (>=1.20.0)"] -mwaa = ["mypy-boto3-mwaa (>=1.20.0)"] -neptune = ["mypy-boto3-neptune (>=1.20.0)"] -network-firewall = ["mypy-boto3-network-firewall (>=1.20.0)"] -networkmanager = ["mypy-boto3-networkmanager (>=1.20.0)"] -nimble = ["mypy-boto3-nimble (>=1.20.0)"] -opensearch = ["mypy-boto3-opensearch (>=1.20.0)"] -opsworks = ["mypy-boto3-opsworks (>=1.20.0)"] -opsworkscm = ["mypy-boto3-opsworkscm (>=1.20.0)"] -organizations = ["mypy-boto3-organizations (>=1.20.0)"] -outposts = ["mypy-boto3-outposts (>=1.20.0)"] -panorama = ["mypy-boto3-panorama (>=1.20.0)"] -personalize = ["mypy-boto3-personalize (>=1.20.0)"] -personalize-events = ["mypy-boto3-personalize-events (>=1.20.0)"] -personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.20.0)"] -pi = ["mypy-boto3-pi (>=1.20.0)"] -pinpoint = ["mypy-boto3-pinpoint (>=1.20.0)"] -pinpoint-email = ["mypy-boto3-pinpoint-email (>=1.20.0)"] -pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.20.0)"] -polly = ["mypy-boto3-polly (>=1.20.0)"] -pricing = ["mypy-boto3-pricing (>=1.20.0)"] -proton = ["mypy-boto3-proton (>=1.20.0)"] -qldb = ["mypy-boto3-qldb (>=1.20.0)"] -qldb-session = ["mypy-boto3-qldb-session (>=1.20.0)"] -quicksight = ["mypy-boto3-quicksight (>=1.20.0)"] -ram = ["mypy-boto3-ram (>=1.20.0)"] -rbin = ["mypy-boto3-rbin (>=1.20.0)"] -rds = ["mypy-boto3-rds (>=1.20.0)"] -rds-data = ["mypy-boto3-rds-data (>=1.20.0)"] -redshift = ["mypy-boto3-redshift (>=1.20.0)"] -redshift-data = ["mypy-boto3-redshift-data (>=1.20.0)"] 
-rekognition = ["mypy-boto3-rekognition (>=1.20.0)"] -resiliencehub = ["mypy-boto3-resiliencehub (>=1.20.0)"] -resource-groups = ["mypy-boto3-resource-groups (>=1.20.0)"] -resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.20.0)"] -robomaker = ["mypy-boto3-robomaker (>=1.20.0)"] -route53 = ["mypy-boto3-route53 (>=1.20.0)"] -route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.20.0)"] -route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.20.0)"] -route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.20.0)"] -route53domains = ["mypy-boto3-route53domains (>=1.20.0)"] -route53resolver = ["mypy-boto3-route53resolver (>=1.20.0)"] -rum = ["mypy-boto3-rum (>=1.20.0)"] -s3 = ["mypy-boto3-s3 (>=1.20.0)"] -s3control = ["mypy-boto3-s3control (>=1.20.0)"] -s3outposts = ["mypy-boto3-s3outposts (>=1.20.0)"] -sagemaker = ["mypy-boto3-sagemaker (>=1.20.0)"] -sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.20.0)"] -sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.20.0)"] -sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.20.0)"] -sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.20.0)"] -savingsplans = ["mypy-boto3-savingsplans (>=1.20.0)"] -schemas = ["mypy-boto3-schemas (>=1.20.0)"] -sdb = ["mypy-boto3-sdb (>=1.20.0)"] -secretsmanager = ["mypy-boto3-secretsmanager (>=1.20.0)"] -securityhub = ["mypy-boto3-securityhub (>=1.20.0)"] -serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.20.0)"] -service-quotas = ["mypy-boto3-service-quotas (>=1.20.0)"] -servicecatalog = ["mypy-boto3-servicecatalog (>=1.20.0)"] -servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.20.0)"] -servicediscovery = ["mypy-boto3-servicediscovery (>=1.20.0)"] -ses = ["mypy-boto3-ses (>=1.20.0)"] -sesv2 = ["mypy-boto3-sesv2 (>=1.20.0)"] -shield = ["mypy-boto3-shield (>=1.20.0)"] -signer = ["mypy-boto3-signer (>=1.20.0)"] -sms = ["mypy-boto3-sms (>=1.20.0)"] -sms-voice = ["mypy-boto3-sms-voice (>=1.20.0)"] -snow-device-management = ["mypy-boto3-snow-device-management (>=1.20.0)"] -snowball = ["mypy-boto3-snowball (>=1.20.0)"] -sns = ["mypy-boto3-sns (>=1.20.0)"] -sqs = ["mypy-boto3-sqs (>=1.20.0)"] -ssm = ["mypy-boto3-ssm (>=1.20.0)"] -ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.20.0)"] -ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.20.0)"] -sso = ["mypy-boto3-sso (>=1.20.0)"] -sso-admin = ["mypy-boto3-sso-admin (>=1.20.0)"] -sso-oidc = ["mypy-boto3-sso-oidc (>=1.20.0)"] -stepfunctions = ["mypy-boto3-stepfunctions (>=1.20.0)"] -storagegateway = ["mypy-boto3-storagegateway (>=1.20.0)"] -sts = ["mypy-boto3-sts (>=1.20.0)"] -support = ["mypy-boto3-support (>=1.20.0)"] -swf = ["mypy-boto3-swf (>=1.20.0)"] -synthetics = ["mypy-boto3-synthetics (>=1.20.0)"] -textract = ["mypy-boto3-textract (>=1.20.0)"] -timestream-query = ["mypy-boto3-timestream-query (>=1.20.0)"] -timestream-write = ["mypy-boto3-timestream-write (>=1.20.0)"] -transcribe = ["mypy-boto3-transcribe (>=1.20.0)"] -transfer = ["mypy-boto3-transfer (>=1.20.0)"] -translate = ["mypy-boto3-translate (>=1.20.0)"] -voice-id = ["mypy-boto3-voice-id (>=1.20.0)"] -waf = ["mypy-boto3-waf (>=1.20.0)"] -waf-regional = ["mypy-boto3-waf-regional (>=1.20.0)"] -wafv2 = ["mypy-boto3-wafv2 (>=1.20.0)"] -wellarchitected = ["mypy-boto3-wellarchitected (>=1.20.0)"] -wisdom = ["mypy-boto3-wisdom (>=1.20.0)"] -workdocs = ["mypy-boto3-workdocs (>=1.20.0)"] -worklink = ["mypy-boto3-worklink (>=1.20.0)"] -workmail = ["mypy-boto3-workmail 
(>=1.20.0)"] -workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.20.0)"] -workspaces = ["mypy-boto3-workspaces (>=1.20.0)"] -workspaces-web = ["mypy-boto3-workspaces-web (>=1.20.0)"] -xray = ["mypy-boto3-xray (>=1.20.0)"] +accessanalyzer = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)"] +account = ["mypy-boto3-account (>=1.24.0,<1.25.0)"] +acm = ["mypy-boto3-acm (>=1.24.0,<1.25.0)"] +acm-pca = ["mypy-boto3-acm-pca (>=1.24.0,<1.25.0)"] +alexaforbusiness = ["mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)"] +all = ["mypy-boto3-accessanalyzer (>=1.24.0,<1.25.0)", "mypy-boto3-account (>=1.24.0,<1.25.0)", "mypy-boto3-acm (>=1.24.0,<1.25.0)", "mypy-boto3-acm-pca (>=1.24.0,<1.25.0)", "mypy-boto3-alexaforbusiness (>=1.24.0,<1.25.0)", "mypy-boto3-amp (>=1.24.0,<1.25.0)", "mypy-boto3-amplify (>=1.24.0,<1.25.0)", "mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)", "mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)", "mypy-boto3-apigateway (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)", "mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)", "mypy-boto3-appconfig (>=1.24.0,<1.25.0)", "mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)", "mypy-boto3-appflow (>=1.24.0,<1.25.0)", "mypy-boto3-appintegrations (>=1.24.0,<1.25.0)", "mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-application-insights (>=1.24.0,<1.25.0)", "mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-appmesh (>=1.24.0,<1.25.0)", "mypy-boto3-apprunner (>=1.24.0,<1.25.0)", "mypy-boto3-appstream (>=1.24.0,<1.25.0)", "mypy-boto3-appsync (>=1.24.0,<1.25.0)", "mypy-boto3-athena (>=1.24.0,<1.25.0)", "mypy-boto3-auditmanager (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling (>=1.24.0,<1.25.0)", "mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)", "mypy-boto3-backup (>=1.24.0,<1.25.0)", "mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)", "mypy-boto3-backupstorage (>=1.24.0,<1.25.0)", "mypy-boto3-batch (>=1.24.0,<1.25.0)", "mypy-boto3-billingconductor (>=1.24.0,<1.25.0)", "mypy-boto3-braket (>=1.24.0,<1.25.0)", "mypy-boto3-budgets (>=1.24.0,<1.25.0)", "mypy-boto3-ce (>=1.24.0,<1.25.0)", "mypy-boto3-chime (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)", "mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)", "mypy-boto3-cloud9 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)", "mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)", "mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-cloudfront (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)", "mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)", "mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)", "mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)", "mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)", "mypy-boto3-codeartifact (>=1.24.0,<1.25.0)", "mypy-boto3-codebuild (>=1.24.0,<1.25.0)", "mypy-boto3-codecommit (>=1.24.0,<1.25.0)", "mypy-boto3-codedeploy (>=1.24.0,<1.25.0)", "mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)", "mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)", "mypy-boto3-codepipeline (>=1.24.0,<1.25.0)", "mypy-boto3-codestar (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)", "mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)", "mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)", "mypy-boto3-comprehend (>=1.24.0,<1.25.0)", "mypy-boto3-comprehendmedical 
(>=1.24.0,<1.25.0)", "mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)", "mypy-boto3-config (>=1.24.0,<1.25.0)", "mypy-boto3-connect (>=1.24.0,<1.25.0)", "mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)", "mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)", "mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)", "mypy-boto3-cur (>=1.24.0,<1.25.0)", "mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)", "mypy-boto3-databrew (>=1.24.0,<1.25.0)", "mypy-boto3-dataexchange (>=1.24.0,<1.25.0)", "mypy-boto3-datapipeline (>=1.24.0,<1.25.0)", "mypy-boto3-datasync (>=1.24.0,<1.25.0)", "mypy-boto3-dax (>=1.24.0,<1.25.0)", "mypy-boto3-detective (>=1.24.0,<1.25.0)", "mypy-boto3-devicefarm (>=1.24.0,<1.25.0)", "mypy-boto3-devops-guru (>=1.24.0,<1.25.0)", "mypy-boto3-directconnect (>=1.24.0,<1.25.0)", "mypy-boto3-discovery (>=1.24.0,<1.25.0)", "mypy-boto3-dlm (>=1.24.0,<1.25.0)", "mypy-boto3-dms (>=1.24.0,<1.25.0)", "mypy-boto3-docdb (>=1.24.0,<1.25.0)", "mypy-boto3-drs (>=1.24.0,<1.25.0)", "mypy-boto3-ds (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)", "mypy-boto3-ebs (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)", "mypy-boto3-ecr (>=1.24.0,<1.25.0)", "mypy-boto3-ecr-public (>=1.24.0,<1.25.0)", "mypy-boto3-ecs (>=1.24.0,<1.25.0)", "mypy-boto3-efs (>=1.24.0,<1.25.0)", "mypy-boto3-eks (>=1.24.0,<1.25.0)", "mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)", "mypy-boto3-elasticache (>=1.24.0,<1.25.0)", "mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)", "mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)", "mypy-boto3-elb (>=1.24.0,<1.25.0)", "mypy-boto3-elbv2 (>=1.24.0,<1.25.0)", "mypy-boto3-emr (>=1.24.0,<1.25.0)", "mypy-boto3-emr-containers (>=1.24.0,<1.25.0)", "mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-es (>=1.24.0,<1.25.0)", "mypy-boto3-events (>=1.24.0,<1.25.0)", "mypy-boto3-evidently (>=1.24.0,<1.25.0)", "mypy-boto3-finspace (>=1.24.0,<1.25.0)", "mypy-boto3-finspace-data (>=1.24.0,<1.25.0)", "mypy-boto3-firehose (>=1.24.0,<1.25.0)", "mypy-boto3-fis (>=1.24.0,<1.25.0)", "mypy-boto3-fms (>=1.24.0,<1.25.0)", "mypy-boto3-forecast (>=1.24.0,<1.25.0)", "mypy-boto3-forecastquery (>=1.24.0,<1.25.0)", "mypy-boto3-frauddetector (>=1.24.0,<1.25.0)", "mypy-boto3-fsx (>=1.24.0,<1.25.0)", "mypy-boto3-gamelift (>=1.24.0,<1.25.0)", "mypy-boto3-gamesparks (>=1.24.0,<1.25.0)", "mypy-boto3-glacier (>=1.24.0,<1.25.0)", "mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)", "mypy-boto3-glue (>=1.24.0,<1.25.0)", "mypy-boto3-grafana (>=1.24.0,<1.25.0)", "mypy-boto3-greengrass (>=1.24.0,<1.25.0)", "mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)", "mypy-boto3-groundstation (>=1.24.0,<1.25.0)", "mypy-boto3-guardduty (>=1.24.0,<1.25.0)", "mypy-boto3-health (>=1.24.0,<1.25.0)", "mypy-boto3-healthlake (>=1.24.0,<1.25.0)", "mypy-boto3-honeycode (>=1.24.0,<1.25.0)", "mypy-boto3-iam (>=1.24.0,<1.25.0)", "mypy-boto3-identitystore (>=1.24.0,<1.25.0)", "mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)", "mypy-boto3-importexport (>=1.24.0,<1.25.0)", "mypy-boto3-inspector (>=1.24.0,<1.25.0)", "mypy-boto3-inspector2 (>=1.24.0,<1.25.0)", "mypy-boto3-iot (>=1.24.0,<1.25.0)", "mypy-boto3-iot-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)", "mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)", "mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents (>=1.24.0,<1.25.0)", "mypy-boto3-iotevents-data 
(>=1.24.0,<1.25.0)", "mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)", "mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)", "mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)", "mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)", "mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)", "mypy-boto3-iotwireless (>=1.24.0,<1.25.0)", "mypy-boto3-ivs (>=1.24.0,<1.25.0)", "mypy-boto3-ivschat (>=1.24.0,<1.25.0)", "mypy-boto3-kafka (>=1.24.0,<1.25.0)", "mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-kendra (>=1.24.0,<1.25.0)", "mypy-boto3-keyspaces (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)", "mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)", "mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)", "mypy-boto3-kms (>=1.24.0,<1.25.0)", "mypy-boto3-lakeformation (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-lex-models (>=1.24.0,<1.25.0)", "mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)", "mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager (>=1.24.0,<1.25.0)", "mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)", "mypy-boto3-lightsail (>=1.24.0,<1.25.0)", "mypy-boto3-location (>=1.24.0,<1.25.0)", "mypy-boto3-logs (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)", "mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)", "mypy-boto3-m2 (>=1.24.0,<1.25.0)", "mypy-boto3-machinelearning (>=1.24.0,<1.25.0)", "mypy-boto3-macie (>=1.24.0,<1.25.0)", "mypy-boto3-macie2 (>=1.24.0,<1.25.0)", "mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)", "mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)", "mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)", "mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)", "mypy-boto3-medialive (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage (>=1.24.0,<1.25.0)", "mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore (>=1.24.0,<1.25.0)", "mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)", "mypy-boto3-mediatailor (>=1.24.0,<1.25.0)", "mypy-boto3-memorydb (>=1.24.0,<1.25.0)", "mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)", "mypy-boto3-mgh (>=1.24.0,<1.25.0)", "mypy-boto3-mgn (>=1.24.0,<1.25.0)", "mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)", "mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)", "mypy-boto3-mobile (>=1.24.0,<1.25.0)", "mypy-boto3-mq (>=1.24.0,<1.25.0)", "mypy-boto3-mturk (>=1.24.0,<1.25.0)", "mypy-boto3-mwaa (>=1.24.0,<1.25.0)", "mypy-boto3-neptune (>=1.24.0,<1.25.0)", "mypy-boto3-network-firewall (>=1.24.0,<1.25.0)", "mypy-boto3-networkmanager (>=1.24.0,<1.25.0)", "mypy-boto3-nimble (>=1.24.0,<1.25.0)", "mypy-boto3-opensearch (>=1.24.0,<1.25.0)", "mypy-boto3-opsworks (>=1.24.0,<1.25.0)", "mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)", "mypy-boto3-organizations (>=1.24.0,<1.25.0)", "mypy-boto3-outposts (>=1.24.0,<1.25.0)", "mypy-boto3-panorama (>=1.24.0,<1.25.0)", "mypy-boto3-personalize (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-events (>=1.24.0,<1.25.0)", "mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-pi (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-email 
(>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)", "mypy-boto3-polly (>=1.24.0,<1.25.0)", "mypy-boto3-pricing (>=1.24.0,<1.25.0)", "mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)", "mypy-boto3-proton (>=1.24.0,<1.25.0)", "mypy-boto3-qldb (>=1.24.0,<1.25.0)", "mypy-boto3-qldb-session (>=1.24.0,<1.25.0)", "mypy-boto3-quicksight (>=1.24.0,<1.25.0)", "mypy-boto3-ram (>=1.24.0,<1.25.0)", "mypy-boto3-rbin (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-rds-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-data (>=1.24.0,<1.25.0)", "mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)", "mypy-boto3-rekognition (>=1.24.0,<1.25.0)", "mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)", "mypy-boto3-resource-groups (>=1.24.0,<1.25.0)", "mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)", "mypy-boto3-robomaker (>=1.24.0,<1.25.0)", "mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)", "mypy-boto3-route53 (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)", "mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)", "mypy-boto3-route53domains (>=1.24.0,<1.25.0)", "mypy-boto3-route53resolver (>=1.24.0,<1.25.0)", "mypy-boto3-rum (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-s3control (>=1.24.0,<1.25.0)", "mypy-boto3-s3outposts (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)", "mypy-boto3-savingsplans (>=1.24.0,<1.25.0)", "mypy-boto3-schemas (>=1.24.0,<1.25.0)", "mypy-boto3-sdb (>=1.24.0,<1.25.0)", "mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)", "mypy-boto3-securityhub (>=1.24.0,<1.25.0)", "mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)", "mypy-boto3-service-quotas (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)", "mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)", "mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)", "mypy-boto3-ses (>=1.24.0,<1.25.0)", "mypy-boto3-sesv2 (>=1.24.0,<1.25.0)", "mypy-boto3-shield (>=1.24.0,<1.25.0)", "mypy-boto3-signer (>=1.24.0,<1.25.0)", "mypy-boto3-sms (>=1.24.0,<1.25.0)", "mypy-boto3-sms-voice (>=1.24.0,<1.25.0)", "mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)", "mypy-boto3-snowball (>=1.24.0,<1.25.0)", "mypy-boto3-sns (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)", "mypy-boto3-ssm (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)", "mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)", "mypy-boto3-sso (>=1.24.0,<1.25.0)", "mypy-boto3-sso-admin (>=1.24.0,<1.25.0)", "mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)", "mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)", "mypy-boto3-storagegateway (>=1.24.0,<1.25.0)", "mypy-boto3-sts (>=1.24.0,<1.25.0)", "mypy-boto3-support (>=1.24.0,<1.25.0)", "mypy-boto3-support-app (>=1.24.0,<1.25.0)", "mypy-boto3-swf (>=1.24.0,<1.25.0)", "mypy-boto3-synthetics (>=1.24.0,<1.25.0)", "mypy-boto3-textract (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-query (>=1.24.0,<1.25.0)", "mypy-boto3-timestream-write (>=1.24.0,<1.25.0)", "mypy-boto3-transcribe (>=1.24.0,<1.25.0)", "mypy-boto3-transfer (>=1.24.0,<1.25.0)", "mypy-boto3-translate (>=1.24.0,<1.25.0)", "mypy-boto3-voice-id (>=1.24.0,<1.25.0)", "mypy-boto3-waf (>=1.24.0,<1.25.0)", 
"mypy-boto3-waf-regional (>=1.24.0,<1.25.0)", "mypy-boto3-wafv2 (>=1.24.0,<1.25.0)", "mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)", "mypy-boto3-wisdom (>=1.24.0,<1.25.0)", "mypy-boto3-workdocs (>=1.24.0,<1.25.0)", "mypy-boto3-worklink (>=1.24.0,<1.25.0)", "mypy-boto3-workmail (>=1.24.0,<1.25.0)", "mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces (>=1.24.0,<1.25.0)", "mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)", "mypy-boto3-xray (>=1.24.0,<1.25.0)"] +amp = ["mypy-boto3-amp (>=1.24.0,<1.25.0)"] +amplify = ["mypy-boto3-amplify (>=1.24.0,<1.25.0)"] +amplifybackend = ["mypy-boto3-amplifybackend (>=1.24.0,<1.25.0)"] +amplifyuibuilder = ["mypy-boto3-amplifyuibuilder (>=1.24.0,<1.25.0)"] +apigateway = ["mypy-boto3-apigateway (>=1.24.0,<1.25.0)"] +apigatewaymanagementapi = ["mypy-boto3-apigatewaymanagementapi (>=1.24.0,<1.25.0)"] +apigatewayv2 = ["mypy-boto3-apigatewayv2 (>=1.24.0,<1.25.0)"] +appconfig = ["mypy-boto3-appconfig (>=1.24.0,<1.25.0)"] +appconfigdata = ["mypy-boto3-appconfigdata (>=1.24.0,<1.25.0)"] +appflow = ["mypy-boto3-appflow (>=1.24.0,<1.25.0)"] +appintegrations = ["mypy-boto3-appintegrations (>=1.24.0,<1.25.0)"] +application-autoscaling = ["mypy-boto3-application-autoscaling (>=1.24.0,<1.25.0)"] +application-insights = ["mypy-boto3-application-insights (>=1.24.0,<1.25.0)"] +applicationcostprofiler = ["mypy-boto3-applicationcostprofiler (>=1.24.0,<1.25.0)"] +appmesh = ["mypy-boto3-appmesh (>=1.24.0,<1.25.0)"] +apprunner = ["mypy-boto3-apprunner (>=1.24.0,<1.25.0)"] +appstream = ["mypy-boto3-appstream (>=1.24.0,<1.25.0)"] +appsync = ["mypy-boto3-appsync (>=1.24.0,<1.25.0)"] +athena = ["mypy-boto3-athena (>=1.24.0,<1.25.0)"] +auditmanager = ["mypy-boto3-auditmanager (>=1.24.0,<1.25.0)"] +autoscaling = ["mypy-boto3-autoscaling (>=1.24.0,<1.25.0)"] +autoscaling-plans = ["mypy-boto3-autoscaling-plans (>=1.24.0,<1.25.0)"] +backup = ["mypy-boto3-backup (>=1.24.0,<1.25.0)"] +backup-gateway = ["mypy-boto3-backup-gateway (>=1.24.0,<1.25.0)"] +backupstorage = ["mypy-boto3-backupstorage (>=1.24.0,<1.25.0)"] +batch = ["mypy-boto3-batch (>=1.24.0,<1.25.0)"] +billingconductor = ["mypy-boto3-billingconductor (>=1.24.0,<1.25.0)"] +braket = ["mypy-boto3-braket (>=1.24.0,<1.25.0)"] +budgets = ["mypy-boto3-budgets (>=1.24.0,<1.25.0)"] +ce = ["mypy-boto3-ce (>=1.24.0,<1.25.0)"] +chime = ["mypy-boto3-chime (>=1.24.0,<1.25.0)"] +chime-sdk-identity = ["mypy-boto3-chime-sdk-identity (>=1.24.0,<1.25.0)"] +chime-sdk-media-pipelines = ["mypy-boto3-chime-sdk-media-pipelines (>=1.24.0,<1.25.0)"] +chime-sdk-meetings = ["mypy-boto3-chime-sdk-meetings (>=1.24.0,<1.25.0)"] +chime-sdk-messaging = ["mypy-boto3-chime-sdk-messaging (>=1.24.0,<1.25.0)"] +cloud9 = ["mypy-boto3-cloud9 (>=1.24.0,<1.25.0)"] +cloudcontrol = ["mypy-boto3-cloudcontrol (>=1.24.0,<1.25.0)"] +clouddirectory = ["mypy-boto3-clouddirectory (>=1.24.0,<1.25.0)"] +cloudformation = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)"] +cloudfront = ["mypy-boto3-cloudfront (>=1.24.0,<1.25.0)"] +cloudhsm = ["mypy-boto3-cloudhsm (>=1.24.0,<1.25.0)"] +cloudhsmv2 = ["mypy-boto3-cloudhsmv2 (>=1.24.0,<1.25.0)"] +cloudsearch = ["mypy-boto3-cloudsearch (>=1.24.0,<1.25.0)"] +cloudsearchdomain = ["mypy-boto3-cloudsearchdomain (>=1.24.0,<1.25.0)"] +cloudtrail = ["mypy-boto3-cloudtrail (>=1.24.0,<1.25.0)"] +cloudwatch = ["mypy-boto3-cloudwatch (>=1.24.0,<1.25.0)"] +codeartifact = ["mypy-boto3-codeartifact (>=1.24.0,<1.25.0)"] +codebuild = ["mypy-boto3-codebuild (>=1.24.0,<1.25.0)"] +codecommit = ["mypy-boto3-codecommit (>=1.24.0,<1.25.0)"] 
+codedeploy = ["mypy-boto3-codedeploy (>=1.24.0,<1.25.0)"] +codeguru-reviewer = ["mypy-boto3-codeguru-reviewer (>=1.24.0,<1.25.0)"] +codeguruprofiler = ["mypy-boto3-codeguruprofiler (>=1.24.0,<1.25.0)"] +codepipeline = ["mypy-boto3-codepipeline (>=1.24.0,<1.25.0)"] +codestar = ["mypy-boto3-codestar (>=1.24.0,<1.25.0)"] +codestar-connections = ["mypy-boto3-codestar-connections (>=1.24.0,<1.25.0)"] +codestar-notifications = ["mypy-boto3-codestar-notifications (>=1.24.0,<1.25.0)"] +cognito-identity = ["mypy-boto3-cognito-identity (>=1.24.0,<1.25.0)"] +cognito-idp = ["mypy-boto3-cognito-idp (>=1.24.0,<1.25.0)"] +cognito-sync = ["mypy-boto3-cognito-sync (>=1.24.0,<1.25.0)"] +comprehend = ["mypy-boto3-comprehend (>=1.24.0,<1.25.0)"] +comprehendmedical = ["mypy-boto3-comprehendmedical (>=1.24.0,<1.25.0)"] +compute-optimizer = ["mypy-boto3-compute-optimizer (>=1.24.0,<1.25.0)"] +config = ["mypy-boto3-config (>=1.24.0,<1.25.0)"] +connect = ["mypy-boto3-connect (>=1.24.0,<1.25.0)"] +connect-contact-lens = ["mypy-boto3-connect-contact-lens (>=1.24.0,<1.25.0)"] +connectcampaigns = ["mypy-boto3-connectcampaigns (>=1.24.0,<1.25.0)"] +connectparticipant = ["mypy-boto3-connectparticipant (>=1.24.0,<1.25.0)"] +cur = ["mypy-boto3-cur (>=1.24.0,<1.25.0)"] +customer-profiles = ["mypy-boto3-customer-profiles (>=1.24.0,<1.25.0)"] +databrew = ["mypy-boto3-databrew (>=1.24.0,<1.25.0)"] +dataexchange = ["mypy-boto3-dataexchange (>=1.24.0,<1.25.0)"] +datapipeline = ["mypy-boto3-datapipeline (>=1.24.0,<1.25.0)"] +datasync = ["mypy-boto3-datasync (>=1.24.0,<1.25.0)"] +dax = ["mypy-boto3-dax (>=1.24.0,<1.25.0)"] +detective = ["mypy-boto3-detective (>=1.24.0,<1.25.0)"] +devicefarm = ["mypy-boto3-devicefarm (>=1.24.0,<1.25.0)"] +devops-guru = ["mypy-boto3-devops-guru (>=1.24.0,<1.25.0)"] +directconnect = ["mypy-boto3-directconnect (>=1.24.0,<1.25.0)"] +discovery = ["mypy-boto3-discovery (>=1.24.0,<1.25.0)"] +dlm = ["mypy-boto3-dlm (>=1.24.0,<1.25.0)"] +dms = ["mypy-boto3-dms (>=1.24.0,<1.25.0)"] +docdb = ["mypy-boto3-docdb (>=1.24.0,<1.25.0)"] +drs = ["mypy-boto3-drs (>=1.24.0,<1.25.0)"] +ds = ["mypy-boto3-ds (>=1.24.0,<1.25.0)"] +dynamodb = ["mypy-boto3-dynamodb (>=1.24.0,<1.25.0)"] +dynamodbstreams = ["mypy-boto3-dynamodbstreams (>=1.24.0,<1.25.0)"] +ebs = ["mypy-boto3-ebs (>=1.24.0,<1.25.0)"] +ec2 = ["mypy-boto3-ec2 (>=1.24.0,<1.25.0)"] +ec2-instance-connect = ["mypy-boto3-ec2-instance-connect (>=1.24.0,<1.25.0)"] +ecr = ["mypy-boto3-ecr (>=1.24.0,<1.25.0)"] +ecr-public = ["mypy-boto3-ecr-public (>=1.24.0,<1.25.0)"] +ecs = ["mypy-boto3-ecs (>=1.24.0,<1.25.0)"] +efs = ["mypy-boto3-efs (>=1.24.0,<1.25.0)"] +eks = ["mypy-boto3-eks (>=1.24.0,<1.25.0)"] +elastic-inference = ["mypy-boto3-elastic-inference (>=1.24.0,<1.25.0)"] +elasticache = ["mypy-boto3-elasticache (>=1.24.0,<1.25.0)"] +elasticbeanstalk = ["mypy-boto3-elasticbeanstalk (>=1.24.0,<1.25.0)"] +elastictranscoder = ["mypy-boto3-elastictranscoder (>=1.24.0,<1.25.0)"] +elb = ["mypy-boto3-elb (>=1.24.0,<1.25.0)"] +elbv2 = ["mypy-boto3-elbv2 (>=1.24.0,<1.25.0)"] +emr = ["mypy-boto3-emr (>=1.24.0,<1.25.0)"] +emr-containers = ["mypy-boto3-emr-containers (>=1.24.0,<1.25.0)"] +emr-serverless = ["mypy-boto3-emr-serverless (>=1.24.0,<1.25.0)"] +es = ["mypy-boto3-es (>=1.24.0,<1.25.0)"] +essential = ["mypy-boto3-cloudformation (>=1.24.0,<1.25.0)", "mypy-boto3-dynamodb (>=1.24.0,<1.25.0)", "mypy-boto3-ec2 (>=1.24.0,<1.25.0)", "mypy-boto3-lambda (>=1.24.0,<1.25.0)", "mypy-boto3-rds (>=1.24.0,<1.25.0)", "mypy-boto3-s3 (>=1.24.0,<1.25.0)", "mypy-boto3-sqs (>=1.24.0,<1.25.0)"] 
+events = ["mypy-boto3-events (>=1.24.0,<1.25.0)"] +evidently = ["mypy-boto3-evidently (>=1.24.0,<1.25.0)"] +finspace = ["mypy-boto3-finspace (>=1.24.0,<1.25.0)"] +finspace-data = ["mypy-boto3-finspace-data (>=1.24.0,<1.25.0)"] +firehose = ["mypy-boto3-firehose (>=1.24.0,<1.25.0)"] +fis = ["mypy-boto3-fis (>=1.24.0,<1.25.0)"] +fms = ["mypy-boto3-fms (>=1.24.0,<1.25.0)"] +forecast = ["mypy-boto3-forecast (>=1.24.0,<1.25.0)"] +forecastquery = ["mypy-boto3-forecastquery (>=1.24.0,<1.25.0)"] +frauddetector = ["mypy-boto3-frauddetector (>=1.24.0,<1.25.0)"] +fsx = ["mypy-boto3-fsx (>=1.24.0,<1.25.0)"] +gamelift = ["mypy-boto3-gamelift (>=1.24.0,<1.25.0)"] +gamesparks = ["mypy-boto3-gamesparks (>=1.24.0,<1.25.0)"] +glacier = ["mypy-boto3-glacier (>=1.24.0,<1.25.0)"] +globalaccelerator = ["mypy-boto3-globalaccelerator (>=1.24.0,<1.25.0)"] +glue = ["mypy-boto3-glue (>=1.24.0,<1.25.0)"] +grafana = ["mypy-boto3-grafana (>=1.24.0,<1.25.0)"] +greengrass = ["mypy-boto3-greengrass (>=1.24.0,<1.25.0)"] +greengrassv2 = ["mypy-boto3-greengrassv2 (>=1.24.0,<1.25.0)"] +groundstation = ["mypy-boto3-groundstation (>=1.24.0,<1.25.0)"] +guardduty = ["mypy-boto3-guardduty (>=1.24.0,<1.25.0)"] +health = ["mypy-boto3-health (>=1.24.0,<1.25.0)"] +healthlake = ["mypy-boto3-healthlake (>=1.24.0,<1.25.0)"] +honeycode = ["mypy-boto3-honeycode (>=1.24.0,<1.25.0)"] +iam = ["mypy-boto3-iam (>=1.24.0,<1.25.0)"] +identitystore = ["mypy-boto3-identitystore (>=1.24.0,<1.25.0)"] +imagebuilder = ["mypy-boto3-imagebuilder (>=1.24.0,<1.25.0)"] +importexport = ["mypy-boto3-importexport (>=1.24.0,<1.25.0)"] +inspector = ["mypy-boto3-inspector (>=1.24.0,<1.25.0)"] +inspector2 = ["mypy-boto3-inspector2 (>=1.24.0,<1.25.0)"] +iot = ["mypy-boto3-iot (>=1.24.0,<1.25.0)"] +iot-data = ["mypy-boto3-iot-data (>=1.24.0,<1.25.0)"] +iot-jobs-data = ["mypy-boto3-iot-jobs-data (>=1.24.0,<1.25.0)"] +iot1click-devices = ["mypy-boto3-iot1click-devices (>=1.24.0,<1.25.0)"] +iot1click-projects = ["mypy-boto3-iot1click-projects (>=1.24.0,<1.25.0)"] +iotanalytics = ["mypy-boto3-iotanalytics (>=1.24.0,<1.25.0)"] +iotdeviceadvisor = ["mypy-boto3-iotdeviceadvisor (>=1.24.0,<1.25.0)"] +iotevents = ["mypy-boto3-iotevents (>=1.24.0,<1.25.0)"] +iotevents-data = ["mypy-boto3-iotevents-data (>=1.24.0,<1.25.0)"] +iotfleethub = ["mypy-boto3-iotfleethub (>=1.24.0,<1.25.0)"] +iotsecuretunneling = ["mypy-boto3-iotsecuretunneling (>=1.24.0,<1.25.0)"] +iotsitewise = ["mypy-boto3-iotsitewise (>=1.24.0,<1.25.0)"] +iotthingsgraph = ["mypy-boto3-iotthingsgraph (>=1.24.0,<1.25.0)"] +iottwinmaker = ["mypy-boto3-iottwinmaker (>=1.24.0,<1.25.0)"] +iotwireless = ["mypy-boto3-iotwireless (>=1.24.0,<1.25.0)"] +ivs = ["mypy-boto3-ivs (>=1.24.0,<1.25.0)"] +ivschat = ["mypy-boto3-ivschat (>=1.24.0,<1.25.0)"] +kafka = ["mypy-boto3-kafka (>=1.24.0,<1.25.0)"] +kafkaconnect = ["mypy-boto3-kafkaconnect (>=1.24.0,<1.25.0)"] +kendra = ["mypy-boto3-kendra (>=1.24.0,<1.25.0)"] +keyspaces = ["mypy-boto3-keyspaces (>=1.24.0,<1.25.0)"] +kinesis = ["mypy-boto3-kinesis (>=1.24.0,<1.25.0)"] +kinesis-video-archived-media = ["mypy-boto3-kinesis-video-archived-media (>=1.24.0,<1.25.0)"] +kinesis-video-media = ["mypy-boto3-kinesis-video-media (>=1.24.0,<1.25.0)"] +kinesis-video-signaling = ["mypy-boto3-kinesis-video-signaling (>=1.24.0,<1.25.0)"] +kinesisanalytics = ["mypy-boto3-kinesisanalytics (>=1.24.0,<1.25.0)"] +kinesisanalyticsv2 = ["mypy-boto3-kinesisanalyticsv2 (>=1.24.0,<1.25.0)"] +kinesisvideo = ["mypy-boto3-kinesisvideo (>=1.24.0,<1.25.0)"] +kms = ["mypy-boto3-kms (>=1.24.0,<1.25.0)"] 
+lakeformation = ["mypy-boto3-lakeformation (>=1.24.0,<1.25.0)"] +lambda = ["mypy-boto3-lambda (>=1.24.0,<1.25.0)"] +lex-models = ["mypy-boto3-lex-models (>=1.24.0,<1.25.0)"] +lex-runtime = ["mypy-boto3-lex-runtime (>=1.24.0,<1.25.0)"] +lexv2-models = ["mypy-boto3-lexv2-models (>=1.24.0,<1.25.0)"] +lexv2-runtime = ["mypy-boto3-lexv2-runtime (>=1.24.0,<1.25.0)"] +license-manager = ["mypy-boto3-license-manager (>=1.24.0,<1.25.0)"] +license-manager-user-subscriptions = ["mypy-boto3-license-manager-user-subscriptions (>=1.24.0,<1.25.0)"] +lightsail = ["mypy-boto3-lightsail (>=1.24.0,<1.25.0)"] +location = ["mypy-boto3-location (>=1.24.0,<1.25.0)"] +logs = ["mypy-boto3-logs (>=1.24.0,<1.25.0)"] +lookoutequipment = ["mypy-boto3-lookoutequipment (>=1.24.0,<1.25.0)"] +lookoutmetrics = ["mypy-boto3-lookoutmetrics (>=1.24.0,<1.25.0)"] +lookoutvision = ["mypy-boto3-lookoutvision (>=1.24.0,<1.25.0)"] +m2 = ["mypy-boto3-m2 (>=1.24.0,<1.25.0)"] +machinelearning = ["mypy-boto3-machinelearning (>=1.24.0,<1.25.0)"] +macie = ["mypy-boto3-macie (>=1.24.0,<1.25.0)"] +macie2 = ["mypy-boto3-macie2 (>=1.24.0,<1.25.0)"] +managedblockchain = ["mypy-boto3-managedblockchain (>=1.24.0,<1.25.0)"] +marketplace-catalog = ["mypy-boto3-marketplace-catalog (>=1.24.0,<1.25.0)"] +marketplace-entitlement = ["mypy-boto3-marketplace-entitlement (>=1.24.0,<1.25.0)"] +marketplacecommerceanalytics = ["mypy-boto3-marketplacecommerceanalytics (>=1.24.0,<1.25.0)"] +mediaconnect = ["mypy-boto3-mediaconnect (>=1.24.0,<1.25.0)"] +mediaconvert = ["mypy-boto3-mediaconvert (>=1.24.0,<1.25.0)"] +medialive = ["mypy-boto3-medialive (>=1.24.0,<1.25.0)"] +mediapackage = ["mypy-boto3-mediapackage (>=1.24.0,<1.25.0)"] +mediapackage-vod = ["mypy-boto3-mediapackage-vod (>=1.24.0,<1.25.0)"] +mediastore = ["mypy-boto3-mediastore (>=1.24.0,<1.25.0)"] +mediastore-data = ["mypy-boto3-mediastore-data (>=1.24.0,<1.25.0)"] +mediatailor = ["mypy-boto3-mediatailor (>=1.24.0,<1.25.0)"] +memorydb = ["mypy-boto3-memorydb (>=1.24.0,<1.25.0)"] +meteringmarketplace = ["mypy-boto3-meteringmarketplace (>=1.24.0,<1.25.0)"] +mgh = ["mypy-boto3-mgh (>=1.24.0,<1.25.0)"] +mgn = ["mypy-boto3-mgn (>=1.24.0,<1.25.0)"] +migration-hub-refactor-spaces = ["mypy-boto3-migration-hub-refactor-spaces (>=1.24.0,<1.25.0)"] +migrationhub-config = ["mypy-boto3-migrationhub-config (>=1.24.0,<1.25.0)"] +migrationhubstrategy = ["mypy-boto3-migrationhubstrategy (>=1.24.0,<1.25.0)"] +mobile = ["mypy-boto3-mobile (>=1.24.0,<1.25.0)"] +mq = ["mypy-boto3-mq (>=1.24.0,<1.25.0)"] +mturk = ["mypy-boto3-mturk (>=1.24.0,<1.25.0)"] +mwaa = ["mypy-boto3-mwaa (>=1.24.0,<1.25.0)"] +neptune = ["mypy-boto3-neptune (>=1.24.0,<1.25.0)"] +network-firewall = ["mypy-boto3-network-firewall (>=1.24.0,<1.25.0)"] +networkmanager = ["mypy-boto3-networkmanager (>=1.24.0,<1.25.0)"] +nimble = ["mypy-boto3-nimble (>=1.24.0,<1.25.0)"] +opensearch = ["mypy-boto3-opensearch (>=1.24.0,<1.25.0)"] +opsworks = ["mypy-boto3-opsworks (>=1.24.0,<1.25.0)"] +opsworkscm = ["mypy-boto3-opsworkscm (>=1.24.0,<1.25.0)"] +organizations = ["mypy-boto3-organizations (>=1.24.0,<1.25.0)"] +outposts = ["mypy-boto3-outposts (>=1.24.0,<1.25.0)"] +panorama = ["mypy-boto3-panorama (>=1.24.0,<1.25.0)"] +personalize = ["mypy-boto3-personalize (>=1.24.0,<1.25.0)"] +personalize-events = ["mypy-boto3-personalize-events (>=1.24.0,<1.25.0)"] +personalize-runtime = ["mypy-boto3-personalize-runtime (>=1.24.0,<1.25.0)"] +pi = ["mypy-boto3-pi (>=1.24.0,<1.25.0)"] +pinpoint = ["mypy-boto3-pinpoint (>=1.24.0,<1.25.0)"] +pinpoint-email = 
["mypy-boto3-pinpoint-email (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice = ["mypy-boto3-pinpoint-sms-voice (>=1.24.0,<1.25.0)"] +pinpoint-sms-voice-v2 = ["mypy-boto3-pinpoint-sms-voice-v2 (>=1.24.0,<1.25.0)"] +polly = ["mypy-boto3-polly (>=1.24.0,<1.25.0)"] +pricing = ["mypy-boto3-pricing (>=1.24.0,<1.25.0)"] +privatenetworks = ["mypy-boto3-privatenetworks (>=1.24.0,<1.25.0)"] +proton = ["mypy-boto3-proton (>=1.24.0,<1.25.0)"] +qldb = ["mypy-boto3-qldb (>=1.24.0,<1.25.0)"] +qldb-session = ["mypy-boto3-qldb-session (>=1.24.0,<1.25.0)"] +quicksight = ["mypy-boto3-quicksight (>=1.24.0,<1.25.0)"] +ram = ["mypy-boto3-ram (>=1.24.0,<1.25.0)"] +rbin = ["mypy-boto3-rbin (>=1.24.0,<1.25.0)"] +rds = ["mypy-boto3-rds (>=1.24.0,<1.25.0)"] +rds-data = ["mypy-boto3-rds-data (>=1.24.0,<1.25.0)"] +redshift = ["mypy-boto3-redshift (>=1.24.0,<1.25.0)"] +redshift-data = ["mypy-boto3-redshift-data (>=1.24.0,<1.25.0)"] +redshift-serverless = ["mypy-boto3-redshift-serverless (>=1.24.0,<1.25.0)"] +rekognition = ["mypy-boto3-rekognition (>=1.24.0,<1.25.0)"] +resiliencehub = ["mypy-boto3-resiliencehub (>=1.24.0,<1.25.0)"] +resource-groups = ["mypy-boto3-resource-groups (>=1.24.0,<1.25.0)"] +resourcegroupstaggingapi = ["mypy-boto3-resourcegroupstaggingapi (>=1.24.0,<1.25.0)"] +robomaker = ["mypy-boto3-robomaker (>=1.24.0,<1.25.0)"] +rolesanywhere = ["mypy-boto3-rolesanywhere (>=1.24.0,<1.25.0)"] +route53 = ["mypy-boto3-route53 (>=1.24.0,<1.25.0)"] +route53-recovery-cluster = ["mypy-boto3-route53-recovery-cluster (>=1.24.0,<1.25.0)"] +route53-recovery-control-config = ["mypy-boto3-route53-recovery-control-config (>=1.24.0,<1.25.0)"] +route53-recovery-readiness = ["mypy-boto3-route53-recovery-readiness (>=1.24.0,<1.25.0)"] +route53domains = ["mypy-boto3-route53domains (>=1.24.0,<1.25.0)"] +route53resolver = ["mypy-boto3-route53resolver (>=1.24.0,<1.25.0)"] +rum = ["mypy-boto3-rum (>=1.24.0,<1.25.0)"] +s3 = ["mypy-boto3-s3 (>=1.24.0,<1.25.0)"] +s3control = ["mypy-boto3-s3control (>=1.24.0,<1.25.0)"] +s3outposts = ["mypy-boto3-s3outposts (>=1.24.0,<1.25.0)"] +sagemaker = ["mypy-boto3-sagemaker (>=1.24.0,<1.25.0)"] +sagemaker-a2i-runtime = ["mypy-boto3-sagemaker-a2i-runtime (>=1.24.0,<1.25.0)"] +sagemaker-edge = ["mypy-boto3-sagemaker-edge (>=1.24.0,<1.25.0)"] +sagemaker-featurestore-runtime = ["mypy-boto3-sagemaker-featurestore-runtime (>=1.24.0,<1.25.0)"] +sagemaker-runtime = ["mypy-boto3-sagemaker-runtime (>=1.24.0,<1.25.0)"] +savingsplans = ["mypy-boto3-savingsplans (>=1.24.0,<1.25.0)"] +schemas = ["mypy-boto3-schemas (>=1.24.0,<1.25.0)"] +sdb = ["mypy-boto3-sdb (>=1.24.0,<1.25.0)"] +secretsmanager = ["mypy-boto3-secretsmanager (>=1.24.0,<1.25.0)"] +securityhub = ["mypy-boto3-securityhub (>=1.24.0,<1.25.0)"] +serverlessrepo = ["mypy-boto3-serverlessrepo (>=1.24.0,<1.25.0)"] +service-quotas = ["mypy-boto3-service-quotas (>=1.24.0,<1.25.0)"] +servicecatalog = ["mypy-boto3-servicecatalog (>=1.24.0,<1.25.0)"] +servicecatalog-appregistry = ["mypy-boto3-servicecatalog-appregistry (>=1.24.0,<1.25.0)"] +servicediscovery = ["mypy-boto3-servicediscovery (>=1.24.0,<1.25.0)"] +ses = ["mypy-boto3-ses (>=1.24.0,<1.25.0)"] +sesv2 = ["mypy-boto3-sesv2 (>=1.24.0,<1.25.0)"] +shield = ["mypy-boto3-shield (>=1.24.0,<1.25.0)"] +signer = ["mypy-boto3-signer (>=1.24.0,<1.25.0)"] +sms = ["mypy-boto3-sms (>=1.24.0,<1.25.0)"] +sms-voice = ["mypy-boto3-sms-voice (>=1.24.0,<1.25.0)"] +snow-device-management = ["mypy-boto3-snow-device-management (>=1.24.0,<1.25.0)"] +snowball = ["mypy-boto3-snowball (>=1.24.0,<1.25.0)"] +sns = ["mypy-boto3-sns 
(>=1.24.0,<1.25.0)"] +sqs = ["mypy-boto3-sqs (>=1.24.0,<1.25.0)"] +ssm = ["mypy-boto3-ssm (>=1.24.0,<1.25.0)"] +ssm-contacts = ["mypy-boto3-ssm-contacts (>=1.24.0,<1.25.0)"] +ssm-incidents = ["mypy-boto3-ssm-incidents (>=1.24.0,<1.25.0)"] +sso = ["mypy-boto3-sso (>=1.24.0,<1.25.0)"] +sso-admin = ["mypy-boto3-sso-admin (>=1.24.0,<1.25.0)"] +sso-oidc = ["mypy-boto3-sso-oidc (>=1.24.0,<1.25.0)"] +stepfunctions = ["mypy-boto3-stepfunctions (>=1.24.0,<1.25.0)"] +storagegateway = ["mypy-boto3-storagegateway (>=1.24.0,<1.25.0)"] +sts = ["mypy-boto3-sts (>=1.24.0,<1.25.0)"] +support = ["mypy-boto3-support (>=1.24.0,<1.25.0)"] +support-app = ["mypy-boto3-support-app (>=1.24.0,<1.25.0)"] +swf = ["mypy-boto3-swf (>=1.24.0,<1.25.0)"] +synthetics = ["mypy-boto3-synthetics (>=1.24.0,<1.25.0)"] +textract = ["mypy-boto3-textract (>=1.24.0,<1.25.0)"] +timestream-query = ["mypy-boto3-timestream-query (>=1.24.0,<1.25.0)"] +timestream-write = ["mypy-boto3-timestream-write (>=1.24.0,<1.25.0)"] +transcribe = ["mypy-boto3-transcribe (>=1.24.0,<1.25.0)"] +transfer = ["mypy-boto3-transfer (>=1.24.0,<1.25.0)"] +translate = ["mypy-boto3-translate (>=1.24.0,<1.25.0)"] +voice-id = ["mypy-boto3-voice-id (>=1.24.0,<1.25.0)"] +waf = ["mypy-boto3-waf (>=1.24.0,<1.25.0)"] +waf-regional = ["mypy-boto3-waf-regional (>=1.24.0,<1.25.0)"] +wafv2 = ["mypy-boto3-wafv2 (>=1.24.0,<1.25.0)"] +wellarchitected = ["mypy-boto3-wellarchitected (>=1.24.0,<1.25.0)"] +wisdom = ["mypy-boto3-wisdom (>=1.24.0,<1.25.0)"] +workdocs = ["mypy-boto3-workdocs (>=1.24.0,<1.25.0)"] +worklink = ["mypy-boto3-worklink (>=1.24.0,<1.25.0)"] +workmail = ["mypy-boto3-workmail (>=1.24.0,<1.25.0)"] +workmailmessageflow = ["mypy-boto3-workmailmessageflow (>=1.24.0,<1.25.0)"] +workspaces = ["mypy-boto3-workspaces (>=1.24.0,<1.25.0)"] +workspaces-web = ["mypy-boto3-workspaces-web (>=1.24.0,<1.25.0)"] +xray = ["mypy-boto3-xray (>=1.24.0,<1.25.0)"] [[package]] name = "botocore" -version = "1.23.40" +version = "1.27.38" description = "Low-level, data-driven core of boto 3." category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] -jmespath = ">=0.7.1,<1.0.0" +jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.12.5)"] +crt = ["awscrt (==0.13.8)"] [[package]] name = "botocore-stubs" -version = "1.23.40" -description = "Type annotations for botocore 1.23.40, generated by mypy-boto3-builder 6.3.2" +version = "1.27.38" +description = "Type annotations for botocore 1.27.38 generated with mypy-boto3-builder 7.10.1" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.1.0" + +[[package]] +name = "certifi" +version = "2022.6.15" +description = "Python package for providing Mozilla's CA Bundle." category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -typing-extensions = {version = "*", markers = "python_version < \"3.9\""} - -[[package]] -name = "cached-property" -version = "1.5.2" -description = "A decorator for caching properties in classes." -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "certifi" -version = "2021.10.8" -description = "Python package for providing Mozilla's CA Bundle." -category = "main" -optional = false -python-versions = "*" - [[package]] name = "cffi" -version = "1.15.0" +version = "1.15.1" description = "Foreign Function Interface for Python calling C code." 
category = "main" optional = false @@ -486,14 +535,14 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.57.0" +version = "0.61.3" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" category = "main" optional = false python-versions = ">=3.6, <=4.0, !=4.0" [package.dependencies] -aws-sam-translator = ">=1.42.0" +aws-sam-translator = ">=1.47.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" jsonschema = ">=3.0,<4.0" @@ -501,34 +550,32 @@ junit-xml = ">=1.9,<2.0" networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" -six = ">=1.11" [[package]] name = "charset-normalizer" -version = "2.0.10" +version = "2.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false -python-versions = ">=3.5.0" +python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" -version = "8.0.3" +version = "8.1.3" description = "Composable command line interface toolkit" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" -version = "0.4.4" +version = "0.4.5" description = "Cross-platform colored terminal text." category = "main" optional = false @@ -536,7 +583,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "cryptography" -version = "36.0.1" +version = "38.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." category = "main" optional = false @@ -547,32 +594,33 @@ cffi = ">=1.12" [package.extras] docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] -docstest = ["pyenchant (>=1.6.11)", "twine (>=1.12.0)", "sphinxcontrib-spelling (>=4.0.1)"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] -sdist = ["setuptools_rust (>=0.11.4)"] +sdist = ["setuptools-rust (>=0.11.4)"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pretend", "iso8601", "pytz", "hypothesis (>=1.11.4,!=3.79.2)"] +test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", "pytest-xdist", "pytz"] [[package]] name = "docker" -version = "5.0.3" +version = "4.2.2" description = "A Python library for the Docker Engine API." 
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.dependencies] -pywin32 = {version = "227", markers = "sys_platform == \"win32\""} +pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} requests = ">=2.14.2,<2.18.0 || >2.18.0" +six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"] +tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] [[package]] name = "ecdsa" -version = "0.17.0" +version = "0.18.0" description = "ECDSA cryptographic signature library (pure python)" category = "main" optional = false @@ -598,28 +646,28 @@ testing = ["pre-commit"] [[package]] name = "flake8" -version = "3.9.2" +version = "5.0.4" description = "the modular source code checker: pep8 pyflakes and co" category = "dev" optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +python-versions = ">=3.6.1" [package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} -mccabe = ">=0.6.0,<0.7.0" -pycodestyle = ">=2.7.0,<2.8.0" -pyflakes = ">=2.3.0,<2.4.0" +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.9.0,<2.10.0" +pyflakes = ">=2.5.0,<2.6.0" [[package]] name = "flask" -version = "2.0.2" +version = "2.1.3" description = "A simple framework for building complex web applications." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] -click = ">=7.1.2" +click = ">=8.0" +importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = ">=2.0" Jinja2 = ">=3.0" Werkzeug = ">=2.0" @@ -640,17 +688,9 @@ python-versions = "*" Flask = ">=0.9" Six = "*" -[[package]] -name = "future" -version = "0.18.2" -description = "Clean single-source support for Python 3 and 2" -category = "main" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - [[package]] name = "graphql-core" -version = "3.2.0" +version = "3.2.1" description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." 
category = "main" optional = false @@ -666,20 +706,19 @@ python-versions = ">=3.5" [[package]] name = "importlib-metadata" -version = "4.10.1" +version = "4.12.0" description = "Read metadata from Python packages" category = "main" optional = false python-versions = ">=3.7" [package.dependencies] -typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] +docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] [[package]] name = "iniconfig" @@ -689,21 +728,35 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "isort" +version = "5.10.1" +description = "A Python utility / library to sort Python imports." +category = "dev" +optional = false +python-versions = ">=3.6.1,<4.0" + +[package.extras] +colors = ["colorama (>=0.4.3,<0.5.0)"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + [[package]] name = "itsdangerous" -version = "2.0.1" +version = "2.1.2" description = "Safely pass data to untrusted environments and back." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "jinja2" -version = "3.0.3" +version = "3.1.2" description = "A very fast and expressive template engine." 
category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] MarkupSafe = ">=2.0" @@ -713,11 +766,11 @@ i18n = ["Babel (>=2.7)"] [[package]] name = "jmespath" -version = "0.10.0" +version = "1.0.1" description = "JSON Matching Expressions" category = "main" optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +python-versions = ">=3.7" [[package]] name = "jschema-to-python" @@ -734,7 +787,7 @@ pbr = "*" [[package]] name = "jsondiff" -version = "1.3.0" +version = "2.0.0" description = "Diff JSON and JSON-like structures in Python" category = "main" optional = false @@ -753,23 +806,20 @@ jsonpointer = ">=1.9" [[package]] name = "jsonpickle" -version = "2.1.0" +version = "2.2.0" description = "Python library for serializing any arbitrary object graph into JSON" category = "main" optional = false python-versions = ">=2.7" -[package.dependencies] -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} - [package.extras] -docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "enum34", "jsonlib"] -"testing.libs" = ["demjson", "simplejson", "ujson", "yajl"] +docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] +testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" -version = "2.2" +version = "2.3" description = "Identify specific nodes in a JSON document (RFC 6901)" category = "main" optional = false @@ -785,13 +835,13 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} pyrsistent = ">=0.14.0" +setuptools = "*" six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "webcolors", "rfc3986-validator (>0.1.0)", "rfc3339-validator"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -806,23 +856,23 @@ six = "*" [[package]] name = "markupsafe" -version = "2.0.1" +version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." 
category = "main" optional = false +python-versions = ">=3.7" + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false python-versions = ">=3.6" -[[package]] -name = "mccabe" -version = "0.6.1" -description = "McCabe checker, plugin for flake8" -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "moto" -version = "3.0.4" +version = "3.1.18" description = "A library that allows your python tests to easily mock out the boto library" category = "main" optional = false @@ -836,62 +886,78 @@ cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\" cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "*", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} -importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} MarkupSafe = "!=2.0.0a1" +openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" +setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = "*" +werkzeug = ">=0.5,<2.2.0" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools"] -apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"] -cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", 
"setuptools", "sshpubkeys (>=3.1.0)"] +cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] +dynamodb = ["docker (>=2.5.1)"] dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] +ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools", "flask", "flask-cors"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" -version = "0.910" +version = "0.971" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" [package.dependencies] -mypy-extensions = ">=0.4.3,<0.5.0" -toml = "*" -typed-ast = {version = ">=1.4.0,<1.5.0", markers = "python_version < \"3.8\""} -typing-extensions = ">=3.7.4" +mypy-extensions = ">=0.4.3" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] -python2 = ["typed-ast (>=1.4.0,<1.5.0)"] +python2 = ["typed-ast (>=1.4.0,<2)"] +reports = ["lxml"] + +[[package]] +name = "mypy-boto3-s3" +version = "1.24.36.post1" +description = "Type annotations for boto3.S3 1.24.36 service generated with mypy-boto3-builder 7.10.0" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.1.0" [[package]] name = "mypy-extensions" @@ -903,18 +969,51 @@ python-versions = "*" [[package]] name = "networkx" -version = "2.6.3" +version = "2.8.5" description = "Python package for creating and manipulating graphs and networks" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [package.extras] -default = ["numpy (>=1.19)", "scipy (>=1.5,!=1.6.1)", "matplotlib (>=3.3)", "pandas (>=1.1)"] -developer = ["black (==21.5b1)", "pre-commit (>=2.12)"] -doc = ["sphinx (>=4.0,<5.0)", "pydata-sphinx-theme (>=0.6,<1.0)", "sphinx-gallery (>=0.9,<1.0)", "numpydoc (>=1.1)", "pillow (>=8.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.5)", "pygraphviz (>=1.7)", "pydot (>=1.4.1)"] -test = ["pytest (>=6.2)", "pytest-cov (>=2.12)", "codecov (>=2.1)"] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.960)", "pre-commit (>=2.19)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.4)", "pillow (>=9.1)", "pydata-sphinx-theme (>=0.9)", "sphinx (>=5)", "sphinx-gallery (>=0.10)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] + +[[package]] +name = "openapi-schema-validator" +version = "0.2.3" +description = "OpenAPI schema validation for Python" +category = "main" 
+optional = false +python-versions = ">=3.7.0,<4.0.0" + +[package.dependencies] +jsonschema = ">=3.0.0,<5.0.0" + +[package.extras] +isodate = ["isodate"] +rfc3339-validator = ["rfc3339-validator"] +strict-rfc3339 = ["strict-rfc3339"] + +[[package]] +name = "openapi-spec-validator" +version = "0.4.0" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +category = "main" +optional = false +python-versions = ">=3.7.0,<4.0.0" + +[package.dependencies] +jsonschema = ">=3.2.0,<5.0.0" +openapi-schema-validator = ">=0.2.0,<0.3.0" +PyYAML = ">=5.1" +setuptools = "*" + +[package.extras] +requests = ["requests"] [[package]] name = "packaging" @@ -927,14 +1026,34 @@ python-versions = ">=3.6" [package.dependencies] pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" +[[package]] +name = "pathspec" +version = "0.9.0" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" + [[package]] name = "pbr" -version = "5.8.0" +version = "5.9.0" description = "Python Build Reasonableness" category = "main" optional = false python-versions = ">=2.6" +[[package]] +name = "platformdirs" +version = "2.5.2" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] +test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] + [[package]] name = "pluggy" version = "1.0.0" @@ -943,13 +1062,32 @@ category = "main" optional = false python-versions = ">=3.6" -[package.dependencies] -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} - [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prometheus-client" +version = "0.14.1" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +twisted = ["twisted"] + +[[package]] +name = "psutil" +version = "5.9.4" +description = "Cross-platform lib for process and system monitoring in Python." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + [[package]] name = "psycopg2-binary" version = "2.9.3" @@ -976,11 +1114,11 @@ python-versions = "*" [[package]] name = "pycodestyle" -version = "2.7.0" +version = "2.9.1" description = "Python style guide checker" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pycparser" @@ -992,15 +1130,15 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "pyflakes" -version = "2.3.1" +version = "2.5.0" description = "passive checker of Python programs" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [[package]] name = "pyjwt" -version = "2.3.0" +version = "2.4.0" description = "JSON Web Token implementation in Python" category = "main" optional = false @@ -1011,21 +1149,32 @@ cryptography = {version = ">=3.3.1", optional = true, markers = "extra == \"cryp [package.extras] crypto = ["cryptography (>=3.3.1)"] -dev = ["sphinx", "sphinx-rtd-theme", "zope.interface", "cryptography (>=3.3.1)", "pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)", "mypy", "pre-commit"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.3.1)", "mypy", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] -tests = ["pytest (>=6.0.0,<7.0.0)", "coverage[toml] (==5.0.4)"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pyparsing" -version = "3.0.6" -description = "Python parsing module" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.6.8" [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypiwin32" +version = "223" +description = "" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pywin32 = ">=223" + [[package]] name = "pyrsistent" version = "0.18.1" @@ -1046,7 +1195,6 @@ python-versions = ">=3.6" atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" @@ -1057,20 +1205,58 @@ toml = "*" testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] [[package]] -name = "pytest-forked" -version = "1.4.0" -description = "run tests in isolated forked subprocesses" +name = "pytest-asyncio" +version = "0.19.0" +description = "Pytest support for asyncio" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +pytest = ">=6.1.0" + +[package.extras] +testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] + +[[package]] +name = "pytest-lazy-fixture" +version = "0.6.3" +description = "It helps to use fixtures in pytest.mark.parametrize" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pytest = ">=3.2.5" + +[[package]] +name = "pytest-order" +version = "1.0.1" +description = "pytest plugin to run your tests in a specific 
order" category = "main" optional = false python-versions = ">=3.6" [package.dependencies] -py = "*" -pytest = ">=3.10" +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + +[[package]] +name = "pytest-timeout" +version = "2.1.0" +description = "pytest plugin to abort hanging tests" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pytest = ">=5.0.0" [[package]] name = "pytest-xdist" -version = "2.5.0" +version = "3.0.2" description = "pytest xdist plugin for distributed testing and loop-on-failing modes" category = "main" optional = false @@ -1079,7 +1265,6 @@ python-versions = ">=3.6" [package.dependencies] execnet = ">=1.1" pytest = ">=6.2.0" -pytest-forked = "*" [package.extras] psutil = ["psutil (>=3.0)"] @@ -1113,12 +1298,12 @@ rsa = "*" [package.extras] cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pycrypto (>=2.6.0,<2.7.0)", "pyasn1"] -pycryptodome = ["pycryptodome (>=3.3.1,<4.0.0)", "pyasn1"] +pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] +pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] [[package]] name = "pytz" -version = "2021.3" +version = "2022.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -1126,7 +1311,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "227" +version = "301" description = "Python for Window Extensions" category = "main" optional = false @@ -1142,41 +1327,40 @@ python-versions = ">=3.6" [[package]] name = "requests" -version = "2.27.1" +version = "2.28.1" description = "Python HTTP for Humans." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.7, <4" [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} -idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" urllib3 = ">=1.21.1,<1.27" [package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" -version = "0.17.0" +version = "0.21.0" description = "A utility library for mocking out the `requests` Python library." 
category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7" [package.dependencies] -requests = ">=2.0" -six = "*" +requests = ">=2.0,<3.0" urllib3 = ">=1.25.10" [package.extras] -tests = ["coverage (>=3.7.1,<6.0.0)", "pytest-cov", "pytest-localserver", "flake8", "types-mock", "types-requests", "types-six", "pytest (>=4.6,<5.0)", "pytest (>=4.6)", "mypy"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] name = "rsa" -version = "4.8" +version = "4.9" description = "Pure-Python RSA implementation" category = "main" optional = false @@ -1187,11 +1371,11 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.5.0" +version = "0.6.0" description = "An Amazon S3 Transfer Manager" category = "main" optional = false -python-versions = ">= 3.6" +python-versions = ">= 3.7" [package.dependencies] botocore = ">=1.12.36,<2.0a.0" @@ -1211,6 +1395,19 @@ python-versions = ">= 2.7" attrs = "*" pbr = "*" +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -1243,16 +1440,24 @@ optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" [[package]] -name = "typed-ast" -version = "1.4.3" -description = "a fork of Python 2 and 3 ast modules with type comment support" +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" category = "dev" optional = false +python-versions = ">=3.7" + +[[package]] +name = "types-psutil" +version = "5.9.5.4" +description = "Typing stubs for psutil" +category = "main" +optional = false python-versions = "*" [[package]] name = "types-psycopg2" -version = "2.9.6" +version = "2.9.18" description = "Typing stubs for psycopg2" category = "main" optional = false @@ -1260,7 +1465,7 @@ python-versions = "*" [[package]] name = "types-requests" -version = "2.27.7" +version = "2.28.5" description = "Typing stubs for requests" category = "main" optional = false @@ -1269,9 +1474,25 @@ python-versions = "*" [package.dependencies] types-urllib3 = "<1.27" +[[package]] +name = "types-s3transfer" +version = "0.6.0.post3" +description = "Type annotations and code completion for s3transfer" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[[package]] +name = "types-toml" +version = "0.10.8" +description = "Typing 
stubs for toml" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "types-urllib3" -version = "1.26.7" +version = "1.26.17" description = "Typing stubs for urllib3" category = "main" optional = false @@ -1279,32 +1500,32 @@ python-versions = "*" [[package]] name = "typing-extensions" -version = "3.10.0.2" -description = "Backported and Experimental Type Hints for Python 3.5+" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false -python-versions = "*" +python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.8" +version = "1.26.11" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" [package.extras] -brotli = ["brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "websocket-client" -version = "1.2.3" +version = "1.3.3" description = "WebSocket client for Python with low level API options" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.extras] docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] @@ -1313,18 +1534,18 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.0.2" +version = "2.1.2" description = "The comprehensive WSGI web application library." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.extras] watchdog = ["watchdog"] [[package]] name = "wrapt" -version = "1.13.3" +version = "1.14.1" description = "Module for decorators, wrappers and monkey patching." category = "main" optional = false @@ -1332,41 +1553,41 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [[package]] name = "xmltodict" -version = "0.12.0" +version = "0.13.0" description = "Makes working with XML feel like you are working with JSON" category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "yapf" -version = "0.31.0" -description = "A formatter for Python code." 
-category = "dev" -optional = false -python-versions = "*" +python-versions = ">=3.4" [[package]] name = "zipp" -version = "3.7.0" +version = "3.8.1" description = "Backport of pathlib-compatible object wrapper for zip files" category = "main" optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] +docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] +testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [metadata] lock-version = "1.1" -python-versions = "^3.7" -content-hash = "58762accad4122026c650fa43421a900546e89f9908e2268410e7b11cc8c6c4e" +python-versions = "^3.9" +content-hash = "c95c184fccaf40815405ad616ec1c55869c7f87b72777cc3a9cbaff41de98977" [metadata.files] aiopg = [ - {file = "aiopg-1.3.3-py3-none-any.whl", hash = "sha256:2842dd8741460eeef940032dcb577bfba4d4115205dd82a73ce13b3271f5bf0a"}, - {file = "aiopg-1.3.3.tar.gz", hash = "sha256:547c6ba4ea0d73c2a11a2f44387d7133cc01d3c6f3b8ed976c0ac1eff4f595d7"}, + {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, + {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, +] +allure-pytest = [ + {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, + {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, +] +allure-python-commons = [ + {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, + {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, @@ -1388,213 +1609,255 @@ asyncpg = [ {file = "asyncpg-0.24.0.tar.gz", hash = "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6"}, ] atomicwrites = [ - {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, - {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, ] attrs = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, ] aws-sam-translator = [ - {file = "aws-sam-translator-1.42.0.tar.gz", hash = "sha256:8a7976c0ee2fca004a590e17d3551a49c8d8ba14ed0cb3674ea270d41d0dcd5b"}, - {file = "aws_sam_translator-1.42.0-py2-none-any.whl", hash = "sha256:4f5d3d5d0567fe728e75c5c8dff599f7c88313b3b8e85b9b17a2c00cb046b2e4"}, - {file = "aws_sam_translator-1.42.0-py3-none-any.whl", hash = 
"sha256:31875e4f639511f506d0c757a2a50756bd846440724079e867aafb12c534ac23"}, + {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, + {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, + {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, ] aws-xray-sdk = [ - {file = "aws-xray-sdk-2.9.0.tar.gz", hash = "sha256:b0cd972db218d4d8f7b53ad806fc6184626b924c4997ae58fc9f2a8cd1281568"}, - {file = "aws_xray_sdk-2.9.0-py2.py3-none-any.whl", hash = "sha256:98216b3ac8281b51b59a8703f8ec561c460807d9d0679838f5c0179d381d7e58"}, + {file = "aws-xray-sdk-2.10.0.tar.gz", hash = "sha256:9b14924fd0628cf92936055864655354003f0b1acc3e1c3ffde6403d0799dd7a"}, + {file = "aws_xray_sdk-2.10.0-py2.py3-none-any.whl", hash = "sha256:7551e81a796e1a5471ebe84844c40e8edf7c218db33506d046fec61f7495eda4"}, ] backoff = [ {file = "backoff-1.11.1-py2.py3-none-any.whl", hash = "sha256:61928f8fa48d52e4faa81875eecf308eccfb1016b018bb6bd21e05b5d90a96c5"}, {file = "backoff-1.11.1.tar.gz", hash = "sha256:ccb962a2378418c667b3c979b504fdeb7d9e0d29c0579e3b13b86467177728cb"}, ] +black = [ + {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"}, + {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"}, + {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"}, + {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"}, + {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"}, + {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"}, + {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"}, + {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"}, + {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"}, + {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"}, + {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"}, + {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"}, + {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"}, + {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"}, + {file = 
"black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"}, + {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"}, + {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"}, + {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"}, + {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"}, + {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"}, + {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"}, +] boto3 = [ - {file = "boto3-1.20.40-py3-none-any.whl", hash = "sha256:cfe85589e4a0a997c7b9ae7432400b03fa6fa5fea29fdc48db3099a903b76998"}, - {file = "boto3-1.20.40.tar.gz", hash = "sha256:66aef9a6d8cad393f69166112ba49e14e2c6766f9278c96134101314a9af2992"}, + {file = "boto3-1.24.38-py3-none-any.whl", hash = "sha256:bcf97fd7c494f4e2bbbe2511625500654179c0a6b3bea977d46f97af764e85a4"}, + {file = "boto3-1.24.38.tar.gz", hash = "sha256:f4c6b025f392c934338c7f01badfddbd0d3cf2397ff5df35c31409798dce33f5"}, ] boto3-stubs = [ - {file = "boto3-stubs-1.20.40.tar.gz", hash = "sha256:24f23e14de15d29a85e301b5beb144d2c778ed350e0c08a2136a978c8105e3c9"}, - {file = "boto3_stubs-1.20.40-py3-none-any.whl", hash = "sha256:2e940afd4a47688bb536155b10bdc65cc99390217bfcb392f4fc8c188646a65f"}, + {file = "boto3-stubs-1.24.58.tar.gz", hash = "sha256:95ab521a9a931cc21d48c97c5bd7de0e37370d9b6a298e3905ec621db9243897"}, + {file = "boto3_stubs-1.24.58-py3-none-any.whl", hash = "sha256:a16940df2a347f7890075af8c0b202b06057bc18ff4c640ef94e09ce4176adb9"}, ] botocore = [ - {file = "botocore-1.23.40-py3-none-any.whl", hash = "sha256:88a314fe27cd97a0c731094c5b34db01ebe930801700e5d1b68485ebde746c3c"}, - {file = "botocore-1.23.40.tar.gz", hash = "sha256:49baa1fca4483b24769f0743fbf72afe4db391f41f1fc12ea34e06036db642a4"}, + {file = "botocore-1.27.38-py3-none-any.whl", hash = "sha256:46a0264ff3335496bd9cb404f83ec0d8eb7bfdef8f74a830c13e6a6b9612adea"}, + {file = "botocore-1.27.38.tar.gz", hash = "sha256:56a7682564ea57ceecfef5648f77b77e0543b9c904212fc9ef4416517d24fa45"}, ] botocore-stubs = [ - {file = "botocore-stubs-1.23.40.tar.gz", hash = "sha256:48529a2b7e14c6e3dd4544c21d4cf342ad512e2a526f5262c565357683d78787"}, - {file = "botocore_stubs-1.23.40-py3-none-any.whl", hash = "sha256:b5762895175cbacfa989b7ff313ca20f30f82137fcfd8a389cfe4a920cb57e73"}, -] -cached-property = [ - {file = "cached-property-1.5.2.tar.gz", hash = "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130"}, - {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, + {file = "botocore-stubs-1.27.38.tar.gz", hash = "sha256:408e8b86b5d171b58f81c74ca9d3b5317a5a8e2d3bc2073aa841ac13b8939e56"}, + {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, ] certifi = [ - {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = 
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, - {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, + {file = "certifi-2022.6.15-py3-none-any.whl", hash = "sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412"}, + {file = "certifi-2022.6.15.tar.gz", hash = "sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d"}, ] cffi = [ - {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, - {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"}, - {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"}, - {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"}, - {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"}, - {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"}, - {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"}, - {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"}, - {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"}, - {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"}, - {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"}, - {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"}, - {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"}, - {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"}, - {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"}, - {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"}, - {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"}, - {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"}, - {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"}, - {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"}, - {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"}, - {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"}, - {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"}, - {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"}, - {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"}, - {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"}, - {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"}, - {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"}, - {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"}, - {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, + {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, + {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, + {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, + {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, + {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, + {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, + {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, + {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, + {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, + {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, + {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, + {file = 
"cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, + {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, + {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, + {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, + {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, + {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, + {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, + {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, + {file = 
"cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, ] cfn-lint = [ - {file = "cfn-lint-0.57.0.tar.gz", hash = "sha256:17c2e3ba693ae259c868e221d159dc4aa9c7e60a970cdc1d1309150c9250faf4"}, - {file = "cfn_lint-0.57.0-py3-none-any.whl", hash = "sha256:71b5e23b6a5101416c13275baa0f172c935f679fac6956ae768c467a117913c2"}, + {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, + {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] charset-normalizer = [ - {file = "charset-normalizer-2.0.10.tar.gz", hash = "sha256:876d180e9d7432c5d1dfd4c5d26b72f099d503e8fcc0feb7532c9289be60fcbd"}, - {file = "charset_normalizer-2.0.10-py3-none-any.whl", hash = "sha256:cb957888737fc0bbcd78e3df769addb41fd1ff8cf950dc9e7ad7793f1bf44455"}, + {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, + {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, ] click = [ - {file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"}, - {file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"}, + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, ] colorama = [ - {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, - {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, ] cryptography = [ - {file = "cryptography-36.0.1-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:73bc2d3f2444bcfeac67dd130ff2ea598ea5f20b40e36d19821b4df8c9c5037b"}, - {file = "cryptography-36.0.1-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:2d87cdcb378d3cfed944dac30596da1968f88fb96d7fc34fdae30a99054b2e31"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74d6c7e80609c0f4c2434b97b80c7f8fdfaa072ca4baab7e239a15d6d70ed73a"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:6c0c021f35b421ebf5976abf2daacc47e235f8b6082d3396a2fe3ccd537ab173"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d59a9d55027a8b88fd9fd2826c4392bd487d74bf628bb9d39beecc62a644c12"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a817b961b46894c5ca8a66b599c745b9a3d9f822725221f0e0fe49dc043a3a3"}, - {file = "cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:94ae132f0e40fe48f310bba63f477f14a43116f05ddb69d6fa31e93f05848ae2"}, - {file = "cryptography-36.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:7be0eec337359c155df191d6ae00a5e8bbb63933883f4f5dffc439dac5348c3f"}, - {file = 
"cryptography-36.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:e0344c14c9cb89e76eb6a060e67980c9e35b3f36691e15e1b7a9e58a0a6c6dc3"}, - {file = "cryptography-36.0.1-cp36-abi3-win32.whl", hash = "sha256:4caa4b893d8fad33cf1964d3e51842cd78ba87401ab1d2e44556826df849a8ca"}, - {file = "cryptography-36.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:391432971a66cfaf94b21c24ab465a4cc3e8bf4a939c1ca5c3e3a6e0abebdbcf"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:bb5829d027ff82aa872d76158919045a7c1e91fbf241aec32cb07956e9ebd3c9"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebc15b1c22e55c4d5566e3ca4db8689470a0ca2babef8e3a9ee057a8b82ce4b1"}, - {file = "cryptography-36.0.1-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:596f3cd67e1b950bc372c33f1a28a0692080625592ea6392987dba7f09f17a94"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:30ee1eb3ebe1644d1c3f183d115a8c04e4e603ed6ce8e394ed39eea4a98469ac"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ec63da4e7e4a5f924b90af42eddf20b698a70e58d86a72d943857c4c6045b3ee"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca238ceb7ba0bdf6ce88c1b74a87bffcee5afbfa1e41e173b1ceb095b39add46"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:ca28641954f767f9822c24e927ad894d45d5a1e501767599647259cbf030b903"}, - {file = "cryptography-36.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:39bdf8e70eee6b1c7b289ec6e5d84d49a6bfa11f8b8646b5b3dfe41219153316"}, - {file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"}, + {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320"}, + {file = "cryptography-38.0.3-cp36-abi3-macosx_10_10_x86_64.whl", hash = "sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c"}, + {file = "cryptography-38.0.3-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0"}, + {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748"}, + {file = "cryptography-38.0.3-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146"}, + {file = "cryptography-38.0.3-cp36-abi3-win32.whl", hash = 
"sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0"}, + {file = "cryptography-38.0.3-cp36-abi3-win_amd64.whl", hash = "sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55"}, + {file = "cryptography-38.0.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249"}, + {file = "cryptography-38.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548"}, + {file = "cryptography-38.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a"}, + {file = "cryptography-38.0.3.tar.gz", hash = "sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd"}, ] docker = [ - {file = "docker-5.0.3-py2.py3-none-any.whl", hash = "sha256:7a79bb439e3df59d0a72621775d600bc8bc8b422d285824cb37103eab91d1ce0"}, - {file = "docker-5.0.3.tar.gz", hash = "sha256:d916a26b62970e7c2f554110ed6af04c7ccff8e9f81ad17d0d40c75637e227fb"}, + {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, + {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, ] ecdsa = [ - {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, - {file = "ecdsa-0.17.0.tar.gz", hash = "sha256:b9f500bb439e4153d0330610f5d26baaf18d17b8ced1bc54410d189385ea68aa"}, + {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, + {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, ] execnet = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = 
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, ] flake8 = [ - {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, - {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, ] flask = [ - {file = "Flask-2.0.2-py3-none-any.whl", hash = "sha256:cb90f62f1d8e4dc4621f52106613488b5ba826b2e1e10a33eac92f723093ab6a"}, - {file = "Flask-2.0.2.tar.gz", hash = "sha256:7b2fb8e934ddd50731893bdcdb00fc8c0315916f9fcd50d22c7cc1a95ab634e2"}, + {file = "Flask-2.1.3-py3-none-any.whl", hash = "sha256:9013281a7402ad527f8fd56375164f3aa021ecfaff89bfe3825346c24f87e04c"}, + {file = "Flask-2.1.3.tar.gz", hash = "sha256:15972e5017df0575c3d6c090ba168b6db90259e620ac8d7ea813a396bad5b6cb"}, ] flask-cors = [ {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, ] -future = [ - {file = "future-0.18.2.tar.gz", hash = "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"}, -] graphql-core = [ - {file = "graphql-core-3.2.0.tar.gz", hash = "sha256:86e2a0be008bfde19ef78388de8a725a1d942a9190ca431c24a60837973803ce"}, - {file = "graphql_core-3.2.0-py3-none-any.whl", hash = "sha256:0dda7e63676f119bb3d814621190fedad72fda07a8e9ab780bedd9f1957c6dc6"}, + {file = "graphql-core-3.2.1.tar.gz", hash = "sha256:9d1bf141427b7d54be944587c8349df791ce60ade2e3cccaf9c56368c133c201"}, + {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, ] idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, ] importlib-metadata = [ - {file = "importlib_metadata-4.10.1-py3-none-any.whl", hash = "sha256:899e2a40a8c4a1aec681feef45733de8a6c58f3f6a0dbed2eb6574b4387a77b6"}, - {file = "importlib_metadata-4.10.1.tar.gz", hash = "sha256:951f0d8a5b7260e9db5e41d429285b5f451e928479f19d80818878527d36e95e"}, + {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, + {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, ] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] +isort = [ + {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"}, + {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"}, +] itsdangerous = [ - {file = "itsdangerous-2.0.1-py3-none-any.whl", hash = 
"sha256:5174094b9637652bdb841a3029700391451bd092ba3db90600dea710ba28e97c"}, - {file = "itsdangerous-2.0.1.tar.gz", hash = "sha256:9e724d68fc22902a1435351f84c3fb8623f303fffcc566a4cb952df8c572cff0"}, + {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, + {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, ] jinja2 = [ - {file = "Jinja2-3.0.3-py3-none-any.whl", hash = "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8"}, - {file = "Jinja2-3.0.3.tar.gz", hash = "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7"}, + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, ] jmespath = [ - {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, - {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] jschema-to-python = [ {file = "jschema_to_python-1.2.3-py3-none-any.whl", hash = "sha256:8a703ca7604d42d74b2815eecf99a33359a8dccbb80806cce386d5e2dd992b05"}, {file = "jschema_to_python-1.2.3.tar.gz", hash = "sha256:76ff14fe5d304708ccad1284e4b11f96a658949a31ee7faed9e0995279549b91"}, ] jsondiff = [ - {file = "jsondiff-1.3.0.tar.gz", hash = "sha256:5122bf4708a031b02db029366184a87c5d0ddd5a327a5884ee6cf0193e599d71"}, + {file = "jsondiff-2.0.0-py3-none-any.whl", hash = "sha256:689841d66273fc88fc79f7d33f4c074774f4f214b6466e3aff0e5adaf889d1e0"}, + {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, ] jsonpatch = [ {file = "jsonpatch-1.32-py2.py3-none-any.whl", hash = "sha256:26ac385719ac9f54df8a2f0827bb8253aa3ea8ab7b3368457bcdb8c14595a397"}, {file = "jsonpatch-1.32.tar.gz", hash = "sha256:b6ddfe6c3db30d81a96aaeceb6baf916094ffa23d7dd5fa2c13e13f8b6e600c2"}, ] jsonpickle = [ - {file = "jsonpickle-2.1.0-py2.py3-none-any.whl", hash = "sha256:1dee77ddc5d652dfdabc33d33cff9d7e131d428007007da4fd6f7071ae774b0f"}, - {file = "jsonpickle-2.1.0.tar.gz", hash = "sha256:84684cfc5338a534173c8dd69809e40f2865d0be1f8a2b7af8465e5b968dcfa9"}, + {file = "jsonpickle-2.2.0-py2.py3-none-any.whl", hash = "sha256:de7f2613818aa4f234138ca11243d6359ff83ae528b2185efdd474f62bcf9ae1"}, + {file = "jsonpickle-2.2.0.tar.gz", hash = "sha256:7b272918b0554182e53dc340ddd62d9b7f902fec7e7b05620c04f3ccef479a0e"}, ] jsonpointer = [ - {file = "jsonpointer-2.2-py2.py3-none-any.whl", hash = "sha256:26d9a47a72d4dc3e3ae72c4c6cd432afd73c680164cd2540772eab53cb3823b6"}, - {file = "jsonpointer-2.2.tar.gz", hash = "sha256:f09f8deecaaa5aea65b5eb4f67ca4e54e1a61f7a11c75085e360fe6feb6a48bf"}, + {file = "jsonpointer-2.3-py2.py3-none-any.whl", hash = "sha256:51801e558539b4e9cd268638c078c6c5746c9ac96bc38152d443400e4f3793e9"}, + {file = "jsonpointer-2.3.tar.gz", hash = "sha256:97cba51526c829282218feb99dab1b1e6bdf8efd1c43dc9d57be093c0d69c99a"}, ] jsonschema = [ {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = 
"sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, @@ -1604,132 +1867,144 @@ junit-xml = [ {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] markupsafe = [ - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win32.whl", hash = "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_i686.whl", 
hash = "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_x86_64.whl", hash = 
"sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win32.whl", hash = "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win32.whl", hash = "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, - {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, + {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] mccabe = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = 
"mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] moto = [ - {file = "moto-3.0.4-py2.py3-none-any.whl", hash = "sha256:79646213d8438385182f4eea79e28725f94b3d0d3dc9a3eda81db47e0ebef6cc"}, - {file = "moto-3.0.4.tar.gz", hash = "sha256:168b8a3cb4dd8a6df8e51d582761cefa9657b9f45ac7e1eb24dae394ebc9e000"}, + {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, + {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ - {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, - {file = "mypy-0.910-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb"}, - {file = "mypy-0.910-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9"}, - {file = "mypy-0.910-cp35-cp35m-win_amd64.whl", hash = "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e"}, - {file = "mypy-0.910-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921"}, - {file = "mypy-0.910-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6"}, - {file = "mypy-0.910-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212"}, - {file = "mypy-0.910-cp36-cp36m-win_amd64.whl", hash = "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885"}, - {file = "mypy-0.910-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0"}, - {file = "mypy-0.910-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de"}, - {file = "mypy-0.910-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703"}, - {file = "mypy-0.910-cp37-cp37m-win_amd64.whl", hash = "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a"}, - {file = "mypy-0.910-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504"}, - {file = "mypy-0.910-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9"}, - {file = "mypy-0.910-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072"}, - {file = "mypy-0.910-cp38-cp38-win_amd64.whl", hash = "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811"}, - {file = "mypy-0.910-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e"}, - {file = "mypy-0.910-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b"}, - {file = "mypy-0.910-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2"}, - {file = "mypy-0.910-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97"}, - {file = "mypy-0.910-cp39-cp39-win_amd64.whl", hash = "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8"}, - {file = 
"mypy-0.910-py3-none-any.whl", hash = "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"}, - {file = "mypy-0.910.tar.gz", hash = "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150"}, + {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, + {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"}, + {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"}, + {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"}, + {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"}, + {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"}, + {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"}, + {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"}, + {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"}, + {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"}, + {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"}, + {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"}, + {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"}, + {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"}, + {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"}, + {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"}, + {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"}, + {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"}, + {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"}, + {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"}, + {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"}, + {file = "mypy-0.971-py3-none-any.whl", hash = 
"sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"}, + {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"}, +] +mypy-boto3-s3 = [ + {file = "mypy-boto3-s3-1.24.36.post1.tar.gz", hash = "sha256:3bd7e06f9ade5059eae2181d7a9f1a41e7fa807ad3e94c01c9901838e87e0abe"}, + {file = "mypy_boto3_s3-1.24.36.post1-py3-none-any.whl", hash = "sha256:30ae59b33c55f8b7b693170f9519ea5b91a2fbf31a73de79cdef57a27d784e5a"}, ] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, ] networkx = [ - {file = "networkx-2.6.3-py3-none-any.whl", hash = "sha256:80b6b89c77d1dfb64a4c7854981b60aeea6360ac02c6d4e4913319e0a313abef"}, - {file = "networkx-2.6.3.tar.gz", hash = "sha256:c0946ed31d71f1b732b5aaa6da5a0388a345019af232ce2f49c766e2d6795c51"}, + {file = "networkx-2.8.5-py3-none-any.whl", hash = "sha256:a762f4b385692d9c3a6f2912d058d76d29a827deaedf9e63ed14d397b8030687"}, + {file = "networkx-2.8.5.tar.gz", hash = "sha256:15a7b81a360791c458c55a417418ea136c13378cfdc06a2dcdc12bd2f9cf09c1"}, +] +openapi-schema-validator = [ + {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, + {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, +] +openapi-spec-validator = [ + {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, + {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, ] packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] +pathspec = [ + {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, + {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, +] pbr = [ - {file = "pbr-5.8.0-py2.py3-none-any.whl", hash = "sha256:176e8560eaf61e127817ef93d8a844803abb27a4d4637f0ff3bb783129be2e0a"}, - {file = "pbr-5.8.0.tar.gz", hash = "sha256:672d8ebee84921862110f23fcec2acea191ef58543d34dfe9ef3d9f13c31cddf"}, + {file = "pbr-5.9.0-py2.py3-none-any.whl", hash = "sha256:e547125940bcc052856ded43be8e101f63828c2d94239ffbe2b327ba3d5ccf0a"}, + {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, +] +platformdirs = [ + {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, + {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, ] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, ] +prometheus-client = [ + {file = "prometheus_client-0.14.1-py3-none-any.whl", hash = 
"sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"}, + {file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"}, +] +psutil = [ + {file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"}, + {file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"}, + {file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"}, + {file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"}, + {file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"}, + {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, + {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, + {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, + {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, +] psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1763,6 +2038,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1774,6 +2050,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1790,39 +2067,32 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = 
"sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ - {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, - {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, + {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, + {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, ] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] pyflakes = [ - {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, - {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, + {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, + {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] pyjwt = [ - {file = "PyJWT-2.3.0-py3-none-any.whl", hash = "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"}, - {file = "PyJWT-2.3.0.tar.gz", hash = "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41"}, + {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, + {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, ] pyparsing = [ - {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, - {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"}, + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pypiwin32 = [ + {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, + {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, ] pyrsistent = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, @@ -1851,13 +2121,25 @@ pytest = [ {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, ] -pytest-forked = [ - {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"}, - {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = 
"sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"}, +pytest-asyncio = [ + {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, + {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, +] +pytest-lazy-fixture = [ + {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, + {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, +] +pytest-order = [ + {file = "pytest-order-1.0.1.tar.gz", hash = "sha256:5dd6b929fbd7eaa6d0ee07586f65c623babb0afe72b4843c5f15055d6b3b1b1f"}, + {file = "pytest_order-1.0.1-py3-none-any.whl", hash = "sha256:bbe6e63a8e23741ab3e810d458d1ea7317e797b70f9550512d77d6e9e8fd1bbb"}, +] +pytest-timeout = [ + {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"}, + {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"}, ] pytest-xdist = [ - {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"}, - {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"}, + {file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"}, + {file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"}, ] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, @@ -1868,22 +2150,20 @@ python-jose = [ {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, ] pytz = [ - {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"}, - {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, + {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, + {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] pywin32 = [ - {file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"}, - {file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"}, - {file = "pywin32-227-cp35-cp35m-win32.whl", hash = "sha256:f4c5be1a293bae0076d93c88f37ee8da68136744588bc5e2be2f299a34ceb7aa"}, - {file = "pywin32-227-cp35-cp35m-win_amd64.whl", hash = "sha256:a929a4af626e530383a579431b70e512e736e9588106715215bf685a3ea508d4"}, - {file = "pywin32-227-cp36-cp36m-win32.whl", hash = "sha256:300a2db938e98c3e7e2093e4491439e62287d0d493fe07cce110db070b54c0be"}, - {file = "pywin32-227-cp36-cp36m-win_amd64.whl", hash = "sha256:9b31e009564fb95db160f154e2aa195ed66bcc4c058ed72850d047141b36f3a2"}, - {file = "pywin32-227-cp37-cp37m-win32.whl", hash = "sha256:47a3c7551376a865dd8d095a98deba954a98f326c6fe3c72d8726ca6e6b15507"}, - {file = "pywin32-227-cp37-cp37m-win_amd64.whl", hash = 
"sha256:31f88a89139cb2adc40f8f0e65ee56a8c585f629974f9e07622ba80199057511"}, - {file = "pywin32-227-cp38-cp38-win32.whl", hash = "sha256:7f18199fbf29ca99dff10e1f09451582ae9e372a892ff03a28528a24d55875bc"}, - {file = "pywin32-227-cp38-cp38-win_amd64.whl", hash = "sha256:7c1ae32c489dc012930787f06244426f8356e129184a02c25aef163917ce158e"}, - {file = "pywin32-227-cp39-cp39-win32.whl", hash = "sha256:c054c52ba46e7eb6b7d7dfae4dbd987a1bb48ee86debe3f245a2884ece46e295"}, - {file = "pywin32-227-cp39-cp39-win_amd64.whl", hash = "sha256:f27cec5e7f588c3d1051651830ecc00294f90728d19c3bf6916e6dba93ea357c"}, + {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, + {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, + {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, + {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, + {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, + {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, + {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, + {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, + {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, + {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, ] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, @@ -1893,6 +2173,13 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -1921,25 +2208,29 @@ pyyaml = [ {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, ] requests = [ - {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, - {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, + {file = "requests-2.28.1-py3-none-any.whl", hash = "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349"}, + {file = "requests-2.28.1.tar.gz", hash = "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983"}, ] responses = [ - {file = "responses-0.17.0-py2.py3-none-any.whl", hash = "sha256:e4fc472fb7374fb8f84fcefa51c515ca4351f198852b4eb7fc88223780b472ea"}, - {file = "responses-0.17.0.tar.gz", hash = "sha256:ec675e080d06bf8d1fb5e5a68a1e5cd0df46b09c78230315f650af5e4036bec7"}, + {file = "responses-0.21.0-py3-none-any.whl", hash = "sha256:2dcc863ba63963c0c3d9ee3fa9507cbe36b7d7b0fccb4f0bdfd9e96c539b1487"}, + {file = "responses-0.21.0.tar.gz", hash = "sha256:b82502eb5f09a0289d8e209e7bad71ef3978334f56d09b444253d5ad67bf5253"}, ] rsa = [ - {file = "rsa-4.8-py3-none-any.whl", hash = "sha256:95c5d300c4e879ee69708c428ba566c59478fd653cc3a22243eeb8ed846950bb"}, - {file = "rsa-4.8.tar.gz", hash = "sha256:5c6bd9dc7a543b7fe4304a631f8a8a3b674e2bbfc49c2ae96200cdbe55df6b17"}, + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, ] s3transfer = [ - {file = "s3transfer-0.5.0-py3-none-any.whl", hash = "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"}, - {file = "s3transfer-0.5.0.tar.gz", hash = "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c"}, + {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, + {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, ] sarif-om = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -1952,129 +2243,121 @@ toml = [ {file = 
"toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -typed-ast = [ - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, - {file = "typed_ast-1.4.3-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win32.whl", hash = "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428"}, - {file = "typed_ast-1.4.3-cp35-cp35m-win_amd64.whl", hash = "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3"}, - {file = "typed_ast-1.4.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace"}, - {file = "typed_ast-1.4.3-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win32.whl", hash = "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363"}, - {file = "typed_ast-1.4.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7"}, - {file = "typed_ast-1.4.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04"}, - {file = "typed_ast-1.4.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win32.whl", hash = "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c"}, - {file = "typed_ast-1.4.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805"}, - {file = "typed_ast-1.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41"}, - {file = "typed_ast-1.4.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39"}, - {file = "typed_ast-1.4.3-cp38-cp38-win32.whl", hash = "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927"}, - {file = "typed_ast-1.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40"}, - {file = 
"typed_ast-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0"}, - {file = "typed_ast-1.4.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3"}, - {file = "typed_ast-1.4.3-cp39-cp39-win32.whl", hash = "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808"}, - {file = "typed_ast-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c"}, - {file = "typed_ast-1.4.3.tar.gz", hash = "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"}, +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] +types-psutil = [ + {file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"}, + {file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"}, ] types-psycopg2 = [ - {file = "types-psycopg2-2.9.6.tar.gz", hash = "sha256:753b50b38da0e61bc8f89d149f2c4420c7e18535a87963d17b72343eb98f7c32"}, - {file = "types_psycopg2-2.9.6-py3-none-any.whl", hash = "sha256:2cfd855e1562ebb5da595ee9401da93a308d69121ccd359cb8341f94ba4b6d1c"}, + {file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"}, + {file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"}, ] types-requests = [ - {file = "types-requests-2.27.7.tar.gz", hash = "sha256:f38bd488528cdcbce5b01dc953972f3cead0d060cfd9ee35b363066c25bab13c"}, - {file = "types_requests-2.27.7-py3-none-any.whl", hash = "sha256:2e0e100dd489f83870d4f61949d3a7eae4821e7bfbf46c57e463c38f92d473d4"}, + {file = "types-requests-2.28.5.tar.gz", hash = "sha256:ac618bfefcb3742eaf97c961e13e9e5a226e545eda4a3dbe293b898d40933ad1"}, + {file = "types_requests-2.28.5-py3-none-any.whl", hash = "sha256:98ab647ae88b5e2c41d6d20cfcb5117da1bea561110000b6fdeeea07b3e89877"}, +] +types-s3transfer = [ + {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, + {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, +] +types-toml = [ + {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, + {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, ] types-urllib3 = [ - {file = "types-urllib3-1.26.7.tar.gz", hash = "sha256:cfd1fbbe4ba9a605ed148294008aac8a7b8b7472651d1cc357d507ae5962e3d2"}, - {file = "types_urllib3-1.26.7-py3-none-any.whl", hash = "sha256:3adcf2cb5981809091dbff456e6999fe55f201652d8c360f99997de5ac2f556e"}, + {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, + {file = 
"types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, ] typing-extensions = [ - {file = "typing_extensions-3.10.0.2-py2-none-any.whl", hash = "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7"}, - {file = "typing_extensions-3.10.0.2-py3-none-any.whl", hash = "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"}, - {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"}, + {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, + {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, ] urllib3 = [ - {file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"}, - {file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"}, + {file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"}, + {file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"}, ] websocket-client = [ - {file = "websocket-client-1.2.3.tar.gz", hash = "sha256:1315816c0acc508997eb3ae03b9d3ff619c9d12d544c9a9b553704b1cc4f6af5"}, - {file = "websocket_client-1.2.3-py3-none-any.whl", hash = "sha256:2eed4cc58e4d65613ed6114af2f380f7910ff416fc8c46947f6e76b6815f56c0"}, + {file = "websocket-client-1.3.3.tar.gz", hash = "sha256:d58c5f284d6a9bf8379dab423259fe8f85b70d5fa5d2916d5791a84594b122b1"}, + {file = "websocket_client-1.3.3-py3-none-any.whl", hash = "sha256:5d55652dc1d0b3c734f044337d929aaf83f4f9138816ec680c1aefefb4dc4877"}, ] werkzeug = [ - {file = "Werkzeug-2.0.2-py3-none-any.whl", hash = "sha256:63d3dc1cf60e7b7e35e97fa9861f7397283b75d765afcaefd993d6046899de8f"}, - {file = "Werkzeug-2.0.2.tar.gz", hash = "sha256:aa2bb6fc8dee8d6c504c0ac1e7f5f7dc5810a9903e793b6f715a9f015bdadb9a"}, + {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, + {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, ] wrapt = [ - {file = "wrapt-1.13.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:e05e60ff3b2b0342153be4d1b597bbcfd8330890056b9619f4ad6b8d5c96a81a"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:85148f4225287b6a0665eef08a178c15097366d46b210574a658c1ff5b377489"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:2dded5496e8f1592ec27079b28b6ad2a1ef0b9296d270f77b8e4a3a796cf6909"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:e94b7d9deaa4cc7bac9198a58a7240aaf87fe56c6277ee25fa5b3aa1edebd229"}, - {file = "wrapt-1.13.3-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:498e6217523111d07cd67e87a791f5e9ee769f9241fcf8a379696e25806965af"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:ec7e20258ecc5174029a0f391e1b948bf2906cd64c198a9b8b281b811cbc04de"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:87883690cae293541e08ba2da22cacaae0a092e0ed56bbba8d018cc486fbafbb"}, - {file = "wrapt-1.13.3-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:f99c0489258086308aad4ae57da9e8ecf9e1f3f30fa35d5e170b4d4896554d80"}, - {file 
= "wrapt-1.13.3-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:6a03d9917aee887690aa3f1747ce634e610f6db6f6b332b35c2dd89412912bca"}, - {file = "wrapt-1.13.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:936503cb0a6ed28dbfa87e8fcd0a56458822144e9d11a49ccee6d9a8adb2ac44"}, - {file = "wrapt-1.13.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f9c51d9af9abb899bd34ace878fbec8bf357b3194a10c4e8e0a25512826ef056"}, - {file = "wrapt-1.13.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:220a869982ea9023e163ba915077816ca439489de6d2c09089b219f4e11b6785"}, - {file = "wrapt-1.13.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0877fe981fd76b183711d767500e6b3111378ed2043c145e21816ee589d91096"}, - {file = "wrapt-1.13.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:43e69ffe47e3609a6aec0fe723001c60c65305784d964f5007d5b4fb1bc6bf33"}, - {file = "wrapt-1.13.3-cp310-cp310-win32.whl", hash = "sha256:78dea98c81915bbf510eb6a3c9c24915e4660302937b9ae05a0947164248020f"}, - {file = "wrapt-1.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:ea3e746e29d4000cd98d572f3ee2a6050a4f784bb536f4ac1f035987fc1ed83e"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:8c73c1a2ec7c98d7eaded149f6d225a692caa1bd7b2401a14125446e9e90410d"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:086218a72ec7d986a3eddb7707c8c4526d677c7b35e355875a0fe2918b059179"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:e92d0d4fa68ea0c02d39f1e2f9cb5bc4b4a71e8c442207433d8db47ee79d7aa3"}, - {file = "wrapt-1.13.3-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:d4a5f6146cfa5c7ba0134249665acd322a70d1ea61732723c7d3e8cc0fa80755"}, - {file = "wrapt-1.13.3-cp35-cp35m-win32.whl", hash = "sha256:8aab36778fa9bba1a8f06a4919556f9f8c7b33102bd71b3ab307bb3fecb21851"}, - {file = "wrapt-1.13.3-cp35-cp35m-win_amd64.whl", hash = "sha256:944b180f61f5e36c0634d3202ba8509b986b5fbaf57db3e94df11abee244ba13"}, - {file = "wrapt-1.13.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2ebdde19cd3c8cdf8df3fc165bc7827334bc4e353465048b36f7deeae8ee0918"}, - {file = "wrapt-1.13.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:610f5f83dd1e0ad40254c306f4764fcdc846641f120c3cf424ff57a19d5f7ade"}, - {file = "wrapt-1.13.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5601f44a0f38fed36cc07db004f0eedeaadbdcec90e4e90509480e7e6060a5bc"}, - {file = "wrapt-1.13.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:e6906d6f48437dfd80464f7d7af1740eadc572b9f7a4301e7dd3d65db285cacf"}, - {file = "wrapt-1.13.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:766b32c762e07e26f50d8a3468e3b4228b3736c805018e4b0ec8cc01ecd88125"}, - {file = "wrapt-1.13.3-cp36-cp36m-win32.whl", hash = "sha256:5f223101f21cfd41deec8ce3889dc59f88a59b409db028c469c9b20cfeefbe36"}, - {file = "wrapt-1.13.3-cp36-cp36m-win_amd64.whl", hash = "sha256:f122ccd12fdc69628786d0c947bdd9cb2733be8f800d88b5a37c57f1f1d73c10"}, - {file = "wrapt-1.13.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:46f7f3af321a573fc0c3586612db4decb7eb37172af1bc6173d81f5b66c2e068"}, - {file = "wrapt-1.13.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:778fd096ee96890c10ce96187c76b3e99b2da44e08c9e24d5652f356873f6709"}, - {file = 
"wrapt-1.13.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0cb23d36ed03bf46b894cfec777eec754146d68429c30431c99ef28482b5c1df"}, - {file = "wrapt-1.13.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:96b81ae75591a795d8c90edc0bfaab44d3d41ffc1aae4d994c5aa21d9b8e19a2"}, - {file = "wrapt-1.13.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7dd215e4e8514004c8d810a73e342c536547038fb130205ec4bba9f5de35d45b"}, - {file = "wrapt-1.13.3-cp37-cp37m-win32.whl", hash = "sha256:47f0a183743e7f71f29e4e21574ad3fa95676136f45b91afcf83f6a050914829"}, - {file = "wrapt-1.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fd76c47f20984b43d93de9a82011bb6e5f8325df6c9ed4d8310029a55fa361ea"}, - {file = "wrapt-1.13.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b73d4b78807bd299b38e4598b8e7bd34ed55d480160d2e7fdaabd9931afa65f9"}, - {file = "wrapt-1.13.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:ec9465dd69d5657b5d2fa6133b3e1e989ae27d29471a672416fd729b429eb554"}, - {file = "wrapt-1.13.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dd91006848eb55af2159375134d724032a2d1d13bcc6f81cd8d3ed9f2b8e846c"}, - {file = "wrapt-1.13.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ae9de71eb60940e58207f8e71fe113c639da42adb02fb2bcbcaccc1ccecd092b"}, - {file = "wrapt-1.13.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:51799ca950cfee9396a87f4a1240622ac38973b6df5ef7a41e7f0b98797099ce"}, - {file = "wrapt-1.13.3-cp38-cp38-win32.whl", hash = "sha256:4b9c458732450ec42578b5642ac53e312092acf8c0bfce140ada5ca1ac556f79"}, - {file = "wrapt-1.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:7dde79d007cd6dfa65afe404766057c2409316135cb892be4b1c768e3f3a11cb"}, - {file = "wrapt-1.13.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:981da26722bebb9247a0601e2922cedf8bb7a600e89c852d063313102de6f2cb"}, - {file = "wrapt-1.13.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:705e2af1f7be4707e49ced9153f8d72131090e52be9278b5dbb1498c749a1e32"}, - {file = "wrapt-1.13.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:25b1b1d5df495d82be1c9d2fad408f7ce5ca8a38085e2da41bb63c914baadff7"}, - {file = "wrapt-1.13.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:77416e6b17926d953b5c666a3cb718d5945df63ecf922af0ee576206d7033b5e"}, - {file = "wrapt-1.13.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:865c0b50003616f05858b22174c40ffc27a38e67359fa1495605f96125f76640"}, - {file = "wrapt-1.13.3-cp39-cp39-win32.whl", hash = "sha256:0a017a667d1f7411816e4bf214646d0ad5b1da2c1ea13dec6c162736ff25a374"}, - {file = "wrapt-1.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:81bd7c90d28a4b2e1df135bfbd7c23aee3050078ca6441bead44c42483f9ebfb"}, - {file = "wrapt-1.13.3.tar.gz", hash = "sha256:1fea9cd438686e6682271d36f3481a9f3636195578bab9ca3382e2f5f01fc185"}, + {file = "wrapt-1.14.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_i686.whl", hash = 
"sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59"}, + {file = "wrapt-1.14.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462"}, + {file = "wrapt-1.14.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1"}, + {file = "wrapt-1.14.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320"}, + {file = "wrapt-1.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069"}, + {file = "wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656"}, + {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, + {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, + {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, + {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d"}, + {file = "wrapt-1.14.1-cp35-cp35m-win32.whl", hash = "sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7"}, + {file = "wrapt-1.14.1-cp35-cp35m-win_amd64.whl", hash = "sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00"}, + {file = "wrapt-1.14.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4"}, + {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1"}, + {file = 
"wrapt-1.14.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1"}, + {file = "wrapt-1.14.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1"}, + {file = "wrapt-1.14.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569"}, + {file = "wrapt-1.14.1-cp36-cp36m-win32.whl", hash = "sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed"}, + {file = "wrapt-1.14.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471"}, + {file = "wrapt-1.14.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248"}, + {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68"}, + {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d"}, + {file = "wrapt-1.14.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015"}, + {file = "wrapt-1.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a"}, + {file = "wrapt-1.14.1-cp37-cp37m-win32.whl", hash = "sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853"}, + {file = "wrapt-1.14.1-cp37-cp37m-win_amd64.whl", hash = "sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c"}, + {file = "wrapt-1.14.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456"}, + {file = "wrapt-1.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1"}, + {file = "wrapt-1.14.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af"}, + {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b"}, + {file = 
"wrapt-1.14.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0"}, + {file = "wrapt-1.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57"}, + {file = "wrapt-1.14.1-cp38-cp38-win32.whl", hash = "sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5"}, + {file = "wrapt-1.14.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d"}, + {file = "wrapt-1.14.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383"}, + {file = "wrapt-1.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735"}, + {file = "wrapt-1.14.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3"}, + {file = "wrapt-1.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe"}, + {file = "wrapt-1.14.1-cp39-cp39-win32.whl", hash = "sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5"}, + {file = "wrapt-1.14.1-cp39-cp39-win_amd64.whl", hash = "sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb"}, + {file = "wrapt-1.14.1.tar.gz", hash = "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d"}, ] xmltodict = [ - {file = "xmltodict-0.12.0-py2.py3-none-any.whl", hash = "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051"}, - {file = "xmltodict-0.12.0.tar.gz", hash = "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21"}, -] -yapf = [ - {file = "yapf-0.31.0-py2.py3-none-any.whl", hash = "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"}, - {file = "yapf-0.31.0.tar.gz", hash = "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d"}, + {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, + {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] zipp = [ - {file = "zipp-3.7.0-py3-none-any.whl", hash = "sha256:b47250dd24f92b7dd6a0a8fc5244da14608f3ca90a5efcd37a3b1642fac9a375"}, - {file = "zipp-3.7.0.tar.gz", hash = "sha256:9f50f446828eb9d45b267433fd3e9da8d801f614129124863f9c51ebceafb87d"}, + {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, + {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, ] diff --git a/postgres_ffi/Cargo.toml b/postgres_ffi/Cargo.toml deleted file mode 
100644 index 17f1ecd666..0000000000 --- a/postgres_ffi/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "postgres_ffi" -version = "0.1.0" -edition = "2021" - -[dependencies] -chrono = "0.4.19" -rand = "0.8.3" -regex = "1.4.5" -bytes = "1.0.1" -byteorder = "1.4.3" -anyhow = "1.0" -crc32c = "0.6.0" -hex = "0.4.3" -lazy_static = "1.4" -log = "0.4.14" -memoffset = "0.6.2" -thiserror = "1.0" -serde = { version = "1.0", features = ["derive"] } -workspace_hack = { path = "../workspace_hack" } -zenith_utils = { path = "../zenith_utils" } - -[build-dependencies] -bindgen = "0.59.1" diff --git a/postgres_ffi/build.rs b/postgres_ffi/build.rs deleted file mode 100644 index 3b4b37f9ee..0000000000 --- a/postgres_ffi/build.rs +++ /dev/null @@ -1,104 +0,0 @@ -extern crate bindgen; - -use std::env; -use std::path::PathBuf; - -use bindgen::callbacks::ParseCallbacks; - -#[derive(Debug)] -struct PostgresFfiCallbacks; - -impl ParseCallbacks for PostgresFfiCallbacks { - fn include_file(&self, filename: &str) { - // This does the equivalent of passing bindgen::CargoCallbacks - // to the builder .parse_callbacks() method. - let cargo_callbacks = bindgen::CargoCallbacks; - cargo_callbacks.include_file(filename) - } - - // Add any custom #[derive] attributes to the data structures that bindgen - // creates. - fn add_derives(&self, name: &str) -> Vec<String> { - // This is the list of data structures that we want to serialize/deserialize. - let serde_list = [ - "XLogRecord", - "XLogPageHeaderData", - "XLogLongPageHeaderData", - "CheckPoint", - "FullTransactionId", - "ControlFileData", - ]; - - if serde_list.contains(&name) { - vec![ - "Default".into(), // Default allows us to easily fill the padding fields with 0. - "Serialize".into(), - "Deserialize".into(), - ] - } else { - vec![] - } - } -} - -fn main() { - // Tell cargo to invalidate the built crate whenever the wrapper changes - println!("cargo:rerun-if-changed=pg_control_ffi.h"); - - // The bindgen::Builder is the main entry point - // to bindgen, and lets you build up options for - // the resulting bindings. - let bindings = bindgen::Builder::default() - // - // All the needed PostgreSQL headers are included from 'pg_control_ffi.h' - // - .header("pg_control_ffi.h") - // - // Tell cargo to invalidate the built crate whenever any of the - // included header files changed. - // - .parse_callbacks(Box::new(PostgresFfiCallbacks)) - // - // These are the types and constants that we want to generate bindings for - // - .allowlist_type("BlockNumber") - .allowlist_type("OffsetNumber") - .allowlist_type("MultiXactId") - .allowlist_type("MultiXactOffset") - .allowlist_type("MultiXactStatus") - .allowlist_type("ControlFileData") - .allowlist_type("CheckPoint") - .allowlist_type("FullTransactionId") - .allowlist_type("XLogRecord") - .allowlist_type("XLogPageHeaderData") - .allowlist_type("XLogLongPageHeaderData") - .allowlist_var("XLOG_PAGE_MAGIC") - .allowlist_var("PG_CONTROL_FILE_SIZE") - .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC") - .allowlist_type("PageHeaderData") - .allowlist_type("DBState") - // Because structs are used for serialization, tell bindgen to emit - // explicit padding fields. - .explicit_padding(true) - // - // Path to the server include dir. It is in tmp_install/include/server, if you did - "configure --prefix=".
But if you used "configure --prefix=/", - // and used DESTDIR to move it into tmp_install, then it's in - // tmp_install/include/postgres/server - // 'pg_config --includedir-server' would perhaps be the more proper way to find it, - // but this will do for now. - // - .clang_arg("-I../tmp_install/include/server") - .clang_arg("-I../tmp_install/include/postgresql/server") - // - // Finish the builder and generate the bindings. - // - .generate() - .expect("Unable to generate bindings"); - - // Write the bindings to the $OUT_DIR/bindings.rs file. - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - bindings - .write_to_file(out_path.join("bindings.rs")) - .expect("Couldn't write bindings!"); -} diff --git a/postgres_ffi/src/lib.rs b/postgres_ffi/src/lib.rs deleted file mode 100644 index 923fbe4d5a..0000000000 --- a/postgres_ffi/src/lib.rs +++ /dev/null @@ -1,39 +0,0 @@ -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -// bindgen creates some unsafe code with no doc comments. -#![allow(clippy::missing_safety_doc)] -// suppress warnings on rust 1.53 due to bindgen unit tests. -// https://github.com/rust-lang/rust-bindgen/issues/1651 -#![allow(deref_nullptr)] - -use serde::{Deserialize, Serialize}; - -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); - -pub mod controlfile_utils; -pub mod nonrelfile_utils; -pub mod pg_constants; -pub mod relfile_utils; -pub mod waldecoder; -pub mod xlog_utils; - -// See TransactionIdIsNormal in transam.h -pub const fn transaction_id_is_normal(id: TransactionId) -> bool { - id > pg_constants::FIRST_NORMAL_TRANSACTION_ID -} - -// See TransactionIdPrecedes in transam.c -pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) -> bool { - /* - * If either ID is a permanent XID then we can just do unsigned - * comparison. If both are normal, do a modulo-2^32 comparison. - */ - - if !(transaction_id_is_normal(id1)) || !transaction_id_is_normal(id2) { - return id1 < id2; - } - - let diff = id1.wrapping_sub(id2) as i32; - diff < 0 -} diff --git a/postgres_ffi/src/waldecoder.rs b/postgres_ffi/src/waldecoder.rs deleted file mode 100644 index ac48b1b0f3..0000000000 --- a/postgres_ffi/src/waldecoder.rs +++ /dev/null @@ -1,219 +0,0 @@ -//! -//! Basic WAL stream decoding. -//! -//! This understands the WAL page and record format, enough to figure out where the WAL record -//! boundaries are, and to reassemble WAL records that cross page boundaries. -//! -//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs -//! to look deeper into the WAL records to also understand which blocks they modify, the code -//! for that is in pageserver/src/walrecord.rs -//! -use super::pg_constants; -use super::xlog_utils::*; -use super::XLogLongPageHeaderData; -use super::XLogPageHeaderData; -use super::XLogRecord; -use bytes::{Buf, BufMut, Bytes, BytesMut}; -use crc32c::*; -use log::*; -use std::cmp::min; -use thiserror::Error; -use zenith_utils::lsn::Lsn; - -pub struct WalStreamDecoder { - lsn: Lsn, - - startlsn: Lsn, // LSN where this record starts - contlen: u32, - padlen: u32, - - inputbuf: BytesMut, - - /// buffer used to reassemble records that cross page boundaries. 
- recordbuf: BytesMut, -} - -#[derive(Error, Debug, Clone)] -#[error("{msg} at {lsn}")] -pub struct WalDecodeError { - msg: String, - lsn: Lsn, -} - -// -// WalRecordStream is a Stream that returns a stream of WAL records -// FIXME: This isn't a proper rust stream -// -impl WalStreamDecoder { - pub fn new(lsn: Lsn) -> WalStreamDecoder { - WalStreamDecoder { - lsn, - - startlsn: Lsn(0), - contlen: 0, - padlen: 0, - - inputbuf: BytesMut::new(), - recordbuf: BytesMut::new(), - } - } - - // The latest LSN position fed to the decoder. - pub fn available(&self) -> Lsn { - self.lsn + self.inputbuf.remaining() as u64 - } - - pub fn feed_bytes(&mut self, buf: &[u8]) { - self.inputbuf.extend_from_slice(buf); - } - - /// Attempt to decode another WAL record from the input that has been fed to the - /// decoder so far. - /// - /// Returns one of the following: - /// Ok((Lsn, Bytes)): a tuple containing the LSN of the next record, and the record itself - /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function - /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. - /// - pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> { - let recordbuf; - - // Run state machine that validates page headers, and reassembles records - // that cross page boundaries. - loop { - // parse and verify page boundaries as we go - if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { - // parse long header - - if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { - return Ok(None); - } - - let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf); - - if hdr.std.xlp_pageaddr != self.lsn.0 { - return Err(WalDecodeError { - msg: "invalid xlog segment header".into(), - lsn: self.lsn, - }); - } - // TODO: verify the remaining fields in the header - - self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64; - continue; - } else if self.lsn.block_offset() == 0 { - if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD { - return Ok(None); - } - - let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf); - - if hdr.xlp_pageaddr != self.lsn.0 { - return Err(WalDecodeError { - msg: "invalid xlog page header".into(), - lsn: self.lsn, - }); - } - // TODO: verify the remaining fields in the header - - self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; - continue; - } else if self.padlen > 0 { - if self.inputbuf.remaining() < self.padlen as usize { - return Ok(None); - } - - // skip padding - self.inputbuf.advance(self.padlen as usize); - self.lsn += self.padlen as u64; - self.padlen = 0; - } else if self.contlen == 0 { - assert!(self.recordbuf.is_empty()); - - // need to have at least the xl_tot_len field - if self.inputbuf.remaining() < 4 { - return Ok(None); - } - - // peek xl_tot_len at the beginning of the record. - // FIXME: assumes little-endian - self.startlsn = self.lsn; - let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); - if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { - return Err(WalDecodeError { - msg: format!("invalid xl_tot_len {}", xl_tot_len), - lsn: self.lsn, - }); - } - - // Fast path for the common case that the whole record fits on the page. - let pageleft = self.lsn.remaining_in_block() as u32; - if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft { - // Take the record from the 'inputbuf', and validate it. - recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize); - self.lsn += xl_tot_len as u64; - break; - } else { - // Need to assemble the record from pieces.
Remember the size of the - // record, and loop back. On next iteration, we will reach the 'else' - // branch below, and copy the part of the record that was on this page - // to 'recordbuf'. Subsequent iterations will skip page headers, and - // append the continuations from the next pages to 'recordbuf'. - self.recordbuf.reserve(xl_tot_len as usize); - self.contlen = xl_tot_len; - continue; - } - } else { - // we're continuing a record, possibly from previous page. - let pageleft = self.lsn.remaining_in_block() as u32; - - // read the rest of the record, or as much as fits on this page. - let n = min(self.contlen, pageleft) as usize; - - if self.inputbuf.remaining() < n { - return Ok(None); - } - - self.recordbuf.put(self.inputbuf.split_to(n)); - self.lsn += n as u64; - self.contlen -= n as u32; - - if self.contlen == 0 { - // The record is now complete. - recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze(); - break; - } - continue; - } - } - - // We now have a record in the 'recordbuf' local variable. - let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]); - - let mut crc = 0; - crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]); - crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]); - if crc != xlogrec.xl_crc { - return Err(WalDecodeError { - msg: "WAL record crc mismatch".into(), - lsn: self.lsn, - }); - } - - // XLOG_SWITCH records are special. If we see one, we need to skip - // to the next WAL segment. - if xlogrec.is_xlog_switch_record() { - trace!("saw xlog switch record at {}", self.lsn); - self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32; - } else { - // Pad to an 8-byte boundary - self.padlen = self.lsn.calc_padding(8u32) as u32; - } - - // Always align resulting LSN on 0x8 boundary -- that is important for getPage() - // and WalReceiver integration. Since this code is used both for WalReceiver and - // initial WAL import let's force alignment right here. - let result = (self.lsn.align(), recordbuf); - Ok(Some(result)) - } -} diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs deleted file mode 100644 index caf1940a9c..0000000000 --- a/postgres_ffi/src/xlog_utils.rs +++ /dev/null @@ -1,564 +0,0 @@ -// -// This file contains common utilities for dealing with PostgreSQL WAL files and -// LSNs. -// -// Many of these functions have been copied from PostgreSQL, and rewritten in -// Rust. That's why they don't follow the usual Rust naming conventions, they -// have been named the same as the corresponding PostgreSQL functions instead. 
-// - -use crate::pg_constants; -use crate::CheckPoint; -use crate::FullTransactionId; -use crate::XLogLongPageHeaderData; -use crate::XLogPageHeaderData; -use crate::XLogRecord; -use crate::XLOG_PAGE_MAGIC; - -use anyhow::{bail, Result}; -use byteorder::{ByteOrder, LittleEndian}; -use bytes::BytesMut; -use bytes::{Buf, Bytes}; -use crc32c::*; -use log::*; -use std::cmp::max; -use std::cmp::min; -use std::fs::{self, File}; -use std::io::prelude::*; -use std::io::SeekFrom; -use std::path::{Path, PathBuf}; -use std::time::SystemTime; -use zenith_utils::lsn::Lsn; - -pub const XLOG_FNAME_LEN: usize = 24; -pub const XLOG_BLCKSZ: usize = 8192; -pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; -pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; -pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; -pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; - -pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>(); -pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>(); -pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>(); -#[allow(clippy::identity_op)] -pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; - -// PG timeline is always 1, changing it doesn't have useful meaning in Zenith. -pub const PG_TLI: u32 = 1; - -pub type XLogRecPtr = u64; -pub type TimeLineID = u32; -pub type TimestampTz = i64; -pub type XLogSegNo = u64; - -/// Interval of checkpointing metadata file. We should store metadata file to enforce -/// predicate that checkpoint.nextXid is larger than any XID in WAL. -/// But flushing checkpoint file for each transaction seems to be too expensive, -/// so XID_CHECKPOINT_INTERVAL is used to forward align nextXid and so perform -/// metadata checkpoint only once per XID_CHECKPOINT_INTERVAL transactions. -/// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE -/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
-const XID_CHECKPOINT_INTERVAL: u32 = 1024; - -#[allow(non_snake_case)] -pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { - (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo -} - -#[allow(non_snake_case)] -pub fn XLogSegNoOffsetToRecPtr( - segno: XLogSegNo, - offset: u32, - wal_segsz_bytes: usize, -) -> XLogRecPtr { - segno * (wal_segsz_bytes as u64) + (offset as u64) -} - -#[allow(non_snake_case)] -pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { - return format!( - "{:>08X}{:>08X}{:>08X}", - tli, - logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), - logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) - ); -} - -#[allow(non_snake_case)] -pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { - let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); - let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; - let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; - (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) -} - -#[allow(non_snake_case)] -pub fn IsXLogFileName(fname: &str) -> bool { - return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); -} - -#[allow(non_snake_case)] -pub fn IsPartialXLogFileName(fname: &str) -> bool { - fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) -} - -/// If LSN points to the beginning of the page, then shift it to first record, -/// otherwise align on 8-bytes boundary (required for WAL records) -pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { - if lsn.0 % XLOG_BLCKSZ as u64 == 0 { - let hdr_size = if lsn.0 % seg_sz as u64 == 0 { - XLOG_SIZE_OF_XLOG_LONG_PHD - } else { - XLOG_SIZE_OF_XLOG_SHORT_PHD - }; - lsn + hdr_size as u64 - } else { - lsn.align() - } -} - -pub fn get_current_timestamp() -> TimestampTz { - const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ - const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ - const SECS_PER_DAY: u64 = 86400; - const USECS_PER_SEC: u64 = 1000000; - match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { - Ok(n) => { - ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) - * USECS_PER_SEC - + n.subsec_micros() as u64) as i64 - } - Err(_) => panic!("SystemTime before UNIX EPOCH!"), - } -} - -fn find_end_of_wal_segment( - data_dir: &Path, - segno: XLogSegNo, - tli: TimeLineID, - wal_seg_size: usize, - start_offset: usize, // start reading at this point -) -> Result { - // step back to the beginning of the page to read it in... 
- let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ; - let mut contlen: usize = 0; - let mut wal_crc: u32 = 0; - let mut crc: u32 = 0; - let mut rec_offs: usize = 0; - let mut buf = [0u8; XLOG_BLCKSZ]; - let file_name = XLogFileName(tli, segno, wal_seg_size); - let mut last_valid_rec_pos: usize = 0; - let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); - file.seek(SeekFrom::Start(offs as u64))?; - let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; - - while offs < wal_seg_size { - // we are at the beginning of the page; read it in - if offs % XLOG_BLCKSZ == 0 { - let bytes_read = file.read(&mut buf)?; - if bytes_read != buf.len() { - bail!( - "failed to read {} bytes from {} at {}", - XLOG_BLCKSZ, - file_name, - offs - ); - } - - let xlp_magic = LittleEndian::read_u16(&buf[0..2]); - let xlp_info = LittleEndian::read_u16(&buf[2..4]); - let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); - // this is expected in current usage when valid WAL starts after page header - if xlp_magic != XLOG_PAGE_MAGIC as u16 { - trace!( - "invalid WAL file {}.partial magic {} at {:?}", - file_name, - xlp_magic, - Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), - ); - } - if offs == 0 { - offs = XLOG_SIZE_OF_XLOG_LONG_PHD; - if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { - offs += ((xlp_rem_len + 7) & !7) as usize; - } - } else { - offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; - } - // ... and step forward again if asked - offs = max(offs, start_offset); - - // beginning of the next record - } else if contlen == 0 { - let page_offs = offs % XLOG_BLCKSZ; - let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; - if xl_tot_len == 0 { - info!( - "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}", - Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)), - Lsn(XLogSegNoOffsetToRecPtr( - segno, - last_valid_rec_pos as u32, - wal_seg_size - )) - ); - break; // zeros, reached the end - } - last_valid_rec_pos = offs; - offs += 4; - rec_offs = 4; - contlen = xl_tot_len - 4; - rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); - } else { - // we're continuing a record, possibly from previous page. - let page_offs = offs % XLOG_BLCKSZ; - let pageleft = XLOG_BLCKSZ - page_offs; - - // read the rest of the record, or as much as fits on this page. 
- let n = min(contlen, pageleft); - // fill rec_hdr (header up to (but not including) xl_crc field) - if rec_offs < XLOG_RECORD_CRC_OFFS { - let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); - rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); - } - if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { - let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; - wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); - crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); - } else { - crc ^= 0xFFFFFFFFu32; - crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); - } - crc = !crc; - rec_offs += n; - offs += n; - contlen -= n; - - if contlen == 0 { - crc = !crc; - crc = crc32c_append(crc, &rec_hdr); - offs = (offs + 7) & !7; // pad on 8 bytes boundary */ - if crc == wal_crc { - // record is valid, advance the result to its end (with - // alignment to the next record taken into account) - last_valid_rec_pos = offs; - } else { - info!( - "CRC mismatch {} vs {} at {}", - crc, wal_crc, last_valid_rec_pos - ); - break; - } - } - } - } - Ok(last_valid_rec_pos as u32) -} - -/// -/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL. -/// If precise, returns end LSN (next insertion point, basically); -/// otherwise, start of the last segment. -/// Returns (0, 0) if there is no WAL. -/// -pub fn find_end_of_wal( - data_dir: &Path, - wal_seg_size: usize, - precise: bool, - start_lsn: Lsn, // start reading WAL at this point or later -) -> Result<(XLogRecPtr, TimeLineID)> { - let mut high_segno: XLogSegNo = 0; - let mut high_tli: TimeLineID = 0; - let mut high_ispartial = false; - - for entry in fs::read_dir(data_dir).unwrap().flatten() { - let ispartial: bool; - let entry_name = entry.file_name(); - let fname = entry_name.to_str().unwrap(); - /* - * Check if the filename looks like an xlog file, or a .partial file. - */ - if IsXLogFileName(fname) { - ispartial = false; - } else if IsPartialXLogFileName(fname) { - ispartial = true; - } else { - continue; - } - let (segno, tli) = XLogFromFileName(fname, wal_seg_size); - if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 { - continue; - } - if segno > high_segno - || (segno == high_segno && tli > high_tli) - || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial) - { - high_segno = segno; - high_tli = tli; - high_ispartial = ispartial; - } - } - if high_segno > 0 { - let mut high_offs = 0; - /* - * Move the starting pointer to the start of the next segment, if the - * highest one we saw was completed. 
- */ - if !high_ispartial { - high_segno += 1; - } else if precise { - /* otherwise locate last record in last partial segment */ - if start_lsn.segment_number(wal_seg_size) > high_segno { - bail!( - "provided start_lsn {:?} is beyond highest segno {:?} available", - start_lsn, - high_segno, - ); - } - let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno { - start_lsn.segment_offset(wal_seg_size) - } else { - 0 - }; - high_offs = find_end_of_wal_segment( - data_dir, - high_segno, - high_tli, - wal_seg_size, - start_offset, - )?; - } - let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size); - return Ok((high_ptr, high_tli)); - } - Ok((0, 0)) -} - -pub fn main() { - let mut data_dir = PathBuf::new(); - data_dir.push("."); - let wal_seg_size = 16 * 1024 * 1024; - let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap(); - println!( - "wal_end={:>08X}{:>08X}, tli={}", - (wal_end >> 32) as u32, - wal_end as u32, - tli - ); -} - -impl XLogRecord { - pub fn from_slice(buf: &[u8]) -> XLogRecord { - use zenith_utils::bin_ser::LeSer; - XLogRecord::des(buf).unwrap() - } - - pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord { - use zenith_utils::bin_ser::LeSer; - XLogRecord::des_from(&mut buf.reader()).unwrap() - } - - pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; - self.ser().unwrap().into() - } - - // Is this record an XLOG_SWITCH record? They need some special processing. - pub fn is_xlog_switch_record(&self) -> bool { - self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID - } -} - -impl XLogPageHeaderData { - pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData { - use zenith_utils::bin_ser::LeSer; - XLogPageHeaderData::des_from(&mut buf.reader()).unwrap() - } -} - -impl XLogLongPageHeaderData { - pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData { - use zenith_utils::bin_ser::LeSer; - XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap() - } - - pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; - self.ser().unwrap().into() - } -} - -pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>(); - -impl CheckPoint { - pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; - self.ser().unwrap().into() - } - - pub fn decode(buf: &[u8]) -> Result<CheckPoint> { - use zenith_utils::bin_ser::LeSer; - Ok(CheckPoint::des(buf)?) - } - - /// Update next XID based on provided new_xid and stored epoch. - /// Next XID should be greater than new_xid. This handles 32-bit - /// XID wraparound correctly. - /// - /// Returns 'true' if the XID was updated. - pub fn update_next_xid(&mut self, xid: u32) -> bool { - // nextXid should be greater than any XID in WAL, so increment the provided XID and check for wraparound. - let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); - // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
- // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE - new_xid = - new_xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); - let full_xid = self.nextXid.value; - let old_xid = full_xid as u32; - if new_xid.wrapping_sub(old_xid) as i32 > 0 { - let mut epoch = full_xid >> 32; - if new_xid < old_xid { - // wrap-around - epoch += 1; - } - let nextXid = (epoch << 32) | new_xid as u64; - - if nextXid != self.nextXid.value { - self.nextXid = FullTransactionId { value: nextXid }; - return true; - } - } - false - } -} - -// -// Generate new, empty WAL segment. -// We need this segment to start compute node. -// -pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes { - let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); - - let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE); - let hdr = XLogLongPageHeaderData { - std: { - XLogPageHeaderData { - xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: pg_constants::XLP_LONG_HEADER, - xlp_tli: PG_TLI, - xlp_pageaddr: pageaddr, - xlp_rem_len: 0, - ..Default::default() // Put 0 in padding fields. - } - }, - xlp_sysid: system_id, - xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, - xlp_xlog_blcksz: XLOG_BLCKSZ as u32, - }; - - let hdr_bytes = hdr.encode(); - seg_buf.extend_from_slice(&hdr_bytes); - - //zero out the rest of the file - seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0); - seg_buf.freeze() -} - -#[cfg(test)] -mod tests { - use super::*; - use regex::Regex; - use std::{env, process::Command, str::FromStr}; - - // Run find_end_of_wal against file in test_wal dir - // Ensure that it finds last record correctly - #[test] - pub fn test_find_end_of_wal() { - // 1. Run initdb to generate some WAL - let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".."); - let data_dir = top_path.join("test_output/test_find_end_of_wal"); - let initdb_path = top_path.join("tmp_install/bin/initdb"); - let lib_path = top_path.join("tmp_install/lib"); - if data_dir.exists() { - fs::remove_dir_all(&data_dir).unwrap(); - } - println!("Using initdb from '{}'", initdb_path.display()); - println!("Data directory '{}'", data_dir.display()); - let initdb_output = Command::new(initdb_path) - .args(&["-D", data_dir.to_str().unwrap()]) - .arg("--no-instructions") - .arg("--no-sync") - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - assert!(initdb_output.status.success()); - - // 2. Pick WAL generated by initdb - let wal_dir = data_dir.join("pg_wal"); - let wal_seg_size = 16 * 1024 * 1024; - - // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); - let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); - assert_eq!(wal_end, "0/2000000".parse::().unwrap()); - - // 4. 
Get the actual end of WAL by pg_waldump - let waldump_path = top_path.join("tmp_install/bin/pg_waldump"); - let waldump_output = Command::new(waldump_path) - .arg(wal_dir.join("000000010000000000000001")) - .env_clear() - .env("LD_LIBRARY_PATH", &lib_path) - .env("DYLD_LIBRARY_PATH", &lib_path) - .output() - .unwrap(); - let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap(); - println!("waldump_output = '{}'", &waldump_output); - let re = Regex::new(r"invalid record length at (.+):").unwrap(); - let caps = re.captures(waldump_output).unwrap(); - let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - - // 5. Rename file to partial to actually find last valid lsn - fs::rename( - wal_dir.join("000000010000000000000001"), - wal_dir.join("000000010000000000000001.partial"), - ) - .unwrap(); - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); - let wal_end = Lsn(wal_end); - println!("wal_end={}, tli={}", wal_end, tli); - assert_eq!(wal_end, waldump_wal_end); - } - - /// Check the math in update_next_xid - /// - /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL, - /// currently 1024. - #[test] - pub fn test_update_next_xid() { - let checkpoint_buf = [0u8; std::mem::size_of::()]; - let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); - - checkpoint.nextXid = FullTransactionId { value: 10 }; - assert_eq!(checkpoint.nextXid.value, 10); - - // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL - // boundary - checkpoint.update_next_xid(100); - assert_eq!(checkpoint.nextXid.value, 1024); - - // No change - checkpoint.update_next_xid(500); - assert_eq!(checkpoint.nextXid.value, 1024); - checkpoint.update_next_xid(1023); - assert_eq!(checkpoint.nextXid.value, 1024); - - // The function returns the *next* XID, given the highest XID seen so - // far. So when we pass 1024, the nextXid gets bumped up to the next - // XID_CHECKPOINT_INTERVAL boundary. 
- checkpoint.update_next_xid(1024); - assert_eq!(checkpoint.nextXid.value, 2048); - } -} diff --git a/pre-commit.py b/pre-commit.py index 1e886e403b..560df6cd0c 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -from typing import List +import argparse +import enum import subprocess import sys -import enum -import argparse -import os +from typing import List @enum.unique @@ -29,7 +28,7 @@ def colorify( def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: - cmd = "rustfmt --edition=2018" + cmd = "rustfmt --edition=2021" if not fix_inplace: cmd += " --check" if no_color: @@ -37,15 +36,24 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def yapf(fix_inplace: bool) -> str: - cmd = "poetry run yapf --recursive" - if fix_inplace: - cmd += " --in-place" - else: - cmd += " --diff" +def black(fix_inplace: bool) -> str: + cmd = "poetry run black" + if not fix_inplace: + cmd += " --diff --check" return cmd +def isort(fix_inplace: bool) -> str: + cmd = "poetry run isort" + if not fix_inplace: + cmd += " --diff --check" + return cmd + + +def flake8() -> str: + return "poetry run flake8" + + def mypy() -> str: return "poetry run mypy" @@ -71,11 +79,13 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: else: print("Please inspect the output below and run make fmt to fix automatically.") if suffix == ".py": - print("If the output is empty, ensure that you've installed Python tooling by\n" - "running './scripts/pysync' in the current directory (no root needed)") + print( + "If the output is empty, ensure that you've installed Python tooling by\n" + "running './scripts/pysync' in the current directory (no root needed)" + ) print() print(res.stdout.decode()) - exit(1) + sys.exit(1) print(colorify("[OK]", Color.GREEN, no_color)) @@ -83,10 +93,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace") - parser.add_argument("--no-color", - action="store_true", - help="disable colored output", - default=not sys.stdout.isatty()) + parser.add_argument( + "--no-color", + action="store_true", + help="disable colored output", + default=not sys.stdout.isatty(), + ) args = parser.parse_args() files = get_commit_files() @@ -101,9 +113,23 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="yapf", + name="isort", suffix=".py", - cmd=yapf(fix_inplace=args.fix_inplace), + cmd=isort(fix_inplace=args.fix_inplace), + changed_files=files, + no_color=args.no_color, + ) + check( + name="black", + suffix=".py", + cmd=black(fix_inplace=args.fix_inplace), + changed_files=files, + no_color=args.no_color, + ) + check( + name="flake8", + suffix=".py", + cmd=flake8(), changed_files=files, no_color=args.no_color, ) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8d5cbe5bf..14a5450d5e 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,29 +5,49 @@ edition = "2021" [dependencies] anyhow = "1.0" +atty = "0.2.14" +base64 = "0.13.0" +bstr = "1.0" bytes = { version = "1.0.1", features = ['serde'] } -clap = "3.0" +clap = "4.0" futures = "0.3.13" -hashbrown = "0.11.2" +git-version = "0.3.5" +hashbrown = "0.12" hex = "0.4.3" +hmac = "0.12.1" hyper = "0.14" -lazy_static = "1.4.0" +itertools = "0.10.3" md5 = "0.7.0" -parking_lot = "0.11.2" +once_cell = "1.13.0" +parking_lot = "0.12" pin-project-lite = "0.2.7" rand = 
"0.8.3" -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } -rustls = "0.19.1" +reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] } +routerify = "3" +rustls = "0.20.0" +rustls-pemfile = "1" scopeguard = "1.1.0" serde = "1" serde_json = "1" -tokio = { version = "1.11", features = ["macros"] } -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -tokio-rustls = "0.22.0" +sha2 = "0.10.2" +socket2 = "0.4.4" +thiserror = "1.0.30" +tokio = { version = "1.17", features = ["macros"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-rustls = "0.23.0" +tracing = "0.1.36" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +url = "2.2.2" +uuid = { version = "1.2", features = ["v4", "serde"] } +x509-parser = "0.14" -zenith_utils = { path = "../zenith_utils" } -zenith_metrics = { path = "../zenith_metrics" } +metrics = { path = "../libs/metrics" } +pq_proto = { path = "../libs/pq_proto" } +utils = { path = "../libs/utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } [dev-dependencies] -tokio-postgres-rustls = "0.8.0" -rcgen = "0.8.14" +async-trait = "0.1" +rcgen = "0.10" +rstest = "0.15" +tokio-postgres-rustls = "0.9.0" diff --git a/proxy/README.md b/proxy/README.md new file mode 100644 index 0000000000..4ead098b73 --- /dev/null +++ b/proxy/README.md @@ -0,0 +1,30 @@ +# Proxy + +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: + +* console + new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) +* postgres + uses postgres to select auth secrets of existing roles. Useful for local testing +* link + sends login link for all usernames + +## Using SNI-based routing on localhost + +Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: + +```sh +openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me" +``` + +start proxy + +```sh +./target/debug/proxy -c server.crt -k server.key +``` + +and connect to it + +```sh +PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full' +``` diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a5bdaeaeca..2df4f9d920 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,169 +1,87 @@ -use crate::compute::DatabaseInfo; -use crate::config::ProxyConfig; -use crate::cplane_api::{self, CPlaneApi}; -use crate::stream::PqStream; -use anyhow::{anyhow, bail, Context}; -use std::collections::HashMap; -use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe}; +//! Client authentication mechanisms. -/// Various client credentials which we use for authentication. 
-#[derive(Debug, PartialEq, Eq)] -pub struct ClientCredentials { - pub user: String, - pub dbname: String, +pub mod backend; +pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo}; + +mod credentials; +pub use credentials::ClientCredentials; + +mod password_hack; +use password_hack::PasswordHackPayload; + +mod flow; +pub use flow::*; + +use crate::error::UserFacingError; +use std::io; +use thiserror::Error; + +/// Convenience wrapper for the authentication error. +pub type Result = std::result::Result; + +/// Common authentication error. +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + #[error(transparent)] + Link(#[from] backend::LinkAuthError), + + #[error(transparent)] + GetAuthInfo(#[from] backend::GetAuthInfoError), + + #[error(transparent)] + WakeCompute(#[from] backend::WakeComputeError), + + /// SASL protocol errors (includes [SCRAM](crate::scram)). + #[error(transparent)] + Sasl(#[from] crate::sasl::Error), + + #[error("Unsupported authentication method: {0}")] + BadAuthMethod(Box), + + #[error("Malformed password message: {0}")] + MalformedPassword(&'static str), + + #[error( + "Project ID is not specified. \ + Either please upgrade the postgres client library (libpq) for SNI support \ + or pass the project ID (first part of the domain name) as a parameter: '?options=project%3D'. \ + See more at https://neon.tech/sni" + )] + MissingProjectName, + + /// Errors produced by e.g. [`crate::stream::PqStream`]. + #[error(transparent)] + Io(#[from] io::Error), } -impl TryFrom> for ClientCredentials { - type Error = anyhow::Error; +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); - fn try_from(mut value: HashMap) -> Result { - let mut get_param = |key| { - value - .remove(key) - .with_context(|| format!("{} is missing in startup packet", key)) - }; - - let user = get_param("user")?; - let db = get_param("database")?; - - Ok(Self { user, dbname: db }) +impl AuthError { + pub fn bad_auth_method(name: impl Into>) -> Self { + AuthErrorImpl::BadAuthMethod(name.into()).into() } } -impl ClientCredentials { - /// Use credentials to authenticate the user. - pub async fn authenticate( - self, - config: &ProxyConfig, - client: &mut PqStream, - ) -> anyhow::Result { - use crate::config::ClientAuthMethod::*; - use crate::config::RouterConfig::*; - let db_info = match &config.router_config { - Static { host, port } => handle_static(host.clone(), *port, client, self).await, - Dynamic(Mixed) => { - if self.user.ends_with("@zenith") { - handle_existing_user(config, client, self).await - } else { - handle_new_user(config, client).await - } - } - Dynamic(Password) => handle_existing_user(config, client, self).await, - Dynamic(Link) => handle_new_user(config, client).await, - }; - - db_info.context("failed to authenticate client") +impl> From for AuthError { + fn from(e: E) -> Self { + Self(Box::new(e.into())) } } -fn new_psql_session_id() -> String { - hex::encode(rand::random::<[u8; 8]>()) -} - -async fn handle_static( - host: String, - port: u16, - client: &mut PqStream, - creds: ClientCredentials, -) -> anyhow::Result { - client - .write_message(&Be::AuthenticationCleartextPassword) - .await?; - - // Read client's password bytes - let msg = match client.read_message().await? 
{ - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap(); - - let db_info = DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: Some(cleartext_password.into()), - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(db_info) -} - -async fn handle_existing_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> anyhow::Result { - let psql_session_id = new_psql_session_id(); - let md5_salt = rand::random(); - - client - .write_message(&Be::AuthenticationMD5Password(&md5_salt)) - .await?; - - // Read client's password hash - let msg = match client.read_message().await? { - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let (_trailing_null, md5_response) = msg - .split_last() - .ok_or_else(|| anyhow!("unexpected password message"))?; - - let cplane = CPlaneApi::new(&config.auth_endpoint); - let db_info = cplane - .authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id) - .await?; - - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(db_info) -} - -async fn handle_new_user( - config: &ProxyConfig, - client: &mut PqStream, -) -> anyhow::Result { - let psql_session_id = new_psql_session_id(); - let greeting = hello_message(&config.redirect_uri, &psql_session_id); - - let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async { - // Give user a URL to spawn a new database - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&Be::NoticeResponse(greeting)) - .await?; - - // Wait for web console response - waiter.await?.map_err(|e| anyhow!(e)) - }) - .await?; - - client.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?; - - Ok(db_info) -} - -fn hello_message(redirect_uri: &str, session_id: &str) -> String { - format!( - concat![ - "☀️ Welcome to Zenith!\n", - "To proceed with database creation, open the following link:\n\n", - " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." 
- ], - redirect_uri = redirect_uri, - session_id = session_id, - ) +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + Link(e) => e.to_string_client(), + GetAuthInfo(e) => e.to_string_client(), + WakeCompute(e) => e.to_string_client(), + Sasl(e) => e.to_string_client(), + BadAuthMethod(_) => self.to_string(), + MalformedPassword(_) => self.to_string(), + MissingProjectName => self.to_string(), + _ => "Internal error".to_string(), + } + } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs new file mode 100644 index 0000000000..bb919770c1 --- /dev/null +++ b/proxy/src/auth/backend.rs @@ -0,0 +1,238 @@ +mod postgres; + +mod link; +pub use link::LinkAuthError; + +mod console; +pub use console::{GetAuthInfoError, WakeComputeError}; + +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + compute, http, mgmt, stream, url, + waiters::{self, Waiter, Waiters}, +}; +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; +use std::borrow::Cow; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); + +/// Give caller an opportunity to wait for the cloud's reply. +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. +impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish_non_exhaustive() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +/// Extra query params we'd like to pass to the console. +pub struct ConsoleReqExtra<'a> { + /// A unique identifier for a connection. + pub session_id: uuid::Uuid, + /// Name of client application, if set. + pub application_name: Option<&'a str>, +} + +/// This type serves two purposes: +/// +/// * When `T` is `()`, it's just a regular auth backend selector +/// which we use in [`crate::config::ProxyConfig`]. +/// +/// * However, when we substitute `T` with [`ClientCredentials`], +/// this helps us provide the credentials only to those auth +/// backends which require them for the authentication process. +#[derive(Debug)] +pub enum BackendType<'a, T> { + /// Current Cloud API (V2). + Console(Cow<'a, http::Endpoint>, T), + /// Local mock of Cloud API (V2). + Postgres(Cow<'a, url::ApiUrl>, T), + /// Authentication via a web browser. 
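+    /// The user is sent a one-time login URL and the proxy then waits for the
+    /// console's reply (see `backend/link.rs`); no client credentials are needed.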
+ Link(Cow<'a, url::ApiUrl>), +} + +impl std::fmt::Display for BackendType<'_, ()> { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use BackendType::*; + match self { + Console(endpoint, _) => fmt + .debug_tuple("Console") + .field(&endpoint.url().as_str()) + .finish(), + Postgres(endpoint, _) => fmt + .debug_tuple("Postgres") + .field(&endpoint.as_str()) + .finish(), + Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + } + } +} + +impl BackendType<'_, T> { + /// Very similar to [`std::option::Option::as_ref`]. + /// This helps us pass structured config to async tasks. + pub fn as_ref(&self) -> BackendType<'_, &T> { + use BackendType::*; + match self { + Console(c, x) => Console(Cow::Borrowed(c), x), + Postgres(c, x) => Postgres(Cow::Borrowed(c), x), + Link(c) => Link(Cow::Borrowed(c)), + } + } +} + +impl<'a, T> BackendType<'a, T> { + /// Very similar to [`std::option::Option::map`]. + /// Maps [`BackendType`] to [`BackendType`] by applying + /// a function to a contained value. + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + use BackendType::*; + match self { + Console(c, x) => Console(c, f(x)), + Postgres(c, x) => Postgres(c, f(x)), + Link(c) => Link(c), + } + } +} + +impl<'a, T, E> BackendType<'a, Result> { + /// Very similar to [`std::option::Option::transpose`]. + /// This is most useful for error handling. + pub fn transpose(self) -> Result, E> { + use BackendType::*; + match self { + Console(c, x) => x.map(|x| Console(c, x)), + Postgres(c, x) => x.map(|x| Postgres(c, x)), + Link(c) => Ok(Link(c)), + } + } +} + +impl BackendType<'_, ClientCredentials<'_>> { + /// Authenticate the client via the requested backend, possibly using credentials. + pub async fn authenticate( + mut self, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, + ) -> super::Result { + use BackendType::*; + + if let Console(_, creds) | Postgres(_, creds) = &mut self { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the project name. + // We now expect to see a very specific payload in the place of password. + if creds.project().is_none() { + warn!("project name not specified, resorting to the password hack auth flow"); + + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + // Finally we may finish the initialization of `creds`. + // TODO: add missing type safety to ClientCredentials. + info!(project = &payload.project, "received missing parameter"); + creds.project = Some(payload.project.into()); + + let mut config = match &self { + Console(endpoint, creds) => { + console::Api::new(endpoint, extra, creds) + .wake_compute() + .await? + } + Postgres(endpoint, creds) => { + postgres::Api::new(endpoint, creds).wake_compute().await? + } + _ => unreachable!("see the patterns above"), + }; + + // We should use a password from payload as well. 
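+            // (The payload was parsed from the password field as `project=<name>;<password>`,
+            // so everything after the first ';' is the user's actual password.)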
+ config.password(payload.password); + + info!("user successfully authenticated (using the password hack)"); + return Ok(compute::NodeInfo { + reported_auth_ok: false, + config, + }); + } + } + + let res = match self { + Console(endpoint, creds) => { + info!( + user = creds.user, + project = creds.project(), + "performing authentication using the console" + ); + console::Api::new(&endpoint, extra, &creds) + .handle_user(client) + .await + } + Postgres(endpoint, creds) => { + info!("performing mock authentication using a local postgres instance"); + postgres::Api::new(&endpoint, &creds) + .handle_user(client) + .await + } + // NOTE: this auth backend doesn't use client credentials. + Link(url) => { + info!("performing link authentication"); + link::handle_user(&url, client).await + } + }?; + + info!("user successfully authenticated"); + Ok(res) + } +} diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs new file mode 100644 index 0000000000..cf99aa08ef --- /dev/null +++ b/proxy/src/auth/backend/console.rs @@ -0,0 +1,267 @@ +//! Cloud API V2. + +use super::ConsoleReqExtra; +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + compute::{self, ComputeConnCfg}, + error::{io_error, UserFacingError}, + http, scram, + stream::PqStream, +}; +use futures::TryFutureExt; +use serde::{Deserialize, Serialize}; +use std::future::Future; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{error, info, info_span}; + +const REQUEST_FAILED: &str = "Console request failed"; + +#[derive(Debug, Error)] +#[error("{}", REQUEST_FAILED)] +pub struct TransportError(#[from] std::io::Error); + +impl UserFacingError for TransportError {} + +// Helps eliminate graceless `.map_err` calls without introducing another ctor. +impl From for TransportError { + fn from(e: reqwest::Error) -> Self { + io_error(e).into() + } +} + +#[derive(Debug, Error)] +pub enum GetAuthInfoError { + // We shouldn't include the actual secret here. + #[error("Console responded with a malformed auth secret")] + BadSecret, + + #[error(transparent)] + Transport(TransportError), +} + +impl UserFacingError for GetAuthInfoError { + fn to_string_client(&self) -> String { + use GetAuthInfoError::*; + match self { + BadSecret => REQUEST_FAILED.to_owned(), + Transport(e) => e.to_string_client(), + } + } +} + +impl> From for GetAuthInfoError { + fn from(e: E) -> Self { + Self::Transport(e.into()) + } +} + +#[derive(Debug, Error)] +pub enum WakeComputeError { + // We shouldn't show users the address even if it's broken. + #[error("Console responded with a malformed compute address: {0}")] + BadComputeAddress(String), + + #[error(transparent)] + Transport(TransportError), +} + +impl UserFacingError for WakeComputeError { + fn to_string_client(&self) -> String { + use WakeComputeError::*; + match self { + BadComputeAddress(_) => REQUEST_FAILED.to_owned(), + Transport(e) => e.to_string_client(), + } + } +} + +impl> From for WakeComputeError { + fn from(e: E) -> Self { + Self::Transport(e.into()) + } +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + + /// [SCRAM](crate::scram) authentication info. 
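+    /// Parsed from the console's `role_secret` field via [`scram::ServerSecret::parse`];
+    /// the mock backend in `super::postgres` builds the same value from `pg_authid.rolpassword`.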
+ Scram(scram::ServerSecret), +} + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials<'a>, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. + pub(super) fn new( + endpoint: &'a http::Endpoint, + extra: &'a ConsoleReqExtra<'a>, + creds: &'a ClientCredentials, + ) -> Self { + Self { + endpoint, + extra, + creds, + } + } + + /// Authenticate the existing user or throw an error. + pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + async fn get_auth_info(&self) -> Result { + let request_id = uuid::Uuid::new_v4().to_string(); + let req = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; + + let span = info_span!("http", id = request_id, url = req.url().as_str()); + info!(parent: &span, "request auth info"); + let msg = self + .endpoint + .checked_execute(req) + .and_then(|r| r.json::()) + .await + .map_err(|e| { + error!(parent: &span, "{e}"); + e + })?; + + scram::ServerSecret::parse(&msg.role_secret) + .map(AuthInfo::Scram) + .ok_or(GetAuthInfoError::BadSecret) + } + + /// Wake up the compute node and return the corresponding connection info. + pub(super) async fn wake_compute(&self) -> Result { + let request_id = uuid::Uuid::new_v4().to_string(); + let req = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; + + let span = info_span!("http", id = request_id, url = req.url().as_str()); + info!(parent: &span, "request wake-up"); + let msg = self + .endpoint + .checked_execute(req) + .and_then(|r| r.json::()) + .await + .map_err(|e| { + error!(parent: &span, "{e}"); + e + })?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&msg.address) { + None => return Err(WakeComputeError::BadComputeAddress(msg.address)), + Some(x) => x, + }; + + let mut config = ComputeConnCfg::new(); + config + .host(host) + .port(port) + .dbname(self.creds.dbname) + .user(self.creds.user); + + Ok(config) + } +} + +/// Common logic for user handling in API V2. +/// We reuse this for a mock API implementation in [`super::postgres`]. 
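+///
+/// In short: fetch the auth secret with `get_auth_info`, run the SASL/SCRAM
+/// exchange against the client, then `wake_compute` and attach the resulting
+/// SCRAM keys to the compute node's connection config.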
+pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( + client: &mut PqStream, + endpoint: &'a Endpoint, + get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, + wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, +) -> auth::Result +where + GetAuthInfo: Future>, + WakeCompute: Future>, +{ + info!("fetching user's authentication info"); + let auth_info = get_auth_info(endpoint).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + info!("auth endpoint chooses MD5"); + return Err(auth::AuthError::bad_auth_method("MD5")); + } + AuthInfo::Scram(secret) => { + info!("auth endpoint chooses SCRAM"); + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + let mut config = wake_compute(endpoint).await?; + if let Some(keys) = scram_keys { + config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + } + + Ok(compute::NodeInfo { + reported_auth_ok: false, + config, + }) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.split_once(':')?; + Some((host, port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs new file mode 100644 index 0000000000..96c6f0ba18 --- /dev/null +++ b/proxy/src/auth/backend/link.rs @@ -0,0 +1,78 @@ +use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; +use pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, info_span}; + +#[derive(Debug, Error)] +pub enum LinkAuthError { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl UserFacingError for LinkAuthError { + fn to_string_client(&self) -> String { + use LinkAuthError::*; + match self { + AuthFailed(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { + format!( + concat![ + "Welcome to Neon!\n", + "Authenticate by visiting:\n", + " {redirect_uri}{session_id}\n\n", + ], + redirect_uri = redirect_uri, + session_id = session_id, + ) +} + +pub fn new_psql_session_id() -> String { + hex::encode(rand::random::<[u8; 8]>()) +} + +pub async fn handle_user( + link_uri: &reqwest::Url, + client: &mut PqStream, +) -> auth::Result { + let psql_session_id = new_psql_session_id(); + let span = info_span!("link", psql_session_id = &psql_session_id); + let greeting = hello_message(link_uri, &psql_session_id); + + let db_info = super::with_waiter(psql_session_id, |waiter| async { + // Give user a URL to spawn a new database. + info!(parent: &span, "sending the auth URL to the user"); + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())? 
+ .write_message(&Be::NoticeResponse(&greeting)) + .await?; + + // Wait for web console response (see `mgmt`). + info!(parent: &span, "waiting for console's reply..."); + waiter.await?.map_err(LinkAuthError::AuthFailed) + }) + .await?; + + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; + + Ok(compute::NodeInfo { + reported_auth_ok: true, + config: db_info.into(), + }) +} diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs new file mode 100644 index 0000000000..2055ee14c8 --- /dev/null +++ b/proxy/src/auth/backend/postgres.rs @@ -0,0 +1,95 @@ +//! Local mock of Cloud API V2. + +use crate::{ + auth::{ + self, + backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError}, + ClientCredentials, + }, + compute::{self, ComputeConnCfg}, + error::io_error, + scram, + stream::PqStream, + url::ApiUrl, +}; +use tokio::io::{AsyncRead, AsyncWrite}; + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials<'a>, +} + +// Helps eliminate graceless `.map_err` calls without introducing another ctor. +impl From for TransportError { + fn from(e: tokio_postgres::Error) -> Self { + io_error(e).into() + } +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self { + Self { endpoint, creds } + } + + /// Authenticate the existing user or throw an error. + pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + // We reuse user handling logic from a production module. + console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + /// This implementation fetches the auth info from a local postgres instance. + async fn get_auth_info(&self) -> Result { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client.query(query, &[&self.creds.user]).await?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row + .try_get("rolpassword") + .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(GetAuthInfoError::BadSecret) + } + } + } + + /// We don't need to wake anything locally, so we just return the connection info. 
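+    /// Host and port are taken straight from the mock endpoint URL (falling back
+    /// to `localhost:5432`), while dbname/user come from the client credentials.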
+ pub(super) async fn wake_compute(&self) -> Result { + let mut config = ComputeConnCfg::new(); + config + .host(self.endpoint.host_str().unwrap_or("localhost")) + .port(self.endpoint.port().unwrap_or(5432)) + .dbname(self.creds.dbname) + .user(self.creds.user); + + Ok(config) + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs new file mode 100644 index 0000000000..907f99b8e0 --- /dev/null +++ b/proxy/src/auth/credentials.rs @@ -0,0 +1,206 @@ +//! User credentials used in authentication. + +use crate::error::UserFacingError; +use pq_proto::StartupMessageParams; +use std::borrow::Cow; +use thiserror::Error; +use tracing::info; + +#[derive(Debug, Error, PartialEq, Eq, Clone)] +pub enum ClientCredsParseError { + #[error("Parameter '{0}' is missing in startup packet.")] + MissingKey(&'static str), + + #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] + InconsistentProjectNames(String, String), + + #[error( + "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ + SNI should be formatted as '.{0}'." + )] + InconsistentSni(String, String), + + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] + MalformedProjectName(String), +} + +impl UserFacingError for ClientCredsParseError {} + +/// Various client credentials which we use for authentication. +/// Note that we don't store any kind of client key or password here. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ClientCredentials<'a> { + pub user: &'a str, + pub dbname: &'a str, + pub project: Option>, +} + +impl ClientCredentials<'_> { + pub fn project(&self) -> Option<&str> { + self.project.as_deref() + } +} + +impl<'a> ClientCredentials<'a> { + pub fn parse( + params: &'a StartupMessageParams, + sni: Option<&str>, + common_name: Option<&str>, + ) -> Result { + use ClientCredsParseError::*; + + // Some parameters are stored in the startup message. + let get_param = |key| params.get(key).ok_or(MissingKey(key)); + let user = get_param("user")?; + let dbname = get_param("database")?; + + // Project name might be passed via PG's command-line options. + let project_a = params.options_raw().and_then(|mut options| { + options + .find_map(|opt| opt.strip_prefix("project=")) + .map(Cow::Borrowed) + }); + + // Alternative project name is in fact a subdomain from SNI. + // NOTE: we do not consider SNI if `common_name` is missing. + let project_b = sni + .zip(common_name) + .map(|(sni, cn)| { + subdomain_from_sni(sni, cn) + .ok_or_else(|| InconsistentSni(sni.into(), cn.into())) + .map(Cow::<'static, str>::Owned) + }) + .transpose()?; + + let project = match (project_a, project_b) { + // Invariant: if we have both project name variants, they should match. + (Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a.into(), b.into()))), + // Invariant: project name may not contain certain characters. + (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + false => Err(MalformedProjectName(name.into())), + true => Ok(name), + }), + } + .transpose()?; + + info!( + user = user, + dbname = dbname, + project = project.as_deref(), + "credentials" + ); + + Ok(Self { + user, + dbname, + project, + }) + } +} + +fn project_name_valid(name: &str) -> bool { + name.chars().all(|c| c.is_alphanumeric() || c == '-') +} + +fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { + sni.strip_suffix(common_name)? 
+ .strip_suffix('.') + .map(str::to_owned) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[ignore = "TODO: fix how database is handled"] + fn parse_bare_minimum() -> anyhow::Result<()> { + // According to postgresql, only `user` should be required. + let options = StartupMessageParams::new([("user", "john_doe")]); + + // TODO: check that `creds.dbname` is None. + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + + Ok(()) + } + + #[test] + fn parse_missing_project() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project, None); + + Ok(()) + } + + #[test] + fn parse_project_from_sni() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]); + + let sni = Some("foo.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(&options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("foo")); + + Ok(()) + } + + #[test] + fn parse_project_from_options() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("database", "world"), + ("options", "-ckey=1 project=bar -c geqo=off"), + ]); + + let creds = ClientCredentials::parse(&options, None, None)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("bar")); + + Ok(()) + } + + #[test] + fn parse_projects_identical() -> anyhow::Result<()> { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("database", "world"), + ("options", "project=baz"), + ]); + + let sni = Some("baz.localhost"); + let common_name = Some("localhost"); + + let creds = ClientCredentials::parse(&options, sni, common_name)?; + assert_eq!(creds.user, "john_doe"); + assert_eq!(creds.dbname, "world"); + assert_eq!(creds.project.as_deref(), Some("baz")); + + Ok(()) + } + + #[test] + fn parse_projects_different() { + let options = StartupMessageParams::new([ + ("user", "john_doe"), + ("database", "world"), + ("options", "project=first"), + ]); + + let sni = Some("second.localhost"); + let common_name = Some("localhost"); + + assert!(matches!( + ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"), + ClientCredsParseError::InconsistentProjectNames(_, _) + )); + } +} diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs new file mode 100644 index 0000000000..865af4d2e5 --- /dev/null +++ b/proxy/src/auth/flow.rs @@ -0,0 +1,110 @@ +//! Main authentication flow. + +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::{sasl, scram, stream::PqStream}; +use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; +use std::io; +use tokio::io::{AsyncRead, AsyncWrite}; + +/// Every authentication selector is supposed to implement this trait. +pub trait AuthMethod { + /// Any authentication selector should provide initial backend message + /// containing auth method name and parameters, e.g. md5 salt. + fn first_message(&self) -> BeMessage<'_>; +} + +/// Initial state of [`AuthFlow`]. +pub struct Begin; + +/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. 
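+///
+/// A rough usage sketch (not compiled; `client` and `secret` are placeholders,
+/// mirroring how `auth::backend::console` drives the flow):
+///
+/// ```ignore
+/// let scram_key = AuthFlow::new(client)   // wrap the client's PqStream
+///     .begin(Scram(&secret))              // advertise the SASL/SCRAM methods
+///     .await?
+///     .authenticate()                     // run the SCRAM exchange
+///     .await?;
+/// ```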
+pub struct Scram<'a>(pub &'a scram::ServerSecret); + +impl AuthMethod for Scram<'_> { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS)) + } +} + +/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in +/// . +pub struct PasswordHack; + +impl AuthMethod for PasswordHack { + #[inline(always)] + fn first_message(&self) -> BeMessage<'_> { + Be::AuthenticationCleartextPassword + } +} + +/// This wrapper for [`PqStream`] performs client authentication. +#[must_use] +pub struct AuthFlow<'a, Stream, State> { + /// The underlying stream which implements libpq's protocol. + stream: &'a mut PqStream, + /// State might contain ancillary data (see [`Self::begin`]). + state: State, +} + +/// Initial state of the stream wrapper. +impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { + /// Create a new wrapper for client authentication. + pub fn new(stream: &'a mut PqStream) -> Self { + Self { + stream, + state: Begin, + } + } + + /// Move to the next step by sending auth method's name & params to client. + pub async fn begin(self, method: M) -> io::Result> { + self.stream.write_message(&method.first_message()).await?; + + Ok(AuthFlow { + stream: self.stream, + state: method, + }) + } +} + +impl AuthFlow<'_, S, PasswordHack> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result { + let msg = self.stream.read_password_message().await?; + let password = msg + .strip_suffix(&[0]) + .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + + let payload = PasswordHackPayload::parse(password) + // If we ended up here and the payload is malformed, it means that + // the user neither enabled SNI nor resorted to any other method + // for passing the project name we rely on. We should show them + // the most helpful error message and point to the documentation. + .ok_or(AuthErrorImpl::MissingProjectName)?; + + Ok(payload) + } +} + +/// Stream wrapper for handling [SCRAM](crate::scram) auth. +impl AuthFlow<'_, S, Scram<'_>> { + /// Perform user authentication. Raise an error in case authentication failed. + pub async fn authenticate(self) -> super::Result { + // Initial client message contains the chosen auth method's name. + let msg = self.stream.read_password_message().await?; + let sasl = sasl::FirstMessage::parse(&msg) + .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + + // Currently, the only supported SASL method is SCRAM. + if !scram::METHODS.contains(&sasl.method) { + return Err(super::AuthError::bad_auth_method(sasl.method)); + } + + let secret = self.state.0; + let key = sasl::SaslStream::new(self.stream, sasl.message) + .authenticate(scram::Exchange::new(secret, rand::random, None)) + .await?; + + Ok(key) + } +} diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs new file mode 100644 index 0000000000..639809e18a --- /dev/null +++ b/proxy/src/auth/password_hack.rs @@ -0,0 +1,46 @@ +//! Payload for ad hoc authentication method for clients that don't support SNI. +//! See the `impl` for [`super::backend::BackendType`]. +//! Read more: . +//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. + +use bstr::ByteSlice; + +pub struct PasswordHackPayload { + pub project: String, + pub password: Vec, +} + +impl PasswordHackPayload { + pub fn parse(bytes: &[u8]) -> Option { + // The format is `project=;`. 
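+        // That is, `project=<project-name>;<password-bytes>`: everything after the
+        // first ';' is treated as the password, so `project=foobar;pass;word` yields
+        // project `foobar` and password `pass;word` (see the tests below).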
+ let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";"); + let project = iter.next()?.to_str().ok()?.to_owned(); + let password = iter.next()?.to_owned(); + + Some(Self { project, password }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_password_hack_payload() { + let bytes = b""; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"project="; + assert!(PasswordHackPayload::parse(bytes).is_none()); + + let bytes = b"project=;"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.project, ""); + assert_eq!(payload.password, b""); + + let bytes = b"project=foobar;pass;word"; + let payload = PasswordHackPayload::parse(bytes).expect("parsing failed"); + assert_eq!(payload.project, "foobar"); + assert_eq!(payload.password, b"pass;word"); + } +} diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c1a7e81be9..b219cd0fa2 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,12 +1,13 @@ use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; +use pq_proto::CancelKeyData; use std::net::SocketAddr; use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; -use zenith_utils::pq_proto::CancelKeyData; +use tracing::info; -/// Enables serving CancelRequests. +/// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(Mutex>>); @@ -18,8 +19,9 @@ impl CancelMap { .lock() .get(&key) .and_then(|x| x.clone()) - .with_context(|| format!("unknown session: {:?}", key))?; + .with_context(|| format!("query cancellation key not found: {key}"))?; + info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -41,17 +43,29 @@ impl CancelMap { self.0 .lock() .try_insert(key, None) - .map_err(|_| anyhow!("session already exists: {:?}", key))?; + .map_err(|_| anyhow!("query cancellation key already exists: {key}"))?; // This will guarantee that the session gets dropped // as soon as the future is finished. scopeguard::defer! { self.0.lock().remove(&key); + info!("dropped query cancellation key {key}"); } + info!("registered new query cancellation key {key}"); let session = Session::new(key, self); f(session).await } + + #[cfg(test)] + fn contains(&self, session: &Session) -> bool { + self.0.lock().contains_key(&session.key) + } + + #[cfg(test)] + fn is_empty(&self) -> bool { + self.0.lock().is_empty() + } } /// This should've been a [`std::future::Future`], but @@ -92,10 +106,13 @@ impl<'a> Session<'a> { fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { Self { key, cancel_map } } +} +impl Session<'_> { /// Store the cancel token for the given session. /// This enables query cancellation in [`crate::proxy::handshake`]. 
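+    ///
+    /// A sketch of the intended call site (not compiled; `session` and
+    /// `cancel_closure` are placeholders, the latter coming from
+    /// `compute::NodeInfo::connect`):
+    ///
+    /// ```ignore
+    /// let key: CancelKeyData = session.enable_query_cancellation(cancel_closure);
+    /// // `key` is what incoming CancelRequest packets are later matched against.
+    /// ```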
- pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { + info!("enabling query cancellation for this session"); self.cancel_map .0 .lock() @@ -104,3 +121,39 @@ impl<'a> Session<'a> { self.key } } + +#[cfg(test)] +mod tests { + use super::*; + use once_cell::sync::Lazy; + + #[tokio::test] + async fn check_session_drop() -> anyhow::Result<()> { + static CANCEL_MAP: Lazy = Lazy::new(Default::default); + + let (tx, rx) = tokio::sync::oneshot::channel(); + let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { + assert!(CANCEL_MAP.contains(&session)); + + tx.send(()).expect("failed to send"); + futures::future::pending::<()>().await; // sleep forever + + Ok(()) + })); + + // Wait until the task has been spawned. + rx.await.context("failed to hear from the task")?; + + // Drop the session's entry by cancelling the task. + task.abort(); + let error = task.await.expect_err("task should have failed"); + if !error.is_cancelled() { + anyhow::bail!(error); + } + + // Check that the session has been dropped. + assert!(CANCEL_MAP.is_empty()); + + Ok(()) + } +} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c294bd488..4771c774a1 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,42 +1,175 @@ -use anyhow::Context; -use serde::{Deserialize, Serialize}; -use std::net::{SocketAddr, ToSocketAddrs}; +use crate::{cancellation::CancelClosure, error::UserFacingError}; +use futures::TryFutureExt; +use itertools::Itertools; +use pq_proto::StartupMessageParams; +use std::{io, net::SocketAddr}; +use thiserror::Error; +use tokio::net::TcpStream; +use tokio_postgres::NoTls; +use tracing::{error, info}; -/// Compute node connection params. -#[derive(Serialize, Deserialize, Debug, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - pub password: Option, +#[derive(Debug, Error)] +pub enum ConnectionError { + /// This error doesn't seem to reveal any secrets; for instance, + /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. + #[error("Failed to connect to the compute node: {0}")] + Postgres(#[from] tokio_postgres::Error), + + #[error("Failed to connect to the compute node")] + FailedToConnectToCompute, + + #[error("Failed to fetch compute node version")] + FailedToFetchPgVersion, } -impl DatabaseInfo { - pub fn socket_addr(&self) -> anyhow::Result { - let host_port = format!("{}:{}", self.host, self.port); - host_port - .to_socket_addrs() - .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? - .next() - .context("cannot resolve at least one SocketAddr") +impl UserFacingError for ConnectionError { + fn to_string_client(&self) -> String { + use ConnectionError::*; + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. + Postgres(err) => match err.as_db_error() { + Some(err) => err.message().to_string(), + None => err.to_string(), + }, + other => other.to_string(), + } } } -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); +/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. 
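+/// Assembled in `auth::backend::console::handle_user`: the client key falls out of
+/// the SCRAM exchange with the client and the server key comes from the stored
+/// secret, which lets the proxy authenticate to compute on the client's behalf.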
+pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); +pub type ComputeConnCfg = tokio_postgres::Config; - if let Some(password) = db_info.password { - config.password(password); +/// Various compute node info for establishing connection etc. +pub struct NodeInfo { + /// Did we send [`pq_proto::BeMessage::AuthenticationOk`]? + pub reported_auth_ok: bool, + /// Compute node connection params. + pub config: tokio_postgres::Config, +} + +impl NodeInfo { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { + use tokio_postgres::config::Host; + + let connect_once = |host, port| { + info!("trying to connect to a compute node at {host}:{port}"); + TcpStream::connect((host, port)).and_then(|socket| async { + let socket_addr = socket.peer_addr()?; + // This prevents load balancer from severing the connection. + socket2::SockRef::from(&socket).set_keepalive(true)?; + Ok((socket_addr, socket)) + }) + }; + + // We can't reuse connection establishing logic from `tokio_postgres` here, + // because it has no means for extracting the underlying socket which we + // require for our business. + let mut connection_error = None; + let ports = self.config.get_ports(); + let hosts = self.config.get_hosts(); + // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array + if ports.len() > 1 && ports.len() != hosts.len() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "couldn't connect: bad compute config, \ + ports and hosts entries' count does not match: {:?}", + self.config + ), + )); } - config + for (i, host) in hosts.iter().enumerate() { + let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); + let host = match host { + Host::Tcp(host) => host.as_str(), + Host::Unix(_) => continue, // unix sockets are not welcome here + }; + + // TODO: maybe we should add a timeout. + match connect_once(host, *port).await { + Ok(socket) => return Ok(socket), + Err(err) => { + // We can't throw an error here, as there might be more hosts to try. + error!("failed to connect to a compute node at {host}:{port}: {err}"); + connection_error = Some(err); + } + } + } + + Err(connection_error.unwrap_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + format!("couldn't connect: bad compute config: {:?}", self.config), + ) + })) + } +} + +pub struct PostgresConnection { + /// Socket connected to a compute node. + pub stream: TcpStream, + /// PostgreSQL version of this instance. + pub version: String, +} + +impl NodeInfo { + /// Connect to a corresponding compute node. + pub async fn connect( + mut self, + params: &StartupMessageParams, + ) -> Result<(PostgresConnection, CancelClosure), ConnectionError> { + if let Some(options) = params.options_raw() { + // We must drop all proxy-specific parameters. 
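+            // E.g. startup options `-ckey=1 project=foo -c geqo=off` are forwarded
+            // to compute as `-ckey=1 -c geqo=off`; the `project=` parameter only
+            // means something to the proxy itself.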
+ #[allow(unstable_name_collisions)] + let options: String = options + .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + self.config.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.config.application_name(app_name); + } + + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.config.replication_mode(ReplicationMode::Physical); + } + "database" => { + self.config.replication_mode(ReplicationMode::Logical); + } + _other => {} + } + } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + + let (socket_addr, mut stream) = self + .connect_raw() + .await + .map_err(|_| ConnectionError::FailedToConnectToCompute)?; + + // TODO: establish a secure connection to the DB + let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?; + let version = conn + .parameter("server_version") + .ok_or(ConnectionError::FailedToFetchPgVersion)? + .into(); + + info!("connected to user's compute node at {socket_addr}"); + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let db = PostgresConnection { stream, version }; + + Ok((db, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9ab64db795..031fa84509 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,81 +1,69 @@ -use anyhow::{anyhow, ensure, Context}; -use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; -use std::net::SocketAddr; -use std::str::FromStr; +use crate::auth; +use anyhow::{ensure, Context}; use std::sync::Arc; -pub type TlsConfig = Arc; - -#[non_exhaustive] -pub enum ClientAuthMethod { - Password, - Link, - - /// Use password auth only if username ends with "@zenith" - Mixed, +pub struct ProxyConfig { + pub tls_config: Option, + pub auth_backend: auth::BackendType<'static, ()>, } -pub enum RouterConfig { - Static { host: String, port: u16 }, - Dynamic(ClientAuthMethod), +pub struct TlsConfig { + pub config: Arc, + pub common_name: Option, } -impl FromStr for ClientAuthMethod { - type Err = anyhow::Error; - - fn from_str(s: &str) -> anyhow::Result { - use ClientAuthMethod::*; - match s { - "password" => Ok(Password), - "link" => Ok(Link), - "mixed" => Ok(Mixed), - _ => Err(anyhow::anyhow!("Invlid option for router")), - } +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() } } -pub struct ProxyConfig { - /// main entrypoint for users to connect to - pub proxy_address: SocketAddr, - - /// method of assigning compute nodes - pub router_config: RouterConfig, - - /// internally used for status and prometheus metrics - pub http_address: SocketAddr, - - /// management endpoint. Upon user account creation control plane - /// will notify us here, so that we can 'unfreeze' user session. - /// TODO It uses postgres protocol over TCP but should be migrated to http. - pub mgmt_address: SocketAddr, - - /// send unauthenticated users to this URI - pub redirect_uri: String, - - /// control plane address where we would check auth. - pub auth_endpoint: String, - - pub tls_config: Option, -} - -pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result { +/// Configure TLS for the main endpoint. 
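+///
+/// A minimal usage sketch (not compiled; `server.key`/`server.crt` are the
+/// self-signed pair from proxy/README.md, passed via the `-k`/`-c` flags):
+///
+/// ```ignore
+/// let tls = configure_tls("server.key", "server.crt")?;
+/// let cn = tls.common_name.clone();            // with the README's cert: Some("localtest.me")
+/// let server_config = tls.to_server_config();  // Arc<rustls::ServerConfig>
+/// ```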
+pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { - let key_bytes = std::fs::read(key_path).context("SSL key file")?; - let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .map_err(|_| anyhow!("couldn't read TLS keys"))?; + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().unwrap() + keys.pop().map(rustls::PrivateKey).unwrap() }; + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; let cert_chain = { - let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?; - pemfile::certs(&mut &cert_chain_bytes[..]) - .map_err(|_| anyhow!("couldn't read TLS certificates"))? + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() }; - let mut config = ServerConfig::new(NoClientAuth::new()); - config.set_single_cert(cert_chain, key)?; - config.versions = vec![ProtocolVersion::TLSv1_3]; + let config = rustls::ServerConfig::builder() + .with_safe_default_cipher_suites() + .with_safe_default_kx_groups() + // allow TLS 1.2 to be compatible with older client libraries + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); - Ok(config.into()) + // determine common name from tls-cert (-c server.crt param). + // used in asserting project name formatting invariant. + let common_name = { + let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes) + .context(format!( + "Failed to parse PEM object from bytes from file at '{cert_path}'." + ))? + .1; + let common_name = pem.parse_x509()?.subject().to_string(); + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + }; + + Ok(TlsConfig { + config, + common_name, + }) } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs deleted file mode 100644 index 187809717f..0000000000 --- a/proxy/src/cplane_api.rs +++ /dev/null @@ -1,120 +0,0 @@ -use crate::auth::ClientCredentials; -use crate::compute::DatabaseInfo; -use crate::waiters::{Waiter, Waiters}; -use anyhow::{anyhow, bail}; -use lazy_static::lazy_static; -use serde::{Deserialize, Serialize}; - -lazy_static! { - static ref CPLANE_WAITERS: Waiters> = Default::default(); -} - -/// Give caller an opportunity to wait for cplane's reply. -pub async fn with_waiter(psql_session_id: impl Into, f: F) -> anyhow::Result -where - F: FnOnce(Waiter<'static, Result>) -> R, - R: std::future::Future>, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - f(waiter).await -} - -pub fn notify(psql_session_id: &str, msg: Result) -> anyhow::Result<()> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} - -/// Zenith console API wrapper. 
-pub struct CPlaneApi<'a> { - auth_endpoint: &'a str, -} - -impl<'a> CPlaneApi<'a> { - pub fn new(auth_endpoint: &'a str) -> Self { - Self { auth_endpoint } - } -} - -impl CPlaneApi<'_> { - pub async fn authenticate_proxy_request( - &self, - creds: ClientCredentials, - md5_response: &[u8], - salt: &[u8; 4], - psql_session_id: &str, - ) -> anyhow::Result { - let mut url = reqwest::Url::parse(self.auth_endpoint)?; - url.query_pairs_mut() - .append_pair("login", &creds.user) - .append_pair("database", &creds.dbname) - .append_pair("md5response", std::str::from_utf8(md5_response)?) - .append_pair("salt", &hex::encode(salt)) - .append_pair("psql_session_id", psql_session_id); - - with_waiter(psql_session_id, |waiter| async { - println!("cplane request: {}", url); - // TODO: leverage `reqwest::Client` to reuse connections - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - bail!("Auth failed: {}", resp.status()) - } - - let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; - println!("got auth info: #{:?}", auth_info); - - use ProxyAuthResponse::*; - match auth_info { - Ready { conn_info } => Ok(conn_info), - Error { error } => bail!(error), - NotReady { .. } => waiter.await?.map_err(|e| anyhow!(e)), - } - }) - .await - } -} - -// NOTE: the order of constructors is important. -// https://serde.rs/enum-representations.html#untagged -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum ProxyAuthResponse { - Ready { conn_info: DatabaseInfo }, - Error { error: String }, - NotReady { ready: bool }, // TODO: get rid of `ready` -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_proxy_auth_response() { - // Ready - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": true, - "conn_info": DatabaseInfo::default(), - })) - .unwrap(); - assert!(matches!( - auth, - ProxyAuthResponse::Ready { - conn_info: DatabaseInfo { .. } - } - )); - - // Error - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - "error": "too bad, so sad", - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::Error { .. })); - - // NotReady - let auth: ProxyAuthResponse = serde_json::from_value(json!({ - "ready": false, - })) - .unwrap(); - assert!(matches!(auth, ProxyAuthResponse::NotReady { .. })); - } -} diff --git a/proxy/src/error.rs b/proxy/src/error.rs new file mode 100644 index 0000000000..0e376a37cd --- /dev/null +++ b/proxy/src/error.rs @@ -0,0 +1,24 @@ +use std::io; + +/// Marks errors that may be safely shown to a client. +/// This trait can be seen as a specialized version of [`ToString`]. +/// +/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it +/// is way too convenient and tends to proliferate all across the codebase, +/// ultimately leading to accidental leaks of sensitive data. +pub trait UserFacingError: ToString { + /// Format the error for client, stripping all sensitive info. + /// + /// Although this might be a no-op for many types, it's highly + /// recommended to override the default impl in case error type + /// contains anything sensitive: various IDs, IP addresses etc. + #[inline(always)] + fn to_string_client(&self) -> String { + self.to_string() + } +} + +/// Upcast (almost) any error into an opaque [`io::Error`]. 
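+///
+/// A typical call site (illustrative only) might look like
+/// `fallible_call().map_err(io_error)?` inside code that must return an `io::Error`.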
+pub fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 0b693d88dd..6f9145678b 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,30 +1,92 @@ -use anyhow::anyhow; -use hyper::{Body, Request, Response, StatusCode}; -use std::net::TcpListener; -use zenith_utils::http::endpoint; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::json::json_response; -use zenith_utils::http::{RouterBuilder, RouterService}; +pub mod server; -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) +use crate::url::ApiUrl; + +/// Thin convenience wrapper for an API provided by an http endpoint. +#[derive(Debug, Clone)] +pub struct Endpoint { + /// API's base URL. + endpoint: ApiUrl, + /// Connection manager with built-in pooling. + client: reqwest::Client, } -fn make_router() -> RouterBuilder { - let router = endpoint::make_router(); - router.get("/v1/status", status_handler) -} - -pub async fn thread_main(http_listener: TcpListener) -> anyhow::Result<()> { - scopeguard::defer! { - println!("http has shut down"); +impl Endpoint { + /// Construct a new HTTP endpoint wrapper. + pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { + Self { endpoint, client } } - let service = || RouterService::new(make_router().build()?); + #[inline(always)] + pub fn url(&self) -> &ApiUrl { + &self.endpoint + } - hyper::Server::from_tcp(http_listener)? - .serve(service().map_err(|e| anyhow!(e))?) - .await?; + /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// appending a single `path` segment to the base endpoint URL. + pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push(path); + self.client.get(url.into_inner()) + } - Ok(()) + /// Execute a [request](reqwest::Request). + pub async fn execute( + &self, + request: reqwest::Request, + ) -> Result { + self.client.execute(request).await + } + + /// Execute a [request](reqwest::Request) and raise an error if status != 200. + pub async fn checked_execute( + &self, + request: reqwest::Request, + ) -> Result { + self.execute(request) + .await + .and_then(|r| r.error_for_status()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn optional_query_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + // Validate that this pattern makes sense. 
+ let req = endpoint + .get("frobnicate") + .query(&[ + ("foo", Some("10")), // should be just `foo=10` + ("bar", None), // shouldn't be passed at all + ]) + .build()?; + + assert_eq!(req.url().as_str(), "http://example.com/frobnicate?foo=10"); + + Ok(()) + } + + #[test] + fn uuid_params() -> anyhow::Result<()> { + let url = "http://example.com".parse()?; + let endpoint = Endpoint::new(url, reqwest::Client::new()); + + let req = endpoint + .get("frobnicate") + .query(&[("session_id", uuid::Uuid::nil())]) + .build()?; + + assert_eq!( + req.url().as_str(), + "http://example.com/frobnicate?session_id=00000000-0000-0000-0000-000000000000" + ); + + Ok(()) + } } diff --git a/proxy/src/http/server.rs b/proxy/src/http/server.rs new file mode 100644 index 0000000000..05f6feb307 --- /dev/null +++ b/proxy/src/http/server.rs @@ -0,0 +1,28 @@ +use anyhow::anyhow; +use hyper::{Body, Request, Response, StatusCode}; +use std::net::TcpListener; +use tracing::info; +use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; + +async fn status_handler(_: Request) -> Result, ApiError> { + json_response(StatusCode::OK, "") +} + +fn make_router() -> RouterBuilder { + let router = endpoint::make_router(); + router.get("/v1/status", status_handler) +} + +pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<()> { + scopeguard::defer! { + info!("http has shut down"); + } + + let service = || RouterService::new(make_router().build()?); + + hyper::Server::from_tcp(http_listener)? + .serve(service().map_err(|e| anyhow!(e))?) + .await?; + + Ok(()) +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fb3bf725b8..2055616a6e 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -1,32 +1,37 @@ -/// -/// Postgres protocol proxy/router. -/// -/// This service listens psql port and can check auth via external service -/// (control plane API in our case) and can create new databases and accounts -/// in somewhat transparent manner (again via communication with control plane API). -/// -use anyhow::{bail, Context}; -use clap::{App, Arg}; -use config::ProxyConfig; -use futures::FutureExt; -use std::future::Future; -use tokio::{net::TcpListener, task::JoinError}; -use zenith_utils::GIT_VERSION; - -use crate::config::{ClientAuthMethod, RouterConfig}; +//! Postgres protocol proxy/router. +//! +//! This service listens psql port and can check auth via external service +//! (control plane API in our case) and can create new databases and accounts +//! in somewhat transparent manner (again via communication with control plane API). mod auth; mod cancellation; mod compute; mod config; -mod cplane_api; +mod error; mod http; mod mgmt; +mod parse; mod proxy; +mod sasl; +mod scram; mod stream; +mod url; mod waiters; -/// Flattens Result> into Result. +use anyhow::{bail, Context}; +use clap::{self, Arg}; +use config::ProxyConfig; +use futures::FutureExt; +use metrics::set_build_info_metric; +use std::{borrow::Cow, future::Future, net::SocketAddr}; +use tokio::{net::TcpListener, task::JoinError}; +use tracing::info; +use utils::project_git_version; + +project_git_version!(GIT_VERSION); + +/// Flattens `Result>` into `Result`. 
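+///
+/// Roughly: `Ok(Ok(()))` stays `Ok(())`, `Ok(Err(e))` becomes `Err(e)`, and a
+/// `JoinError` from the runtime is likewise turned into an `anyhow::Error`.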
async fn flatten_err( f: impl Future, JoinError>>, ) -> anyhow::Result<()> { @@ -35,44 +40,113 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - zenith_metrics::set_common_metrics_prefix("zenith_proxy"); - let arg_matches = App::new("Zenith proxy/router") + tracing_subscriber::fmt() + .with_ansi(atty::is(atty::Stream::Stdout)) + .with_target(false) + .init(); + + let arg_matches = cli().get_matches(); + + let tls_config = match ( + arg_matches.get_one::("tls-key"), + arg_matches.get_one::("tls-cert"), + ) { + (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), + (None, None) => None, + _ => bail!("either both or neither tls-key and tls-cert must be specified"), + }; + + let proxy_address: SocketAddr = arg_matches.get_one::("proxy").unwrap().parse()?; + let mgmt_address: SocketAddr = arg_matches.get_one::("mgmt").unwrap().parse()?; + let http_address: SocketAddr = arg_matches.get_one::("http").unwrap().parse()?; + + let auth_backend = match arg_matches + .get_one::("auth-backend") + .unwrap() + .as_str() + { + "console" => { + let url = arg_matches + .get_one::("auth-endpoint") + .unwrap() + .parse()?; + let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + auth::BackendType::Console(Cow::Owned(endpoint), ()) + } + "postgres" => { + let url = arg_matches + .get_one::("auth-endpoint") + .unwrap() + .parse()?; + auth::BackendType::Postgres(Cow::Owned(url), ()) + } + "link" => { + let url = arg_matches.get_one::("uri").unwrap().parse()?; + auth::BackendType::Link(Cow::Owned(url)) + } + other => bail!("unsupported auth backend: {other}"), + }; + + let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { + tls_config, + auth_backend, + })); + + info!("Version: {GIT_VERSION}"); + info!("Authentication backend: {}", config.auth_backend); + + // Check that we can bind to address before further initialization + info!("Starting http on {http_address}"); + let http_listener = TcpListener::bind(http_address).await?.into_std()?; + + info!("Starting mgmt on {mgmt_address}"); + let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; + + info!("Starting proxy on {proxy_address}"); + let proxy_listener = TcpListener::bind(proxy_address).await?; + + let tasks = [ + tokio::spawn(http::server::task_main(http_listener)), + tokio::spawn(proxy::task_main(config, proxy_listener)), + tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)), + ] + .map(flatten_err); + + set_build_info_metric(GIT_VERSION); + // This will block until all tasks have completed. + // Furthermore, the first one to fail will cancel the rest. 
+ let _: Vec<()> = futures::future::try_join_all(tasks).await?; + + Ok(()) +} + +fn cli() -> clap::Command { + clap::Command::new("Neon proxy/router") + .disable_help_flag(true) .version(GIT_VERSION) .arg( Arg::new("proxy") .short('p') .long("proxy") - .takes_value(true) .help("listen for incoming client connections on ip:port") .default_value("127.0.0.1:4432"), ) .arg( - Arg::new("auth-method") - .long("auth-method") - .takes_value(true) - .help("Possible values: password | link | mixed") - .default_value("mixed"), - ) - .arg( - Arg::new("static-router") - .short('s') - .long("static-router") - .takes_value(true) - .help("Route all clients to host:port"), + Arg::new("auth-backend") + .long("auth-backend") + .value_parser(["console", "postgres", "link"]) + .default_value("link"), ) .arg( Arg::new("mgmt") .short('m') .long("mgmt") - .takes_value(true) .help("listen for management callback connection on ip:port") .default_value("127.0.0.1:7000"), ) .arg( Arg::new("http") - .short('h') .long("http") - .takes_value(true) .help("listen for incoming http connections (metrics, etc) on ip:port") .default_value("127.0.0.1:7001"), ) @@ -80,87 +154,33 @@ async fn main() -> anyhow::Result<()> { Arg::new("uri") .short('u') .long("uri") - .takes_value(true) - .help("redirect unauthenticated users to given uri") + .help("redirect unauthenticated users to the given uri in case of link auth") .default_value("http://localhost:3000/psql_session/"), ) .arg( Arg::new("auth-endpoint") .short('a') .long("auth-endpoint") - .takes_value(true) - .help("API endpoint for authenticating users") + .help("cloud API endpoint for authenticating users") .default_value("http://localhost:3000/authenticate_proxy_request/"), ) .arg( - Arg::new("ssl-key") + Arg::new("tls-key") .short('k') - .long("ssl-key") - .takes_value(true) - .help("path to SSL key for client postgres connections"), + .long("tls-key") + .alias("ssl-key") // backwards compatibility + .help("path to TLS key for client postgres connections"), ) .arg( - Arg::new("ssl-cert") + Arg::new("tls-cert") .short('c') - .long("ssl-cert") - .takes_value(true) - .help("path to SSL cert for client postgres connections"), + .long("tls-cert") + .alias("ssl-cert") // backwards compatibility + .help("path to TLS cert for client postgres connections"), ) - .get_matches(); - - let tls_config = match ( - arg_matches.value_of("ssl-key"), - arg_matches.value_of("ssl-cert"), - ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_ssl(key_path, cert_path)?), - (None, None) => None, - _ => bail!("either both or neither ssl-key and ssl-cert must be specified"), - }; - - let auth_method = arg_matches.value_of("auth-method").unwrap().parse()?; - let router_config = match arg_matches.value_of("static-router") { - None => RouterConfig::Dynamic(auth_method), - Some(addr) => { - if let ClientAuthMethod::Password = auth_method { - let (host, port) = addr.split_once(":").unwrap(); - RouterConfig::Static { - host: host.to_string(), - port: port.parse().unwrap(), - } - } else { - bail!("static-router requires --auth-method password") - } - } - }; - - let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig { - router_config, - proxy_address: arg_matches.value_of("proxy").unwrap().parse()?, - mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?, - http_address: arg_matches.value_of("http").unwrap().parse()?, - redirect_uri: arg_matches.value_of("uri").unwrap().parse()?, - auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?, - tls_config, - })); - - 
println!("Version: {}", GIT_VERSION); - - // Check that we can bind to address before further initialization - println!("Starting http on {}", config.http_address); - let http_listener = TcpListener::bind(config.http_address).await?.into_std()?; - - println!("Starting mgmt on {}", config.mgmt_address); - let mgmt_listener = TcpListener::bind(config.mgmt_address).await?.into_std()?; - - println!("Starting proxy on {}", config.proxy_address); - let proxy_listener = TcpListener::bind(config.proxy_address).await?; - - let http = tokio::spawn(http::thread_main(http_listener)); - let proxy = tokio::spawn(proxy::thread_main(config, proxy_listener)); - let mgmt = tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)); - - let tasks = [flatten_err(http), flatten_err(proxy), flatten_err(mgmt)]; - let _: Vec<()> = futures::future::try_join_all(tasks).await?; - - Ok(()) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); } diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 55b49b441f..06d1a4f106 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,14 +1,15 @@ -use crate::{compute::DatabaseInfo, cplane_api}; +use crate::auth; use anyhow::Context; +use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use serde::Deserialize; use std::{ net::{TcpListener, TcpStream}, thread, }; -use zenith_utils::{ - postgres_backend::{self, AuthType, PostgresBackend}, - pq_proto::{BeMessage, SINGLE_COL_ROWDESC}, -}; +use tracing::{error, info}; +use utils::postgres_backend::{self, AuthType, PostgresBackend}; + +/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend /// /// Main proxy listener loop. @@ -17,7 +18,7 @@ use zenith_utils::{ /// pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { scopeguard::defer! { - println!("mgmt has shut down"); + info!("mgmt has shut down"); } listener @@ -25,14 +26,14 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { .context("failed to set listener to blocking")?; loop { let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; - println!("accepted connection from {}", peer_addr); + info!("accepted connection from {peer_addr}"); socket .set_nodelay(true) .context("failed to set client socket option")?; thread::spawn(move || { if let Err(err) = handle_connection(socket) { - println!("error: {}", err); + error!("{err}"); } }); } @@ -75,10 +76,22 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(DatabaseInfo), + Success(auth::DatabaseInfo), Failure(String), } +/// A message received by `mgmt` when a compute node is ready. 
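+/// `Ok` carries the database connection info supplied by the console,
+/// while `Err` carries a failure message to be reported back to the waiting session.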
+pub type ComputeReady = Result; + +impl PsqlSessionResult { + fn into_compute_ready(self) -> ComputeReady { + match self { + Self::Success(db_info) => Ok(db_info), + Self::Failure(message) => Err(message), + } + } +} + impl postgres_backend::Handler for MgmtHandler { fn process_query( &mut self, @@ -88,24 +101,18 @@ impl postgres_backend::Handler for MgmtHandler { let res = try_process_query(pgb, query_string); // intercept and log error message if res.is_err() { - println!("Mgmt query failed: #{:?}", res); + error!("mgmt query failed: {res:?}"); } res } } fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> { - println!("Got mgmt query: '{}'", query_string); + info!("got mgmt query [redacted]"); // Content contains password, don't print it let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - use PsqlSessionResult::*; - let msg = match resp.result { - Success(db_info) => Ok(db_info), - Failure(message) => Err(message), - }; - - match cplane_api::notify(&resp.session_id, msg) { + match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs new file mode 100644 index 0000000000..cbd48d91e9 --- /dev/null +++ b/proxy/src/parse.rs @@ -0,0 +1,44 @@ +//! Small parsing helpers. + +use std::ffi::CStr; + +pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { + let pos = bytes.iter().position(|&x| x == 0)?; + let (cstr, other) = bytes.split_at(pos + 1); + // SAFETY: we've already checked that there's a terminator + Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other)) +} + +/// See . +pub fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { + (bytes.len() >= N).then(|| { + let (head, tail) = bytes.split_at(N); + (head.try_into().unwrap(), tail) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_split_cstr() { + assert!(split_cstr(b"").is_none()); + assert!(split_cstr(b"foo").is_none()); + + let (cstr, rest) = split_cstr(b"\0").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b""); + assert_eq!(rest, b""); + + let (cstr, rest) = split_cstr(b"foo\0bar").expect("uh-oh"); + assert_eq!(cstr.to_bytes(), b"foo"); + assert_eq!(rest, b"bar"); + } + + #[test] + fn test_split_at_const() { + assert!(split_at_const::<0>(b"").is_some()); + assert!(split_at_const::<1>(b"").is_none()); + assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1dc301b792..9257fcd650 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,73 +1,93 @@ use crate::auth; -use crate::cancellation::{self, CancelClosure, CancelMap}; -use crate::compute::DatabaseInfo; +use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; -use crate::stream::{MetricsStream, PqStream, Stream}; +use crate::stream::{MeasuredStream, PqStream, Stream}; use anyhow::{bail, Context}; -use lazy_static::lazy_static; +use futures::TryFutureExt; +use metrics::{register_int_counter, IntCounter}; +use once_cell::sync::Lazy; +use pq_proto::{BeMessage as Be, *}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio_postgres::NoTls; -use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; -use zenith_utils::pq_proto::{BeMessage as Be, *}; +use tracing::{error, info, info_span, Instrument}; 
-lazy_static! { - static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_accepted"), +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; +const ERR_PROTO_VIOLATION: &str = "protocol violation"; + +static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_accepted_connections_total", "Number of TCP client connections accepted." ) - .unwrap(); - static ref NUM_CONNECTIONS_CLOSED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_connections_closed"), + .unwrap() +}); + +static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_closed_connections_total", "Number of TCP client connections closed." ) - .unwrap(); - static ref NUM_BYTES_PROXIED_COUNTER: IntCounter = register_int_counter!( - new_common_metric_name("num_bytes_proxied"), + .unwrap() +}); + +static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter!( + "proxy_io_bytes_total", "Number of bytes sent/received between any client and backend." ) - .unwrap(); -} + .unwrap() +}); +/// A small combinator for pluggable error logging. async fn log_error(future: F) -> F::Output where F: std::future::Future>, { future.await.map_err(|err| { - println!("error: {}", err); + error!("{err}"); err }) } -pub async fn thread_main( +pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, ) -> anyhow::Result<()> { scopeguard::defer! { - println!("proxy has shut down"); + info!("proxy has shut down"); } + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. + socket2::SockRef::from(&listener).set_keepalive(true)?; + let cancel_map = Arc::new(CancelMap::default()); loop { let (socket, peer_addr) = listener.accept().await?; - println!("accepted connection from {}", peer_addr); + info!("accepted postgres client connection from {peer_addr}"); + let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); - tokio::spawn(log_error(async move { - socket - .set_nodelay(true) - .context("failed to set socket option")?; + tokio::spawn( + log_error(async move { + info!("spawned a task for {peer_addr}"); - handle_client(config, &cancel_map, socket).await - })); + socket + .set_nodelay(true) + .context("failed to set socket option")?; + + handle_client(config, &cancel_map, session_id, socket).await + }) + .instrument(info_span!("client", session = format_args!("{session_id}"))), + ); } } async fn handle_client( config: &ProxyConfig, cancel_map: &CancelMap, - stream: impl AsyncRead + AsyncWrite + Unpin, + session_id: uuid::Uuid, + stream: impl AsyncRead + AsyncWrite + Unpin + Send, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); @@ -75,33 +95,48 @@ async fn handle_client( NUM_CONNECTIONS_CLOSED_COUNTER.inc(); } - let tls = config.tls_config.clone(); - if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? { - cancel_map - .with_session(|session| async { - connect_client_to_db(config, session, client, creds).await - }) - .await?; - } + let tls = config.tls_config.as_ref(); + let do_handshake = handshake(stream, tls, cancel_map).instrument(info_span!("handshake")); + let (mut stream, params) = match do_handshake.await? 
{ + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; - Ok(()) + // Extract credentials which we're going to use for auth. + let creds = { + let sni = stream.get_ref().sni_hostname(); + let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) + .transpose(); + + async { result }.or_else(|e| stream.throw_error(e)).await? + }; + + let client = Client::new(stream, creds, ¶ms, session_id); + cancel_map + .with_session(|session| client.connect_to_db(session)) + .await } -/// Handle a connection from one client. -/// For better testing experience, `stream` can be -/// any object satisfying the traits. +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, - mut tls: Option, + mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, -) -> anyhow::Result>, auth::ClientCredentials)>> { +) -> anyhow::Result>, StartupMessageParams)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; - println!("got message: {:?}", msg); + info!("received {msg:?}"); use FeStartupPacket::*; match msg { @@ -112,14 +147,15 @@ async fn handshake( // We can't perform TLS handshake without a config let enc = tls.is_some(); stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - stream = PqStream::new(stream.into_inner().upgrade(tls).await?); + stream = PqStream::new( + stream.into_inner().upgrade(tls.to_server_config()).await?, + ); } } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { @@ -128,111 +164,131 @@ async fn handshake( // Currently, we don't support GSSAPI stream.write_message(&Be::EncryptionResponse(false)).await?; } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, StartupMessage { params, .. } => { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). 
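                // (`tls` is `take()`n during the SslRequest upgrade above, so a
                // remaining `Some` means the connection was never upgraded to TLS.)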
if tls.is_some() { - let msg = "connection is insecure (try using `sslmode=require`)"; - stream.write_message(&Be::ErrorResponse(msg)).await?; - bail!(msg); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - break Ok(Some((stream, params.try_into()?))); + info!(session_type = "normal", "successful handshake"); + break Ok(Some((stream, params))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; + info!(session_type = "cancellation", "successful handshake"); break Ok(None); } } } } -async fn connect_client_to_db( - config: &ProxyConfig, - session: cancellation::Session<'_>, - mut client: PqStream, - creds: auth::ClientCredentials, -) -> anyhow::Result<()> { - let db_info = creds.authenticate(config, &mut client).await?; - let (db, version, cancel_closure) = connect_to_db(db_info).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); - - client - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), - ))? - .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - // This function will be called for writes to either direction. - fn inc_proxied(cnt: usize) { - // Consider inventing something more sophisticated - // if this ever becomes a bottleneck (cacheline bouncing). - NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); - } - - let mut db = MetricsStream::new(db, inc_proxied); - let mut client = MetricsStream::new(client.into_inner(), inc_proxied); - let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; - - Ok(()) +/// Thin connection context. +struct Client<'a, S> { + /// The underlying libpq protocol stream. + stream: PqStream, + /// Client credentials that we care about. + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, + /// KV-dictionary with PostgreSQL connection params. + params: &'a StartupMessageParams, + /// Unique connection ID. + session_id: uuid::Uuid, } -/// Connect to a corresponding compute node. -async fn connect_to_db( - db_info: DatabaseInfo, -) -> anyhow::Result<(TcpStream, String, CancelClosure)> { - // TODO: establish a secure connection to the DB - let socket_addr = db_info.socket_addr()?; - let mut socket = TcpStream::connect(socket_addr).await?; +impl<'a, S> Client<'a, S> { + /// Construct a new connection context. + fn new( + stream: PqStream, + creds: auth::BackendType<'a, auth::ClientCredentials<'a>>, + params: &'a StartupMessageParams, + session_id: uuid::Uuid, + ) -> Self { + Self { + stream, + creds, + params, + session_id, + } + } +} - let (client, conn) = tokio_postgres::Config::from(db_info) - .connect_raw(&mut socket, NoTls) - .await?; +impl Client<'_, S> { + /// Let the client authenticate and connect to the designated compute node. + async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { + let Self { + mut stream, + creds, + params, + session_id, + } = self; - let version = conn - .parameter("server_version") - .context("failed to fetch postgres server version")? - .into(); + let extra = auth::ConsoleReqExtra { + session_id, // aka this connection's id + application_name: params.get("application_name"), + }; - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + // Authenticate and connect to a compute node. 
+ let auth = creds + .authenticate(&extra, &mut stream) + .instrument(info_span!("auth")) + .await; - Ok((socket, version, cancel_closure)) + let node = async { auth }.or_else(|e| stream.throw_error(e)).await?; + let reported_auth_ok = node.reported_auth_ok; + + let (db, cancel_closure) = node + .connect(params) + .or_else(|e| stream.throw_error(e)) + .await?; + + let cancel_key_data = session.enable_query_cancellation(cancel_closure); + + // Report authentication success if we haven't done this already. + if !reported_auth_ok { + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + } + + stream + .write_message_noflush(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion(&db.version), + ))? + .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? + .write_message(&BeMessage::ReadyForQuery) + .await?; + + /// This function will be called for writes to either direction. + fn inc_proxied(cnt: usize) { + // Consider inventing something more sophisticated + // if this ever becomes a bottleneck (cacheline bouncing). + NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); + } + + // Starting from here we only proxy the client's traffic. + info!("performing the proxy pass..."); + let mut db = MeasuredStream::new(db.stream, inc_proxied); + let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied); + let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; + + Ok(()) + } } #[cfg(test)] mod tests { use super::*; - - use tokio::io::DuplexStream; + use crate::{auth, scram}; + use async_trait::async_trait; + use rstest::rstest; use tokio_postgres::config::SslMode; - use tokio_postgres::tls::MakeTlsConnect; + use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; - async fn dummy_proxy( - client: impl AsyncRead + AsyncWrite + Unpin, - tls: Option, - ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); - - // TODO: add some infra + tests for credentials - let (mut stream, _creds) = handshake(client, tls, &cancel_map) - .await? - .context("no stream")?; - - stream - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - Ok(()) - } - + /// Generate a set of TLS certificates: CA + server. fn generate_certs( hostname: &str, ) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { @@ -250,21 +306,133 @@ mod tests { )) } + struct ClientConfig<'a> { + config: rustls::ClientConfig, + hostname: &'a str, + } + + impl ClientConfig<'_> { + fn make_tls_connect( + self, + ) -> anyhow::Result> { + let mut mk = MakeRustlsConnect::new(self.config); + let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; + Ok(tls) + } + } + + /// Generate TLS certificates and build rustls configs for client and server. + fn generate_tls_config<'a>( + hostname: &'a str, + common_name: &'a str, + ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> { + let (ca, cert, key) = generate_certs(hostname)?; + + let tls_config = { + let config = rustls::ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth() + .with_single_cert(vec![cert], key)? 
+ .into(); + + TlsConfig { + config, + common_name: Some(common_name.to_string()), + } + }; + + let client_config = { + let config = rustls::ClientConfig::builder() + .with_safe_defaults() + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(&ca)?; + store + }) + .with_no_client_auth(); + + ClientConfig { config, hostname } + }; + + Ok((client_config, tls_config)) + } + + #[async_trait] + trait TestAuth: Sized { + async fn authenticate( + self, + _stream: &mut PqStream>, + ) -> anyhow::Result<()> { + Ok(()) + } + } + + struct NoAuth; + impl TestAuth for NoAuth {} + + struct Scram(scram::ServerSecret); + + impl Scram { + fn new(password: &str) -> anyhow::Result { + let salt = rand::random::<[u8; 16]>(); + let secret = scram::ServerSecret::build(password, &salt, 256) + .context("failed to generate scram secret")?; + Ok(Scram(secret)) + } + + fn mock(user: &str) -> Self { + let salt = rand::random::<[u8; 32]>(); + Scram(scram::ServerSecret::mock(user, &salt)) + } + } + + #[async_trait] + impl TestAuth for Scram { + async fn authenticate( + self, + stream: &mut PqStream>, + ) -> anyhow::Result<()> { + auth::AuthFlow::new(stream) + .begin(auth::Scram(&self.0)) + .await? + .authenticate() + .await?; + + Ok(()) + } + } + + /// A dummy proxy impl which performs a handshake and reports auth success. + async fn dummy_proxy( + client: impl AsyncRead + AsyncWrite + Unpin + Send, + tls: Option, + auth: impl TestAuth + Send, + ) -> anyhow::Result<()> { + let cancel_map = CancelMap::default(); + let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) + .await? + .context("handshake failed")?; + + auth.authenticate(&mut stream).await?; + + stream + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::ReadyForQuery) + .await?; + + Ok(()) + } + #[tokio::test] async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let server_config = { - let (_ca, cert, key) = generate_certs("localhost")?; + let (_, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - config.set_single_cert(vec![cert], key)?; - config - }; - - let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - - tokio_postgres::Config::new() + let client_err = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -273,11 +441,15 @@ mod tests { .err() // -> Option .context("client shouldn't be able to connect")?; - proxy + assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); + + let server_err = proxy .await? 
.err() // -> Option .context("server shouldn't accept client")?; + assert!(client_err.to_string().contains(&server_err.to_string())); + Ok(()) } @@ -285,30 +457,15 @@ mod tests { async fn handshake_tls() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (ca, cert, key) = generate_certs("localhost")?; - - let server_config = { - let mut config = rustls::ServerConfig::new(rustls::NoClientAuth::new()); - config.set_single_cert(vec![cert], key)?; - config - }; - - let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - - let client_config = { - let mut config = rustls::ClientConfig::new(); - config.root_store.add(&ca)?; - config - }; - - let mut mk = MakeRustlsConnect::new(client_config); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk, "localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) - .connect_raw(server, tls) + .connect_raw(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -318,15 +475,100 @@ mod tests { async fn handshake_raw() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let proxy = tokio::spawn(dummy_proxy(client, None)); + let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") + .options("project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; proxy.await? } + + #[tokio::test] + async fn keepalive_is_inherited() -> anyhow::Result<()> { + use tokio::net::{TcpListener, TcpStream}; + + let listener = TcpListener::bind("127.0.0.1:0").await?; + let port = listener.local_addr()?.port(); + socket2::SockRef::from(&listener).set_keepalive(true)?; + + let t = tokio::spawn(async move { + let (client, _) = listener.accept().await?; + let keepalive = socket2::SockRef::from(&client).keepalive()?; + anyhow::Ok(keepalive) + }); + + let _ = TcpStream::connect(("127.0.0.1", port)).await?; + assert!(t.await??, "keepalive should be inherited"); + + Ok(()) + } + + #[rstest] + #[case("password_foo")] + #[case("pwd-bar")] + #[case("")] + #[tokio::test] + async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::new(password)?, + )); + + let (_client, _conn) = tokio_postgres::Config::new() + .user("user") + .dbname("db") + .password(password) + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await?; + + proxy.await? 
+ } + + #[tokio::test] + async fn scram_auth_mock() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; + let proxy = tokio::spawn(dummy_proxy( + client, + Some(server_config), + Scram::mock("user"), + )); + + use rand::{distributions::Alphanumeric, Rng}; + let password: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(rand::random::() as usize) + .map(char::from) + .collect(); + + let _client_err = tokio_postgres::Config::new() + .user("user") + .dbname("db") + .password(&password) // no password will match the mocked secret + .ssl_mode(SslMode::Require) + .connect_raw(server, client_config.make_tls_connect()?) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + let _server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + Ok(()) + } } diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs new file mode 100644 index 0000000000..689fca6049 --- /dev/null +++ b/proxy/src/sasl.rs @@ -0,0 +1,73 @@ +//! Simple Authentication and Security Layer. +//! +//! RFC: . +//! +//! Reference implementation: +//! * +//! * + +mod channel_binding; +mod messages; +mod stream; + +use crate::error::UserFacingError; +use std::io; +use thiserror::Error; + +pub use channel_binding::ChannelBinding; +pub use messages::FirstMessage; +pub use stream::SaslStream; + +/// Fine-grained auth errors help in writing tests. +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed to authenticate client: {0}")] + AuthenticationFailed(&'static str), + + #[error("Channel binding failed: {0}")] + ChannelBindingFailed(&'static str), + + #[error("Unsupported channel binding method: {0}")] + ChannelBindingBadMethod(Box), + + #[error("Bad client message")] + BadClientMessage, + + #[error(transparent)] + Io(#[from] io::Error), +} + +impl UserFacingError for Error { + fn to_string_client(&self) -> String { + use Error::*; + match self { + // This constructor contains the reason why auth has failed. + AuthenticationFailed(s) => s.to_string(), + // TODO: add support for channel binding + ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), + ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + _ => "authentication protocol violation".to_string(), + } + } +} + +/// A convenient result type for SASL exchange. +pub type Result = std::result::Result; + +/// A result of one SASL exchange. +pub enum Step { + /// We should continue exchanging messages. + Continue(T), + /// The client has been authenticated successfully. + Authenticated(R), +} + +/// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. +pub trait Mechanism: Sized { + /// What's produced as a result of successful authentication. + type Output; + + /// Produce a server challenge to be sent to the client. + /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). + fn exchange(self, input: &str) -> Result<(Step, String)>; +} diff --git a/proxy/src/sasl/channel_binding.rs b/proxy/src/sasl/channel_binding.rs new file mode 100644 index 0000000000..776adabe55 --- /dev/null +++ b/proxy/src/sasl/channel_binding.rs @@ -0,0 +1,85 @@ +//! Definition and parser for channel binding flag (a part of the `GS2` header). + +/// Channel binding flag (possibly with params). 
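+/// In the GS2 header these correspond to the `n`, `y`, and `p=<cb-name>` flags
+/// respectively (see `parse` below).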
+#[derive(Debug, PartialEq, Eq)] +pub enum ChannelBinding { + /// Client doesn't support channel binding. + NotSupportedClient, + /// Client thinks server doesn't support channel binding. + NotSupportedServer, + /// Client wants to use this type of channel binding. + Required(T), +} + +impl ChannelBinding { + pub fn and_then(self, f: impl FnOnce(T) -> Result) -> Result, E> { + use ChannelBinding::*; + Ok(match self { + NotSupportedClient => NotSupportedClient, + NotSupportedServer => NotSupportedServer, + Required(x) => Required(f(x)?), + }) + } +} + +impl<'a> ChannelBinding<&'a str> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + use ChannelBinding::*; + Some(match input { + "n" => NotSupportedClient, + "y" => NotSupportedServer, + other => Required(other.strip_prefix("p=")?), + }) + } +} + +impl ChannelBinding { + /// Encode channel binding data as base64 for subsequent checks. + pub fn encode( + &self, + get_cbind_data: impl FnOnce(&T) -> Result, + ) -> Result, E> { + use ChannelBinding::*; + Ok(match self { + NotSupportedClient => { + // base64::encode("n,,") + "biws".into() + } + NotSupportedServer => { + // base64::encode("y,,") + "eSws".into() + } + Required(mode) => { + let msg = format!( + "p={mode},,{data}", + mode = mode, + data = get_cbind_data(mode)? + ); + base64::encode(msg).into() + } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn channel_binding_encode() -> anyhow::Result<()> { + use ChannelBinding::*; + + let cases = [ + (NotSupportedClient, base64::encode("n,,")), + (NotSupportedServer, base64::encode("y,,")), + (Required("foo"), base64::encode("p=foo,,bar")), + ]; + + for (cb, input) in cases { + assert_eq!(cb.encode(|_| anyhow::Ok("bar".to_owned()))?, input); + } + + Ok(()) + } +} diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs new file mode 100644 index 0000000000..fb3833c8b6 --- /dev/null +++ b/proxy/src/sasl/messages.rs @@ -0,0 +1,68 @@ +//! Definitions for SASL messages. + +use crate::parse::{split_at_const, split_cstr}; +use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; + +/// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). +#[derive(Debug)] +pub struct FirstMessage<'a> { + /// Authentication method, e.g. `"SCRAM-SHA-256"`. + pub method: &'a str, + /// Initial client message. + pub message: &'a str, +} + +impl<'a> FirstMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(bytes: &'a [u8]) -> Option { + let (method_cstr, tail) = split_cstr(bytes)?; + let method = method_cstr.to_str().ok()?; + + let (len_bytes, bytes) = split_at_const(tail)?; + let len = u32::from_be_bytes(*len_bytes) as usize; + if len != bytes.len() { + return None; + } + + let message = std::str::from_utf8(bytes).ok()?; + Some(Self { method, message }) + } +} + +/// A single SASL message. +/// This struct is deliberately decoupled from lower-level +/// [`BeAuthenticationSaslMessage`](pq_proto::BeAuthenticationSaslMessage). +#[derive(Debug)] +pub(super) enum ServerMessage { + /// We expect to see more steps. + Continue(T), + /// This is the final step. 
+ Final(T), +} + +impl<'a> ServerMessage<&'a str> { + pub(super) fn to_reply(&self) -> BeMessage<'a> { + use BeAuthenticationSaslMessage::*; + BeMessage::AuthenticationSasl(match self { + ServerMessage::Continue(s) => Continue(s.as_bytes()), + ServerMessage::Final(s) => Final(s.as_bytes()), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_sasl_first_message() { + let proto = "SCRAM-SHA-256"; + let sasl = "n,,n=,r=KHQ2Gjc7NptyB8aov5/TnUy4"; + let sasl_len = (sasl.len() as u32).to_be_bytes(); + let bytes = [proto.as_bytes(), &[0], sasl_len.as_ref(), sasl.as_bytes()].concat(); + + let password = FirstMessage::parse(&bytes).unwrap(); + assert_eq!(password.method, proto); + assert_eq!(password.message, sasl); + } +} diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs new file mode 100644 index 0000000000..0e782c5f29 --- /dev/null +++ b/proxy/src/sasl/stream.rs @@ -0,0 +1,75 @@ +//! Abstraction for the string-oriented SASL protocols. + +use super::{messages::ServerMessage, Mechanism}; +use crate::stream::PqStream; +use std::io; +use tokio::io::{AsyncRead, AsyncWrite}; + +/// Abstracts away all peculiarities of the libpq's protocol. +pub struct SaslStream<'a, S> { + /// The underlying stream. + stream: &'a mut PqStream, + /// Current password message we received from client. + current: bytes::Bytes, + /// First SASL message produced by client. + first: Option<&'a str>, +} + +impl<'a, S> SaslStream<'a, S> { + pub fn new(stream: &'a mut PqStream, first: &'a str) -> Self { + Self { + stream, + current: bytes::Bytes::new(), + first: Some(first), + } + } +} + +impl SaslStream<'_, S> { + // Receive a new SASL message from the client. + async fn recv(&mut self) -> io::Result<&str> { + if let Some(first) = self.first.take() { + return Ok(first); + } + + self.current = self.stream.read_password_message().await?; + let s = std::str::from_utf8(&self.current) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidData, "bad encoding"))?; + + Ok(s) + } +} + +impl SaslStream<'_, S> { + // Send a SASL message to the client. + async fn send(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message(&msg.to_reply()).await?; + Ok(()) + } +} + +impl SaslStream<'_, S> { + /// Perform SASL message exchange according to the underlying algorithm + /// until user is either authenticated or denied access. + pub async fn authenticate( + mut self, + mut mechanism: M, + ) -> super::Result { + loop { + let input = self.recv().await?; + let (moved, reply) = mechanism.exchange(input)?; + + use super::Step::*; + match moved { + Continue(moved) => { + self.send(&ServerMessage::Continue(&reply)).await?; + mechanism = moved; + } + Authenticated(result) => { + self.send(&ServerMessage::Final(&reply)).await?; + return Ok(result); + } + } + } + } +} diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs new file mode 100644 index 0000000000..7cc4191435 --- /dev/null +++ b/proxy/src/scram.rs @@ -0,0 +1,61 @@ +//! Salted Challenge Response Authentication Mechanism. +//! +//! RFC: . +//! +//! Reference implementation: +//! * +//! * + +mod exchange; +mod key; +mod messages; +mod secret; +mod signature; + +#[cfg(test)] +mod password; + +pub use exchange::Exchange; +pub use key::ScramKey; +pub use secret::ServerSecret; +pub use secret::*; + +use hmac::{Hmac, Mac}; +use sha2::{Digest, Sha256}; + +// TODO: add SCRAM-SHA-256-PLUS +/// A list of supported SCRAM methods. 
+pub const METHODS: &[&str] = &["SCRAM-SHA-256"];
+
+/// Decode base64 into array without any heap allocations
+fn base64_decode_array<const N: usize>(input: impl AsRef<[u8]>) -> Option<[u8; N]> {
+    let mut bytes = [0u8; N];
+
+    let size = base64::decode_config_slice(input, base64::STANDARD, &mut bytes).ok()?;
+    if size != N {
+        return None;
+    }
+
+    Some(bytes)
+}
+
+/// This function essentially is `Hmac(sha256, key, input)`.
+/// Further reading: .
+fn hmac_sha256<'a>(key: &[u8], parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
+    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("bad key size");
+    parts.into_iter().for_each(|s| mac.update(s));
+
+    // TODO: maybe newer `hmac` et al already migrated to regular arrays?
+    let mut result = [0u8; 32];
+    result.copy_from_slice(mac.finalize().into_bytes().as_slice());
+    result
+}
+
+fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
+    let mut hasher = Sha256::new();
+    parts.into_iter().for_each(|s| hasher.update(s));
+
+    let mut result = [0u8; 32];
+    result.copy_from_slice(hasher.finalize().as_slice());
+    result
+}
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
new file mode 100644
index 0000000000..fca5585b25
--- /dev/null
+++ b/proxy/src/scram/exchange.rs
@@ -0,0 +1,135 @@
+//! Implementation of the SCRAM authentication algorithm.
+
+use super::messages::{
+    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
+};
+use super::secret::ServerSecret;
+use super::signature::SignatureBuilder;
+use crate::sasl::{self, ChannelBinding, Error as SaslError};
+
+/// The only channel binding mode we currently support.
+struct TlsServerEndPoint;
+
+impl std::fmt::Display for TlsServerEndPoint {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "tls-server-end-point")
+    }
+}
+
+impl std::str::FromStr for TlsServerEndPoint {
+    type Err = sasl::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "tls-server-end-point" => Ok(TlsServerEndPoint),
+            _ => Err(sasl::Error::ChannelBindingBadMethod(s.into())),
+        }
+    }
+}
+
+enum ExchangeState {
+    /// Waiting for [`ClientFirstMessage`].
+    Initial,
+    /// Waiting for [`ClientFinalMessage`].
+    SaltSent {
+        cbind_flag: ChannelBinding<TlsServerEndPoint>,
+        client_first_message_bare: String,
+        server_first_message: OwnedServerFirstMessage,
+    },
+}
+
+/// Server's side of SCRAM auth algorithm.
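+/// The flow is roughly: receive `client-first-message`, reply with
+/// `server-first-message` (salt, iteration count, combined nonce), then receive
+/// `client-final-message`, verify the channel binding and client proof, and
+/// finish with `server-final-message`.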
+pub struct Exchange<'a> { + state: ExchangeState, + secret: &'a ServerSecret, + nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], + cert_digest: Option<&'a [u8]>, +} + +impl<'a> Exchange<'a> { + pub fn new( + secret: &'a ServerSecret, + nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN], + cert_digest: Option<&'a [u8]>, + ) -> Self { + Self { + state: ExchangeState::Initial, + secret, + nonce, + cert_digest, + } + } +} + +impl sasl::Mechanism for Exchange<'_> { + type Output = super::ScramKey; + + fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step, String)> { + use {sasl::Step::*, ExchangeState::*}; + match &self.state { + Initial => { + let client_first_message = + ClientFirstMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + + let server_first_message = client_first_message.build_server_first_message( + &(self.nonce)(), + &self.secret.salt_base64, + self.secret.iterations, + ); + let msg = server_first_message.as_str().to_owned(); + + self.state = SaltSent { + cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?, + client_first_message_bare: client_first_message.bare.to_owned(), + server_first_message, + }; + + Ok((Continue(self), msg)) + } + SaltSent { + cbind_flag, + client_first_message_bare, + server_first_message, + } => { + let client_final_message = + ClientFinalMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + + let channel_binding = cbind_flag.encode(|_| { + self.cert_digest + .map(base64::encode) + .ok_or(SaslError::ChannelBindingFailed("no cert digest provided")) + })?; + + // This might've been caused by a MITM attack + if client_final_message.channel_binding != channel_binding { + return Err(SaslError::ChannelBindingFailed("data mismatch")); + } + + if client_final_message.nonce != server_first_message.nonce() { + return Err(SaslError::AuthenticationFailed( + "combined nonce doesn't match", + )); + } + + let signature_builder = SignatureBuilder { + client_first_message_bare, + server_first_message: server_first_message.as_str(), + client_final_message_without_proof: client_final_message.without_proof, + }; + + let client_key = signature_builder + .build(&self.secret.stored_key) + .derive_client_key(&client_final_message.proof); + + if client_key.sha256() != self.secret.stored_key { + return Err(SaslError::AuthenticationFailed("password doesn't match")); + } + + let msg = client_final_message + .build_server_final_message(signature_builder, &self.secret.server_key); + + Ok((Authenticated(client_key), msg)) + } + } + } +} diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs new file mode 100644 index 0000000000..e9c65fcef3 --- /dev/null +++ b/proxy/src/scram/key.rs @@ -0,0 +1,37 @@ +//! Tools for client/server/stored key management. + +/// Faithfully taken from PostgreSQL. +pub const SCRAM_KEY_LEN: usize = 32; + +/// One of the keys derived from the [password](super::password::SaltedPassword). +/// We use the same structure for all keys, i.e. +/// `ClientKey`, `StoredKey`, and `ServerKey`. 
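+/// (For instance, the `StoredKey` held in a `ServerSecret` is simply the
+/// SHA-256 hash of the corresponding `ClientKey`.)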
+#[derive(Default, PartialEq, Eq)] +#[repr(transparent)] +pub struct ScramKey { + bytes: [u8; SCRAM_KEY_LEN], +} + +impl ScramKey { + pub fn sha256(&self) -> Self { + super::sha256([self.as_ref()]).into() + } + + pub fn as_bytes(&self) -> [u8; SCRAM_KEY_LEN] { + self.bytes + } +} + +impl From<[u8; SCRAM_KEY_LEN]> for ScramKey { + #[inline(always)] + fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { + Self { bytes } + } +} + +impl AsRef<[u8]> for ScramKey { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + &self.bytes + } +} diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs new file mode 100644 index 0000000000..05855e74df --- /dev/null +++ b/proxy/src/scram/messages.rs @@ -0,0 +1,232 @@ +//! Definitions for SCRAM messages. + +use super::base64_decode_array; +use super::key::{ScramKey, SCRAM_KEY_LEN}; +use super::signature::SignatureBuilder; +use crate::sasl::ChannelBinding; +use std::fmt; +use std::ops::Range; + +/// Faithfully taken from PostgreSQL. +pub const SCRAM_RAW_NONCE_LEN: usize = 18; + +/// Although we ignore all extensions, we still have to validate the message. +fn validate_sasl_extensions<'a>(parts: impl Iterator) -> Option<()> { + for mut chars in parts.map(|s| s.chars()) { + let attr = chars.next()?; + if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) { + return None; + } + let eq = chars.next()?; + if eq != '=' { + return None; + } + } + + Some(()) +} + +#[derive(Debug)] +pub struct ClientFirstMessage<'a> { + /// `client-first-message-bare`. + pub bare: &'a str, + /// Channel binding mode. + pub cbind_flag: ChannelBinding<&'a str>, + /// (Client username)[]. + pub username: &'a str, + /// Client nonce. + pub nonce: &'a str, +} + +impl<'a> ClientFirstMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + let mut parts = input.split(','); + + let cbind_flag = ChannelBinding::parse(parts.next()?)?; + + // PG doesn't support authorization identity, + // so we don't bother defining GS2 header type + let authzid = parts.next()?; + if !authzid.is_empty() { + return None; + } + + // Unfortunately, `parts.as_str()` is unstable + let pos = authzid.as_ptr() as usize - input.as_ptr() as usize + 1; + let (_, bare) = input.split_at(pos); + + // In theory, these might be preceded by "reserved-mext" (i.e. "m=") + let username = parts.next()?.strip_prefix("n=")?; + let nonce = parts.next()?.strip_prefix("r=")?; + + // Validate but ignore auth extensions + validate_sasl_extensions(parts)?; + + Some(Self { + bare, + cbind_flag, + username, + nonce, + }) + } + + /// Build a response to [`ClientFirstMessage`]. + pub fn build_server_first_message( + &self, + nonce: &[u8; SCRAM_RAW_NONCE_LEN], + salt_base64: &str, + iterations: u32, + ) -> OwnedServerFirstMessage { + use std::fmt::Write; + + let mut message = String::new(); + write!(&mut message, "r={}", self.nonce).unwrap(); + base64::encode_config_buf(nonce, base64::STANDARD, &mut message); + let combined_nonce = 2..message.len(); + write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap(); + + // This design guarantees that it's impossible to create a + // server-first-message without receiving a client-first-message + OwnedServerFirstMessage { + message, + nonce: combined_nonce, + } + } +} + +#[derive(Debug)] +pub struct ClientFinalMessage<'a> { + /// `client-final-message-without-proof`. + pub without_proof: &'a str, + /// Channel binding data (base64). + pub channel_binding: &'a str, + /// Combined client & server nonce. 
+ pub nonce: &'a str, + /// Client auth proof. + pub proof: [u8; SCRAM_KEY_LEN], +} + +impl<'a> ClientFinalMessage<'a> { + // NB: FromStr doesn't work with lifetimes + pub fn parse(input: &'a str) -> Option { + let (without_proof, proof) = input.rsplit_once(',')?; + + let mut parts = without_proof.split(','); + let channel_binding = parts.next()?.strip_prefix("c=")?; + let nonce = parts.next()?.strip_prefix("r=")?; + + // Validate but ignore auth extensions + validate_sasl_extensions(parts)?; + + let proof = base64_decode_array(proof.strip_prefix("p=")?)?; + + Some(Self { + without_proof, + channel_binding, + nonce, + proof, + }) + } + + /// Build a response to [`ClientFinalMessage`]. + pub fn build_server_final_message( + &self, + signature_builder: SignatureBuilder, + server_key: &ScramKey, + ) -> String { + let mut buf = String::from("v="); + base64::encode_config_buf( + signature_builder.build(server_key), + base64::STANDARD, + &mut buf, + ); + + buf + } +} + +/// We need to keep a convenient representation of this +/// message for the next authentication step. +pub struct OwnedServerFirstMessage { + /// Owned `server-first-message`. + message: String, + /// Slice into `message`. + nonce: Range, +} + +impl OwnedServerFirstMessage { + /// Extract combined nonce from the message. + #[inline(always)] + pub fn nonce(&self) -> &str { + &self.message[self.nonce.clone()] + } + + /// Get reference to a text representation of the message. + #[inline(always)] + pub fn as_str(&self) -> &str { + &self.message + } +} + +impl fmt::Debug for OwnedServerFirstMessage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ServerFirstMessage") + .field("message", &self.as_str()) + .field("nonce", &self.nonce()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_client_first_message() { + use ChannelBinding::*; + + // (Almost) real strings captured during debug sessions + let cases = [ + (NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + ( + Required("tls-server-end-point"), + "p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju", + ), + ]; + + for (cb, input) in cases { + let msg = ClientFirstMessage::parse(input).unwrap(); + + assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju"); + assert_eq!(msg.username, "pepe"); + assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); + assert_eq!(msg.cbind_flag, cb); + } + } + + #[test] + fn parse_client_final_message() { + let input = [ + "c=eSws", + "r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU", + "p=SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=", + ] + .join(","); + + let msg = ClientFinalMessage::parse(&input).unwrap(); + assert_eq!( + msg.without_proof, + "c=eSws,r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" + ); + assert_eq!( + msg.nonce, + "iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU" + ); + assert_eq!( + base64::encode(msg.proof), + "SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=" + ); + } +} diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs new file mode 100644 index 0000000000..656780d853 --- /dev/null +++ b/proxy/src/scram/password.rs @@ -0,0 +1,48 @@ +//! Password hashing routines. + +use super::key::ScramKey; + +pub const SALTED_PASSWORD_LEN: usize = 32; + +/// Salted hashed password is essential for [key](super::key) derivation. 
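It is derived with PBKDF2 using HMAC-SHA-256 as the PRF and a single output block (the `Hi()` function of RFC 5802), which is exactly what `SaltedPassword::new` below implements: `U1 = HMAC(password, salt || INT(1))`, `Ui = HMAC(password, Ui-1)`, and the result is the XOR of all `Ui`. A standalone sketch of the same loop, assuming the `hmac`/`sha2` crates; it is an illustration, not a substitute for the module's routine:

    use hmac::{Hmac, Mac};
    use sha2::Sha256;

    fn prf(key: &[u8], msg: &[u8]) -> [u8; 32] {
        let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("HMAC accepts keys of any size");
        mac.update(msg);
        let mut out = [0u8; 32];
        out.copy_from_slice(mac.finalize().into_bytes().as_slice());
        out
    }

    /// PBKDF2-HMAC-SHA-256 with a single 32-byte output block, as used by SCRAM.
    fn salted_password(password: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
        // U1 = HMAC(password, salt || INT(1))
        let mut block = Vec::with_capacity(salt.len() + 4);
        block.extend_from_slice(salt);
        block.extend_from_slice(&1u32.to_be_bytes());

        let mut u = prf(password, &block);
        let mut result = u;
        for _ in 1..iterations {
            // Ui = HMAC(password, U(i-1)); result ^= Ui
            u = prf(password, &u);
            for (r, x) in result.iter_mut().zip(u.iter()) {
                *r ^= x;
            }
        }
        result
    }

`ClientKey` and `ServerKey` are then single HMACs of this value over the literal strings "Client Key" and "Server Key", as the two methods below show.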
+#[repr(transparent)] +pub struct SaltedPassword { + bytes: [u8; SALTED_PASSWORD_LEN], +} + +impl SaltedPassword { + /// See `scram-common.c : scram_SaltedPassword` for details. + /// Further reading: (see `PBKDF2`). + pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { + let one = 1_u32.to_be_bytes(); // magic + + let mut current = super::hmac_sha256(password, [salt, &one]); + let mut result = current; + for _ in 1..iterations { + current = super::hmac_sha256(password, [current.as_ref()]); + // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 + for (i, x) in current.iter().enumerate() { + result[i] ^= x; + } + } + + result.into() + } + + /// Derive `ClientKey` from a salted hashed password. + pub fn client_key(&self) -> ScramKey { + super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() + } + + /// Derive `ServerKey` from a salted hashed password. + pub fn server_key(&self) -> ScramKey { + super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() + } +} + +impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { + #[inline(always)] + fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { + Self { bytes } + } +} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs new file mode 100644 index 0000000000..765aef4443 --- /dev/null +++ b/proxy/src/scram/secret.rs @@ -0,0 +1,116 @@ +//! Tools for SCRAM server secret management. + +use super::base64_decode_array; +use super::key::ScramKey; + +/// Server secret is produced from [password](super::password::SaltedPassword) +/// and is used throughout the authentication process. +pub struct ServerSecret { + /// Number of iterations for `PBKDF2` function. + pub iterations: u32, + /// Salt used to hash user's password. + pub salt_base64: String, + /// Hashed `ClientKey`. + pub stored_key: ScramKey, + /// Used by client to verify server's signature. + pub server_key: ScramKey, +} + +impl ServerSecret { + pub fn parse(input: &str) -> Option { + // SCRAM-SHA-256$:$: + let s = input.strip_prefix("SCRAM-SHA-256$")?; + let (params, keys) = s.split_once('$')?; + + let ((iterations, salt), (stored_key, server_key)) = + params.split_once(':').zip(keys.split_once(':'))?; + + let secret = ServerSecret { + iterations: iterations.parse().ok()?, + salt_base64: salt.to_owned(), + stored_key: base64_decode_array(stored_key)?.into(), + server_key: base64_decode_array(server_key)?.into(), + }; + + Some(secret) + } + + /// To avoid revealing information to an attacker, we use a + /// mocked server secret even if the user doesn't exist. + /// See `auth-scram.c : mock_scram_secret` for details. + #[allow(dead_code)] + pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { + // Refer to `auth-scram.c : scram_mock_salt`. + let mocked_salt = super::sha256([user.as_bytes(), nonce]); + + Self { + iterations: 4096, + salt_base64: base64::encode(&mocked_salt), + stored_key: ScramKey::default(), + server_key: ScramKey::default(), + } + } + + /// Build a new server secret from the prerequisites. + /// XXX: We only use this function in tests. 
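`ServerSecret::parse` above understands the verifier format that PostgreSQL keeps in `pg_authid.rolpassword`: `SCRAM-SHA-256$<iterations>:<salt_base64>$<StoredKey_base64>:<ServerKey_base64>`. A small round-trip sketch in the spirit of the unit tests further down; the key bytes here are placeholders, not real keys:

    // Round-trip sketch using `ServerSecret::parse` from this module.
    fn demo() -> Option<()> {
        let iterations = 4096u32;
        let salt_b64 = "c2FsdA=="; // base64("salt")
        let stored_key_b64 = base64::encode([0u8; 32]);
        let server_key_b64 = base64::encode([1u8; 32]);

        // SCRAM-SHA-256$<iterations>:<salt>$<StoredKey>:<ServerKey>
        let verifier = format!(
            "SCRAM-SHA-256${}:{}${}:{}",
            iterations, salt_b64, stored_key_b64, server_key_b64
        );

        let secret = ServerSecret::parse(&verifier)?;
        assert_eq!(secret.iterations, iterations);
        assert_eq!(secret.salt_base64, salt_b64);
        Some(())
    }

The `build` helper just below produces the same four components from a plaintext password, which is why it is handy in tests.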
+ #[cfg(test)] + pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { + // TODO: implement proper password normalization required by the RFC + if !password.is_ascii() { + return None; + } + + let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); + + Some(Self { + iterations, + salt_base64: base64::encode(&salt), + stored_key: password.client_key().sha256(), + server_key: password.server_key(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_scram_secret() { + let iterations = 4096; + let salt = "+/tQQax7twvwTj64mjBsxQ=="; + let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns="; + let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI="; + + let secret = format!( + "SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}", + iterations = iterations, + salt = salt, + stored_key = stored_key, + server_key = server_key, + ); + + let parsed = ServerSecret::parse(&secret).unwrap(); + assert_eq!(parsed.iterations, iterations); + assert_eq!(parsed.salt_base64, salt); + + assert_eq!(base64::encode(parsed.stored_key), stored_key); + assert_eq!(base64::encode(parsed.server_key), server_key); + } + + #[test] + fn build_scram_secret() { + let salt = b"salt"; + let secret = ServerSecret::build("password", salt, 4096).unwrap(); + assert_eq!(secret.iterations, 4096); + assert_eq!(secret.salt_base64, base64::encode(salt)); + assert_eq!( + base64::encode(secret.stored_key.as_ref()), + "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" + ); + assert_eq!( + base64::encode(secret.server_key.as_ref()), + "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" + ); + } +} diff --git a/proxy/src/scram/signature.rs b/proxy/src/scram/signature.rs new file mode 100644 index 0000000000..1c2811d757 --- /dev/null +++ b/proxy/src/scram/signature.rs @@ -0,0 +1,66 @@ +//! Tools for client/server signature management. + +use super::key::{ScramKey, SCRAM_KEY_LEN}; + +/// A collection of message parts needed to derive the client's signature. +#[derive(Debug)] +pub struct SignatureBuilder<'a> { + pub client_first_message_bare: &'a str, + pub server_first_message: &'a str, + pub client_final_message_without_proof: &'a str, +} + +impl SignatureBuilder<'_> { + pub fn build(&self, key: &ScramKey) -> Signature { + let parts = [ + self.client_first_message_bare.as_bytes(), + b",", + self.server_first_message.as_bytes(), + b",", + self.client_final_message_without_proof.as_bytes(), + ]; + + super::hmac_sha256(key.as_ref(), parts).into() + } +} + +/// A computed value which, when xored with `ClientProof`, +/// produces `ClientKey` that we need for authentication. +#[derive(Debug)] +#[repr(transparent)] +pub struct Signature { + bytes: [u8; SCRAM_KEY_LEN], +} + +impl Signature { + /// Derive `ClientKey` from client's signature and proof. + pub fn derive_client_key(&self, proof: &[u8; SCRAM_KEY_LEN]) -> ScramKey { + // This is how the proof is calculated: + // + // 1. sha256(ClientKey) -> StoredKey + // 2. hmac_sha256(StoredKey, [messages...]) -> ClientSignature + // 3. ClientKey ^ ClientSignature -> ClientProof + // + // Step 3 implies that we can restore ClientKey from the proof + // by xoring the latter with the ClientSignature. Afterwards we + // can check that the presumed ClientKey meets our expectations. 
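+        //
+        // One-byte illustration: if ClientKey = 0xAB and ClientSignature = 0x5D,
+        // the client sends ClientProof = 0xAB ^ 0x5D = 0xF6, and the server
+        // recovers 0xF6 ^ 0x5D = 0xAB, which must hash to the StoredKey on file.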
+ let mut signature = self.bytes; + for (i, x) in proof.iter().enumerate() { + signature[i] ^= x; + } + + signature.into() + } +} + +impl From<[u8; SCRAM_KEY_LEN]> for Signature { + fn from(bytes: [u8; SCRAM_KEY_LEN]) -> Self { + Self { bytes } + } +} + +impl AsRef<[u8]> for Signature { + fn as_ref(&self) -> &[u8] { + &self.bytes + } +} diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8fd5bef388..8e4084775c 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,13 +1,15 @@ -use anyhow::Context; +use crate::error::UserFacingError; +use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; -use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; pin_project! { /// Stream wrapper which implements libpq's protocol. @@ -35,38 +37,63 @@ impl PqStream { self.stream } - /// Get a reference to the underlying stream. + /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } } +fn err_connection() -> io::Error { + io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") +} + +// TODO: change error type of `FeMessage::read_fut` +fn from_anyhow(e: anyhow::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. - pub async fn read_startup_packet(&mut self) -> anyhow::Result { - match FeStartupPacket::read_fut(&mut self.stream).await? { - Some(FeMessage::StartupPacket(packet)) => Ok(packet), - None => anyhow::bail!("connection is lost"), - other => anyhow::bail!("bad message type: {:?}", other), + pub async fn read_startup_packet(&mut self) -> io::Result { + // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` + let msg = FeStartupPacket::read_fut(&mut self.stream) + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection)?; + + match msg { + FeMessage::StartupPacket(packet) => Ok(packet), + _ => panic!("unreachable state"), } } - pub async fn read_message(&mut self) -> anyhow::Result { + pub async fn read_password_message(&mut self) -> io::Result { + match self.read_message().await? { + FeMessage::PasswordMessage(msg) => Ok(msg), + bad => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected message type: {:?}", bad), + )), + } + } + + async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) - .await? - .context("connection is lost") + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection) } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buffer, message)?; Ok(self) } /// Write the message into an internal buffer and flush it. 
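The read path now returns plain `io::Result`s: `read_startup_packet` yields the client's startup packet, and `read_password_message` insists on a password message, rejecting anything else with `ErrorKind::InvalidData`. A hedged fragment of how an authentication handler might use it; construction of the `PqStream` and the surrounding auth logic are omitted:

    use tokio::io::{AsyncRead, AsyncWrite};

    // `PqStream` is the protocol wrapper from proxy/src/stream.rs in this diff.
    async fn read_client_credentials<S: AsyncRead + AsyncWrite + Unpin>(
        stream: &mut PqStream<S>,
    ) -> std::io::Result<()> {
        // The first packet on a fresh connection is a startup packet.
        let _startup = stream.read_startup_packet().await?;

        // Once the server has asked for a password, the next message must be
        // a password message; any other type comes back as InvalidData.
        let _password = stream.read_password_message().await?;
        Ok(())
    }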
- pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { self.write_message_noflush(message)?; self.flush().await?; Ok(self) @@ -79,6 +106,25 @@ impl PqStream { self.stream.flush().await?; Ok(self) } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Allowing string literals is safe under the assumption they might not contain any runtime info. + pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { + // This method exists due to `&str` not implementing `Into` + self.write_message(&BeMessage::ErrorResponse(error)).await?; + bail!(error) + } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Trait [`UserFacingError`] acts as an allowlist for error types. + pub async fn throw_error(&mut self, error: E) -> anyhow::Result + where + E: UserFacingError + Into, + { + let msg = error.to_string_client(); + self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + bail!(error) + } } pin_project! { @@ -99,17 +145,35 @@ impl Stream { pub fn from_raw(raw: S) -> Self { Self::Raw { raw } } + + /// Return SNI hostname when it's available. + pub fn sni_hostname(&self) -> Option<&str> { + match self { + Stream::Raw { .. } => None, + Stream::Tls { tls } => tls.get_ref().1.sni_hostname(), + } + } +} + +#[derive(Debug, Error)] +#[error("Can't upgrade TLS stream")] +pub enum StreamUpgradeError { + #[error("Bad state reached: can't upgrade TLS stream")] + AlreadyTls, + + #[error("Can't upgrade stream: IO error: {0}")] + Io(#[from] io::Error), } impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> anyhow::Result { + pub async fn upgrade(self, cfg: Arc) -> Result { match self { Stream::Raw { raw } => { let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?); Ok(Stream::Tls { tls }) } - Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"), + Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } } @@ -167,7 +231,7 @@ impl AsyncWrite for Stream { pin_project! { /// This stream tracks all writes and calls user provided /// callback when the underlying stream is flushed. - pub struct MetricsStream { + pub struct MeasuredStream { #[pin] stream: S, write_count: usize, @@ -175,7 +239,7 @@ pin_project! { } } -impl MetricsStream { +impl MeasuredStream { pub fn new(stream: S, inc_write_count: W) -> Self { Self { stream, @@ -185,7 +249,7 @@ impl MetricsStream { } } -impl AsyncRead for MetricsStream { +impl AsyncRead for MeasuredStream { fn poll_read( self: Pin<&mut Self>, context: &mut task::Context<'_>, @@ -195,7 +259,7 @@ impl AsyncRead for MetricsStream { } } -impl AsyncWrite for MetricsStream { +impl AsyncWrite for MeasuredStream { fn poll_write( self: Pin<&mut Self>, context: &mut task::Context<'_>, diff --git a/proxy/src/url.rs b/proxy/src/url.rs new file mode 100644 index 0000000000..92c64bb8ad --- /dev/null +++ b/proxy/src/url.rs @@ -0,0 +1,74 @@ +use anyhow::bail; + +/// A [url](url::Url) type with additional guarantees. +#[repr(transparent)] +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ApiUrl(url::Url); + +impl ApiUrl { + /// Consume the wrapper and return inner [url](url::Url). + pub fn into_inner(self) -> url::Url { + self.0 + } + + /// See [`url::Url::path_segments_mut`]. 
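`Stream::upgrade` now reports a dedicated `StreamUpgradeError` instead of a generic `anyhow` error, and `sni_hostname` exposes the SNI name the client sent during the TLS handshake. A sketch of the intended accept path; the function and its error handling are illustrative, not code from this patch:

    use std::sync::Arc;
    use rustls::ServerConfig;
    use tokio::net::TcpStream;

    // `Stream` and `StreamUpgradeError` are the types from proxy/src/stream.rs above.
    async fn accept_tls(
        socket: TcpStream,
        tls_config: Arc<ServerConfig>,
    ) -> Result<(), StreamUpgradeError> {
        // Wrap the raw TCP socket and upgrade it to TLS.
        let stream = Stream::from_raw(socket).upgrade(tls_config).await?;

        // After the handshake the client's SNI hostname (if any) is available,
        // e.g. to pick the endpoint this connection belongs to.
        if let Some(sni) = stream.sni_hostname() {
            println!("client requested {}", sni);
        }
        Ok(())
    }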
+ pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + // We've already verified that it works during construction. + self.0.path_segments_mut().expect("bad API url") + } +} + +/// This instance imposes additional requirements on the url. +impl std::str::FromStr for ApiUrl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let mut url: url::Url = s.parse()?; + + // Make sure that we can build upon this URL. + if url.path_segments_mut().is_err() { + bail!("bad API url provided"); + } + + Ok(Self(url)) + } +} + +/// This instance is safe because it doesn't allow us to modify the object. +impl std::ops::Deref for ApiUrl { + type Target = url::Url; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for ApiUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bad_url() { + let url = "test:foobar"; + url.parse::().expect("unexpected parsing failure"); + let _ = url.parse::().expect_err("should not parse"); + } + + #[test] + fn good_url() { + let url = "test://foobar"; + let mut a = url.parse::().expect("unexpected parsing failure"); + let mut b = url.parse::().expect("unexpected parsing failure"); + + a.path_segments_mut().unwrap().push("method"); + b.path_segments_mut().push("method"); + + assert_eq!(a, b.into_inner()); + } +} diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9fda3ed94f..bba5494cfe 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,11 +1,32 @@ -use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use std::pin::Pin; use std::task; +use thiserror::Error; use tokio::sync::oneshot; +#[derive(Debug, Error)] +pub enum RegisterError { + #[error("Waiter `{0}` already registered")] + Occupied(String), +} + +#[derive(Debug, Error)] +pub enum NotifyError { + #[error("Notify failed: waiter `{0}` not registered")] + NotFound(String), + + #[error("Notify failed: channel hangup")] + Hangup, +} + +#[derive(Debug, Error)] +pub enum WaitError { + #[error("Wait failed: channel hangup")] + Hangup, +} + pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { @@ -15,13 +36,13 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> anyhow::Result> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) - .map_err(|_| anyhow!("waiter already registered"))?; + .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, @@ -32,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()> + pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -40,9 +61,9 @@ impl Waiters { .0 .lock() .remove(key) - .with_context(|| format!("key {} not found", key))?; + .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; - tx.send(value).map_err(|_| anyhow!("waiter channel hangup")) + tx.send(value).map_err(|_| NotifyError::Hangup) } } @@ -66,13 +87,13 @@ pin_project! 
{ } impl std::future::Future for Waiter<'_, T> { - type Output = anyhow::Result; + type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) - .map_err(|_| anyhow!("channel hangup")) + .map_err(|_| WaitError::Hangup) } } @@ -94,7 +115,7 @@ mod tests { Ok(()) }); - let () = waiter.await?; + waiter.await?; notifier.await? } } diff --git a/pyproject.toml b/pyproject.toml index 7dbdcc0304..b13acece18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,33 +1,82 @@ [tool.poetry] -name = "zenith" +name = "neon" version = "0.1.0" description = "" authors = [] [tool.poetry.dependencies] -python = "^3.7" +python = "^3.9" pytest = "^6.2.5" psycopg2-binary = "^2.9.1" -typing-extensions = "^3.10.0" +typing-extensions = "^4.1.0" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.26.0" -pytest-xdist = "^2.3.0" +pytest-xdist = "^3.0.2" asyncpg = "^0.24.0" aiopg = "^1.3.1" -cached-property = "^1.5.2" Jinja2 = "^3.0.2" -types-requests = "^2.27.7" -types-psycopg2 = "^2.9.6" +types-requests = "^2.28.5" +types-psycopg2 = "^2.9.18" boto3 = "^1.20.40" -boto3-stubs = "^1.20.40" +boto3-stubs = {version = "^1.23.38", extras = ["s3"]} moto = {version = "^3.0.0", extras = ["server"]} backoff = "^1.11.1" +pytest-lazy-fixture = "^0.6.3" +prometheus-client = "^0.14.1" +pytest-timeout = "^2.1.0" +Werkzeug = "2.1.2" +pytest-order = "^1.0.1" +allure-pytest = "^2.10.0" +pytest-asyncio = "^0.19.0" +toml = "^0.10.2" +psutil = "^5.9.4" +types-psutil = "^5.9.5.4" [tool.poetry.dev-dependencies] -yapf = "==0.31.0" -flake8 = "^3.9.2" -mypy = "==0.910" +flake8 = "^5.0.4" +mypy = "==0.971" +black = "^22.6.0" +isort = "^5.10.1" +types-toml = "^0.10.8" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 100 +extend-exclude = ''' +/( + vendor +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true +skip = [ + "vendor", +] + +[tool.mypy] +# mypy uses regex +exclude = "^vendor/" +# some tests don't typecheck when this flag is set +check_untyped_defs = false +# Help mypy find imports when running against list of individual files. +# Without this line it would behave differently when executed on the entire project. 
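Back in `waiters.rs`, the module now reports failures through the dedicated `RegisterError`, `NotifyError` and `WaitError` types instead of `anyhow`. A usage sketch of the register/notify/await flow; the concrete key and value here are made up for illustration:

    // `Waiters` and its error types are from proxy/src/waiters.rs above.
    async fn demo(waiters: &Waiters<String>) -> anyhow::Result<()> {
        // Register first so the notification cannot be missed.
        let waiter = waiters.register("session-1".to_owned())?;

        // Some other task (e.g. an HTTP callback handler) delivers the value.
        waiters.notify("session-1", "done".to_owned())?;

        // The waiter future resolves with whatever was delivered.
        let value = waiter.await?;
        assert_eq!(value, "done");
        Ok(())
    }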
+mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" + +disallow_incomplete_defs = false +disallow_untyped_calls = false +disallow_untyped_decorators = false +disallow_untyped_defs = false +strict = true + +[[tool.mypy.overrides]] +module = [ + "asyncpg.*", + "pg8000.*", +] +ignore_missing_imports = true diff --git a/pytest.ini b/pytest.ini index abc69b765b..7197b078c6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,6 +1,11 @@ [pytest] +filterwarnings = + error::pytest.PytestUnhandledThreadExceptionWarning + error::UserWarning + ignore:record_property is incompatible with junit_family:pytest.PytestWarning addopts = -m 'not remote_cluster' + --ignore=test_runner/performance markers = remote_cluster testpaths = @@ -9,3 +14,4 @@ minversion = 6.0 log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s log_date_format = %Y-%m-%d %H:%M:%S log_cli = true +timeout = 300 diff --git a/run_clippy.sh b/run_clippy.sh index 4ca944c1f1..bf770432d0 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -9,7 +9,14 @@ # In vscode, this setting is Rust-analyzer>Check On Save:Command -# * `-A unknown_lints` – do not warn about unknown lint suppressions -# that people with newer toolchains might use -# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy "${@:2}" --all-targets --all-features --all --tests -- -A unknown_lints -D warnings +# Not every feature is supported in macOS builds, e.g. `profiling`, +# avoid running regular linting script that checks every feature. +if [[ "$OSTYPE" == "darwin"* ]]; then + # no extra features to test currently, add more here when needed + cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings +else + # * `-A unknown_lints` – do not warn about unknown lint suppressions + # that people with newer toolchains might use + # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) + cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings +fi diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..928a10e555 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,12 @@ +[toolchain] +# We try to stick to a toolchain version that is widely available on popular distributions, so that most people +# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later +# version, we can consider updating. +# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, +# we use "unstable" version number as the highest version used in the project by default. +channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value +profile = "default" +# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
+# https://rust-lang.github.io/rustup/concepts/profiles.html +# but we also need `llvm-tools-preview` for coverage data merges on CI +components = ["llvm-tools-preview", "rustfmt", "clippy"] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml new file mode 100644 index 0000000000..658bdfe42c --- /dev/null +++ b/safekeeper/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "safekeeper" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0" +async-trait = "0.1" +byteorder = "1.4.3" +bytes = "1.0.1" +clap = "4.0" +const_format = "0.2.21" +crc32c = "0.6.0" +fs2 = "0.4.3" +git-version = "0.3.5" +hex = "0.4.3" +humantime = "2.1.0" +hyper = "0.14" +nix = "0.25" +once_cell = "1.13.0" +parking_lot = "0.12.1" +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +regex = "1.4.5" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1" +serde_with = "2.0" +signal-hook = "0.3.10" +thiserror = "1" +tokio = { version = "1.17", features = ["macros", "fs"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +toml_edit = { version = "0.14", features = ["easy"] } +tracing = "0.1.27" +url = "2.2.2" + +etcd_broker = { path = "../libs/etcd_broker" } +metrics = { path = "../libs/metrics" } +postgres_ffi = { path = "../libs/postgres_ffi" } +pq_proto = { path = "../libs/pq_proto" } +remote_storage = { path = "../libs/remote_storage" } +safekeeper_api = { path = "../libs/safekeeper_api" } +utils = { path = "../libs/utils" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } + +[dev-dependencies] +tempfile = "3.2" diff --git a/walkeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg similarity index 100% rename from walkeeper/spec/ProposerAcceptorConsensus.cfg rename to safekeeper/spec/ProposerAcceptorConsensus.cfg diff --git a/walkeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla similarity index 99% rename from walkeeper/spec/ProposerAcceptorConsensus.tla rename to safekeeper/spec/ProposerAcceptorConsensus.tla index 993edfcf23..e5f0bb270f 100644 --- a/walkeeper/spec/ProposerAcceptorConsensus.tla +++ b/safekeeper/spec/ProposerAcceptorConsensus.tla @@ -88,7 +88,7 @@ TypeOk == \* in campaign proposer sends RequestVote and waits for acks; \* in leader he is elected /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbouned model, but TLC won't + \* 0..max_term should be actually Nat in the unbounded model, but TLC won't \* swallow it /\ prop_state[p].term \in 0..max_term \* votes received diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs new file mode 100644 index 0000000000..42f8188d6a --- /dev/null +++ b/safekeeper/src/bin/safekeeper.rs @@ -0,0 +1,433 @@ +// +// Main entry point for the safekeeper executable +// +use anyhow::{bail, Context, Result}; +use clap::{value_parser, Arg, ArgAction, Command}; +use const_format::formatcp; +use nix::unistd::Pid; +use remote_storage::RemoteStorageConfig; +use std::fs::{self, File}; +use std::io::{ErrorKind, Write}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::thread; +use tokio::sync::mpsc; +use toml_edit::Document; +use tracing::*; +use url::{ParseError, Url}; +use utils::lock_file; + +use 
metrics::set_build_info_metric; +use safekeeper::broker; +use safekeeper::control_file; +use safekeeper::defaults::{ + DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, + DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +use safekeeper::http; +use safekeeper::remove_wal; +use safekeeper::wal_backup; +use safekeeper::wal_service; +use safekeeper::GlobalTimelines; +use safekeeper::SafeKeeperConf; +use utils::auth::JwtAuth; +use utils::{ + http::endpoint, + id::NodeId, + logging::{self, LogFormat}, + project_git_version, signals, tcp_listener, +}; + +const PID_FILE_NAME: &str = "safekeeper.pid"; +const ID_FILE_NAME: &str = "safekeeper.id"; +project_git_version!(GIT_VERSION); + +fn main() -> anyhow::Result<()> { + let arg_matches = cli().get_matches(); + + if let Some(addr) = arg_matches.get_one::("dump-control-file") { + let state = control_file::FileStorage::load_control_file(Path::new(addr))?; + let json = serde_json::to_string(&state)?; + print!("{json}"); + return Ok(()); + } + + let mut conf = SafeKeeperConf::default(); + + if let Some(dir) = arg_matches.get_one::("datadir") { + // change into the data directory. + std::env::set_current_dir(dir)?; + } + + if arg_matches.get_flag("no-sync") { + conf.no_sync = true; + } + + if let Some(addr) = arg_matches.get_one::("listen-pg") { + conf.listen_pg_addr = addr.to_string(); + } + + if let Some(addr) = arg_matches.get_one::("listen-http") { + conf.listen_http_addr = addr.to_string(); + } + + let mut given_id = None; + if let Some(given_id_str) = arg_matches.get_one::("id") { + given_id = Some(NodeId( + given_id_str + .parse() + .context("failed to parse safekeeper id")?, + )); + } + + if let Some(addr) = arg_matches.get_one::("broker-endpoints") { + let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); + conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; + } + if let Some(prefix) = arg_matches.get_one::("broker-etcd-prefix") { + conf.broker_etcd_prefix = prefix.to_string(); + } + + if let Some(heartbeat_timeout_str) = arg_matches.get_one::("heartbeat-timeout") { + conf.heartbeat_timeout = + humantime::parse_duration(heartbeat_timeout_str).with_context(|| { + format!( + "failed to parse heartbeat-timeout {}", + heartbeat_timeout_str + ) + })?; + } + + if let Some(backup_threads) = arg_matches.get_one::("wal-backup-threads") { + conf.backup_runtime_threads = backup_threads + .parse() + .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; + } + if let Some(storage_conf) = arg_matches.get_one::("remote-storage") { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); + } + if let Some(max_offloader_lag_str) = arg_matches.get_one::("max-offloader-lag") { + conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| { + format!( + "failed to parse max offloader lag {}", + max_offloader_lag_str + ) + })?; + } + // Seems like there is no better way to accept bool values explicitly in clap. 
+ conf.wal_backup_enabled = arg_matches + .get_one::("enable-wal-backup") + .unwrap() + .parse() + .context("failed to parse bool enable-s3-offload bool")?; + + conf.auth_validation_public_key_path = arg_matches + .get_one::("auth-validation-public-key-path") + .map(PathBuf::from); + + if let Some(log_format) = arg_matches.get_one::("log-format") { + conf.log_format = LogFormat::from_config(log_format)?; + } + + start_safekeeper(conf, given_id, arg_matches.get_flag("init")) +} + +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { + logging::init(conf.log_format)?; + info!("version: {GIT_VERSION}"); + + // Prevent running multiple safekeepers on the same directory + let lock_file_path = conf.workdir.join(PID_FILE_NAME); + let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { + lock_file::LockCreationResult::Created { + new_lock_contents, + file, + } => { + info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); + file + } + lock_file::LockCreationResult::AlreadyLocked { + existing_lock_contents, + } => anyhow::bail!( + "Could not lock pid file; safekeeper is already running in {:?} with PID {}", + conf.workdir, + existing_lock_contents + ), + lock_file::LockCreationResult::CreationFailed(e) => { + return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) + } + }; + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + let _ = Box::leak(Box::new(lock_file)); + + // Set or read our ID. + set_id(&mut conf, given_id)?; + if init { + return Ok(()); + } + + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_http_addr, e); + e + })?; + + info!("Starting safekeeper on {}", conf.listen_pg_addr); + let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { + error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + e + })?; + + let auth = match conf.auth_validation_public_key_path.as_ref() { + None => { + info!("Auth is disabled"); + None + } + Some(path) => { + info!("Loading JWT auth key from {}", path.display()); + Some(Arc::new( + JwtAuth::from_key_path(path).context("failed to load the auth key")?, + )) + } + }; + + // Register metrics collector for active timelines. It's important to do this + // after daemonizing, otherwise process collector will be upset. + let timeline_collector = safekeeper::metrics::TimelineCollector::new(); + metrics::register_internal(Box::new(timeline_collector))?; + + let signals = signals::install_shutdown_handlers()?; + let mut threads = vec![]; + let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + + // Load all timelines from disk to memory. 
+ GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?; + + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("http_endpoint_thread".into()) + .spawn(|| { + let router = http::make_router(conf_, auth); + endpoint::serve_thread_main( + router, + http_listener, + std::future::pending(), // never shut down + ) + .unwrap(); + })?, + ); + + let conf_cloned = conf.clone(); + let safekeeper_thread = thread::Builder::new() + .name("Safekeeper thread".into()) + .spawn(|| { + // TODO: add auth + if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) { + info!("safekeeper thread terminated: {e}"); + } + }) + .unwrap(); + + threads.push(safekeeper_thread); + + if !conf.broker_endpoints.is_empty() { + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("broker thread".into()) + .spawn(|| { + // TODO: add auth? + broker::thread_main(conf_); + })?, + ); + } else { + warn!("No broker endpoints providing, starting without node sync") + } + + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("WAL removal thread".into()) + .spawn(|| { + // TODO: add auth? + remove_wal::thread_main(conf_); + })?, + ); + + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("wal backup launcher thread".into()) + .spawn(move || { + // TODO: add auth? + wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + })?, + ); + + set_build_info_metric(GIT_VERSION); + // TODO: put more thoughts into handling of failed threads + // We probably should restart them. + + // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps + signals.handle(|signal| { + // TODO: implement graceful shutdown with joining threads etc + info!( + "Got {}. Terminating in immediate shutdown mode", + signal.name() + ); + std::process::exit(111); + }) +} + +/// Determine safekeeper id and set it in config. +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { + let id_file_path = conf.workdir.join(ID_FILE_NAME); + + let my_id: NodeId; + // If ID exists, read it in; otherwise set one passed + match fs::read(&id_file_path) { + Ok(id_serialized) => { + my_id = NodeId( + std::str::from_utf8(&id_serialized) + .context("failed to parse safekeeper id")? 
+ .parse() + .context("failed to parse safekeeper id")?, + ); + if let Some(given_id) = given_id { + if given_id != my_id { + bail!( + "safekeeper already initialized with id {}, can't set {}", + my_id, + given_id + ); + } + } + info!("safekeeper ID {}", my_id); + } + Err(error) => match error.kind() { + ErrorKind::NotFound => { + my_id = if let Some(given_id) = given_id { + given_id + } else { + bail!("safekeeper id is not specified"); + }; + let mut f = File::create(&id_file_path)?; + f.write_all(my_id.to_string().as_bytes())?; + f.sync_all()?; + info!("initialized safekeeper ID {}", my_id); + } + _ => { + return Err(error.into()); + } + }, + } + conf.my_id = my_id; + Ok(()) +} + +fn cli() -> Command { + Command::new("Neon safekeeper") + .about("Store WAL stream to local file system and push it to WAL receivers") + .version(GIT_VERSION) + .arg( + Arg::new("datadir") + .short('D') + .long("dir") + .value_parser(value_parser!(PathBuf)) + .help("Path to the safekeeper data directory"), + ) + .arg( + Arg::new("init") + .long("init") + .action(ArgAction::SetTrue) + .help("Initialize safekeeper with ID"), + ) + .arg( + Arg::new("listen-pg") + .short('l') + .long("listen-pg") + .alias("listen") // for compatibility + .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), + ) + .arg( + Arg::new("listen-http") + .long("listen-http") + .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), + ) + // FIXME this argument is no longer needed since pageserver address is forwarded from compute. + // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. + // So currently it is a noop. + .arg( + Arg::new("pageserver") + .short('p') + .long("pageserver"), + ) + .arg( + Arg::new("no-sync") + .short('n') + .long("no-sync") + .action(ArgAction::SetTrue) + .help("Do not wait for changes to be written safely to disk"), + ) + .arg( + Arg::new("dump-control-file") + .long("dump-control-file") + .help("Dump control file at path specified by this argument and exit"), + ) + .arg( + Arg::new("id").long("id").help("safekeeper node id: integer") + ).arg( + Arg::new("broker-endpoints") + .long("broker-endpoints") + .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), + ) + .arg( + Arg::new("broker-etcd-prefix") + .long("broker-etcd-prefix") + .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), + ) + .arg( + Arg::new("heartbeat-timeout") + .long("heartbeat-timeout") + .help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs())) + ) + .arg( + Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. 
{\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("max-offloader-lag") + .long("max-offloader-lag") + .help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20))) + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") + .default_value("true") + .default_missing_value("true") + .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), + ) + .arg( + Arg::new("auth-validation-public-key-path") + .long("auth-validation-public-key-path") + .help("Path to an RSA .pem public key which is used to check JWT tokens") + ) + .arg( + Arg::new("log-format") + .long("log-format") + .help("Format for logging, either 'plain' or 'json'") + ) +} + +#[test] +fn verify_cli() { + cli().debug_assert(); +} diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs new file mode 100644 index 0000000000..76135241b9 --- /dev/null +++ b/safekeeper/src/broker.rs @@ -0,0 +1,232 @@ +//! Communication with etcd, providing safekeeper peers and pageserver coordination. + +use anyhow::Context; +use anyhow::Error; +use anyhow::Result; +use etcd_broker::subscription_value::SkTimelineInfo; +use etcd_broker::LeaseKeepAliveStream; +use etcd_broker::LeaseKeeper; + +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::collections::HashSet; +use std::time::Duration; +use tokio::task::JoinHandle; +use tokio::{runtime, time::sleep}; +use tracing::*; + +use crate::GlobalTimelines; +use crate::SafeKeeperConf; +use etcd_broker::{ + subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, + Client, PutOptions, +}; +use utils::id::{NodeId, TenantTimelineId}; + +const RETRY_INTERVAL_MSEC: u64 = 1000; +const PUSH_INTERVAL_MSEC: u64 = 1000; +const LEASE_TTL_SEC: i64 = 10; + +pub fn thread_main(conf: SafeKeeperConf) { + let runtime = runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let _enter = info_span!("broker").entered(); + info!("started, broker endpoints {:?}", conf.broker_endpoints); + + runtime.block_on(async { + main_loop(conf).await; + }); +} + +/// Key to per timeline per safekeeper data. +fn timeline_safekeeper_path( + broker_etcd_prefix: String, + ttid: TenantTimelineId, + sk_id: NodeId, +) -> String { + format!( + "{}/{sk_id}", + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() + ) +} + +async fn push_sk_info( + ttid: TenantTimelineId, + mut client: Client, + key: String, + sk_info: SkTimelineInfo, + mut lease: Lease, +) -> anyhow::Result<(TenantTimelineId, Lease)> { + let put_opts = PutOptions::new().with_lease(lease.id); + client + .put( + key.clone(), + serde_json::to_string(&sk_info)?, + Some(put_opts), + ) + .await + .with_context(|| format!("failed to push safekeeper info to {}", key))?; + + // revive the lease + lease + .keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + lease + .ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + Ok((ttid, lease)) +} + +struct Lease { + id: i64, + keeper: LeaseKeeper, + ka_stream: LeaseKeepAliveStream, +} + +/// Push once in a while data about all active timelines to the broker. 
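Every per-timeline key is written under an etcd lease, so entries vanish on their own once a safekeeper stops refreshing them. A condensed sketch of the grant / put-with-lease / keep-alive sequence that `push_sk_info` above and `push_loop` below go through, using the `etcd_broker` re-exports imported at the top of this file:

    use etcd_broker::{Client, PutOptions};

    // Lease lifecycle around a single key; error handling and retries trimmed.
    async fn put_with_lease(client: &mut Client, key: String, value: String) -> anyhow::Result<()> {
        // Grant a lease with a 10 second TTL; the key disappears if we stop refreshing it.
        let lease = client.lease_grant(10, None).await?;
        let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;

        // Attach the value to that lease.
        let opts = PutOptions::new().with_lease(lease.id());
        client.put(key, value, Some(opts)).await?;

        // Periodically: send a keep-alive and wait for the acknowledgement.
        keeper.keep_alive().await?;
        ka_stream.message().await?;
        Ok(())
    }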
+async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + let mut client = Client::connect(&conf.broker_endpoints, None).await?; + let mut leases: HashMap = HashMap::new(); + + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + loop { + // Note: we lock runtime here and in timeline methods as GlobalTimelines + // is under plain mutex. That's ok, all this code is not performance + // sensitive and there is no risk of deadlock as we don't await while + // lock is held. + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + + let active_tlis_set: HashSet = + active_tlis.iter().map(|tli| tli.ttid).collect(); + + // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. + for tli in &active_tlis { + if let Entry::Vacant(v) = leases.entry(tli.ttid) { + let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; + let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; + v.insert(Lease { + id: lease.id(), + keeper, + ka_stream, + }); + } + } + leases.retain(|ttid, _| active_tlis_set.contains(ttid)); + + // Push data concurrently to not suffer from latency, with many timelines it can be slow. + let handles = active_tlis + .iter() + .map(|tli| { + let sk_info = tli.get_safekeeper_info(&conf); + let key = + timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); + let lease = leases.remove(&tli.ttid).unwrap(); + tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) + }) + .collect::>(); + for h in handles { + let (ttid, lease) = h.await??; + // It is ugly to pull leases from hash and then put it back, but + // otherwise we have to resort to long living per tli tasks (which + // would generate a lot of errors when etcd is down) as task wants to + // have 'static objects, we can't borrow to it. + leases.insert(ttid, lease); + } + + sleep(push_interval).await; + } +} + +/// Subscribe and fetch all the interesting data from the broker. +async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { + let mut client = Client::connect(&conf.broker_endpoints, None).await?; + + let mut subscription = etcd_broker::subscribe_for_values( + &mut client, + SubscriptionKey::all(conf.broker_etcd_prefix.clone()), + |full_key, value_str| { + if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) { + match serde_json::from_str::(value_str) { + Ok(new_info) => return Some(new_info), + Err(e) => { + error!("Failed to parse timeline info from value str '{value_str}': {e}") + } + } + } + None + }, + ) + .await + .context("failed to subscribe for safekeeper info")?; + loop { + match subscription.value_updates.recv().await { + Some(new_info) => { + // note: there are blocking operations below, but it's considered fine for now + if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { + // Note that we also receive *our own* info. That's + // important, as it is used as an indication of live + // connection to the broker. + tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) + .await? 
+ } + } + None => { + // XXX it means we lost connection with etcd, error is consumed inside sub object + debug!("timeline updates sender closed, aborting the pull loop"); + return Ok(()); + } + } + } +} + +async fn main_loop(conf: SafeKeeperConf) { + let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); + let mut push_handle: Option>> = None; + let mut pull_handle: Option>> = None; + // Selecting on JoinHandles requires some squats; is there a better way to + // reap tasks individually? + + // Handling failures in task itself won't catch panic and in Tokio, task's + // panic doesn't kill the whole executor, so it is better to do reaping + // here. + loop { + tokio::select! { + res = async { push_handle.as_mut().unwrap().await }, if push_handle.is_some() => { + // was it panic or normal error? + let err = match res { + Ok(res_internal) => res_internal.unwrap_err(), + Err(err_outer) => err_outer.into(), + }; + warn!("push task failed: {:?}", err); + push_handle = None; + }, + res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("pull task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } + }; + pull_handle = None; + }, + _ = ticker.tick() => { + if push_handle.is_none() { + push_handle = Some(tokio::spawn(push_loop(conf.clone()))); + } + if pull_handle.is_none() { + pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + } + } + } + } +} diff --git a/walkeeper/src/control_file.rs b/safekeeper/src/control_file.rs similarity index 54% rename from walkeeper/src/control_file.rs rename to safekeeper/src/control_file.rs index 6016e00d1d..6be3f9abb2 100644 --- a/walkeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,20 +2,16 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use lazy_static::lazy_static; use std::fs::{self, File, OpenOptions}; use std::io::{Read, Write}; +use std::ops::Deref; use std::path::{Path, PathBuf}; -use tracing::*; -use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; -use zenith_utils::bin_ser::LeSer; - -use zenith_utils::zid::ZTenantTimelineId; - use crate::control_file_upgrade::upgrade_control_file; +use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; +use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -27,25 +23,10 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -// A named boolean. -#[derive(Debug)] -pub enum CreateControlFile { - True, - False, -} - -lazy_static! { - static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!( - "safekeeper_persist_control_file_seconds", - "Seconds to persist and sync control file, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec"); -} - -pub trait Storage { - /// Persist safekeeper state on disk. +/// Storage should keep actual state inside of it. It should implement Deref +/// trait to access state fields and have persist method for updating that state. 
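The trait defined just below couples persistence with read access: an implementor keeps the last persisted `SafeKeeperState` and exposes it through `Deref`, so callers read fields directly and only go through `persist` to change them. A minimal in-memory sketch of such an implementor, the kind of thing a unit test might use; `InMemoryStorage` is a made-up name, not part of this patch:

    use std::ops::Deref;
    use anyhow::Result;

    // `Storage` and `SafeKeeperState` are the safekeeper types from this file.
    struct InMemoryStorage {
        state: SafeKeeperState,
        persist_calls: usize,
    }

    impl Deref for InMemoryStorage {
        type Target = SafeKeeperState;

        fn deref(&self) -> &Self::Target {
            &self.state
        }
    }

    impl Storage for InMemoryStorage {
        fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
            // "Persisting" just replaces the cached copy and counts the call.
            self.state = s.clone();
            self.persist_calls += 1;
            Ok(())
        }
    }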
+pub trait Storage: Deref { + /// Persist safekeeper state on disk and update internal state. fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; } @@ -54,23 +35,43 @@ pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: PathBuf, conf: SafeKeeperConf, - persist_control_file_seconds: Histogram, + + /// Last state persisted to disk. + state: SafeKeeperState, } impl FileStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> FileStorage { - let timeline_dir = conf.timeline_dir(zttid); - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); - FileStorage { + /// Initialize storage by loading state from disk. + pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + + let state = Self::load_control_file_conf(conf, ttid)?; + + Ok(FileStorage { timeline_dir, conf: conf.clone(), - persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS - .with_label_values(&[&tenant_id, &timeline_id]), - } + state, + }) } - // Check the magic/version in the on-disk data and deserialize it, if possible. + /// Create file storage for a new timeline, but don't persist it yet. + pub fn create_new( + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + state: SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + + let store = FileStorage { + timeline_dir, + conf: conf.clone(), + state, + }; + + Ok(store) + } + + /// Check the magic/version in the on-disk data and deserialize it, if possible. fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part let magic = buf.read_u32::()?; @@ -90,32 +91,20 @@ impl FileStorage { upgrade_control_file(buf, version) } - // Load control file for given zttid at path specified by conf. + /// Load control file for given ttid at path specified by conf. pub fn load_control_file_conf( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, - create: CreateControlFile, + ttid: &TenantTimelineId, ) -> Result { - let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); - Self::load_control_file(path, create) + let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); + Self::load_control_file(path) } /// Read in the control file. - /// If create=false and file doesn't exist, bails out. - pub fn load_control_file>( - control_file_path: P, - create: CreateControlFile, - ) -> Result { - info!( - "loading control file {}, create={:?}", - control_file_path.as_ref().display(), - create, - ); - + pub fn load_control_file>(control_file_path: P) -> Result { let mut control_file = OpenOptions::new() .read(true) .write(true) - .create(matches!(create, CreateControlFile::True)) .open(&control_file_path) .with_context(|| { format!( @@ -124,50 +113,49 @@ impl FileStorage { ) })?; - // Empty file is legit on 'create', don't try to deser from it. 
- let state = if control_file.metadata().unwrap().len() == 0 { - if let CreateControlFile::False = create { - bail!("control file is empty"); - } - SafeKeeperState::new() - } else { - let mut buf = Vec::new(); - control_file - .read_to_end(&mut buf) - .context("failed to read control file")?; + let mut buf = Vec::new(); + control_file + .read_to_end(&mut buf) + .context("failed to read control file")?; - let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); + let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); - let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = - buf[buf.len() - CHECKSUM_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); + let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = + buf[buf.len() - CHECKSUM_SIZE..].try_into()?; + let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); - ensure!( - calculated_checksum == expected_checksum, + ensure!( + calculated_checksum == expected_checksum, + format!( + "safekeeper control file checksum mismatch: expected {} got {}", + expected_checksum, calculated_checksum + ) + ); + + let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]) + .with_context(|| { format!( - "safekeeper control file checksum mismatch: expected {} got {}", - expected_checksum, calculated_checksum + "while reading control file {}", + control_file_path.as_ref().display(), ) - ); - - FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context( - || { - format!( - "while reading control file {}", - control_file_path.as_ref().display(), - ) - }, - )? - }; + })?; Ok(state) } } +impl Deref for FileStorage { + type Target = SafeKeeperState; + + fn deref(&self) -> &Self::Target { + &self.state + } +} + impl Storage for FileStorage { - // persists state durably to underlying storage - // for description see https://lwn.net/Articles/457667/ + /// persists state durably to underlying storage + /// for description see https://lwn.net/Articles/457667/ fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { - let _timer = &self.persist_control_file_seconds.start_timer(); + let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL); @@ -223,6 +211,9 @@ impl Storage for FileStorage { .and_then(|f| f.sync_all()) .context("failed to sync control file directory")?; } + + // update internal state + self.state = s.clone(); Ok(()) } } @@ -231,10 +222,10 @@ impl Storage for FileStorage { mod test { use super::FileStorage; use super::*; - use crate::{safekeeper::SafeKeeperState, SafeKeeperConf, ZTenantTimelineId}; + use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; use anyhow::Result; use std::fs; - use zenith_utils::lsn::Lsn; + use utils::{id::TenantTimelineId, lsn::Lsn}; fn stub_conf() -> SafeKeeperConf { let workdir = tempfile::tempdir().unwrap().into_path(); @@ -246,52 +237,57 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, - create: CreateControlFile, + ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( - FileStorage::new(zttid, conf), - FileStorage::load_control_file_conf(conf, zttid, create)?, + FileStorage::restore_new(ttid, conf)?, + 
FileStorage::load_control_file_conf(conf, ttid)?, )) } + fn create( + conf: &SafeKeeperConf, + ttid: &TenantTimelineId, + ) -> Result<(FileStorage, SafeKeeperState)> { + fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + let state = SafeKeeperState::empty(); + let storage = FileStorage::create_new(ttid, conf, state.clone())?; + Ok((storage, state)) + } + #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state"); // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False) - .expect("failed to read state"); - assert_eq!(state.wal_start_lsn, Lsn(42)); + let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state"); + assert_eq!(state.commit_lsn, Lsn(42)); } #[test] fn test_safekeeper_state_checksum_mismatch() { let conf = stub_conf(); - let zttid = ZTenantTimelineId::generate(); + let ttid = TenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state"); + // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); + let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid, CreateControlFile::False) { + match load_from_control_file(&conf, &ttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs new file mode 100644 index 0000000000..95cb96fae9 --- /dev/null +++ b/safekeeper/src/control_file_upgrade.rs @@ -0,0 +1,267 @@ +//! Code to deal with safekeeper control file upgrades +use crate::safekeeper::{ + AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, + TermSwitchEntry, +}; +use anyhow::{bail, Result}; +use pq_proto::SystemId; +use serde::{Deserialize, Serialize}; +use tracing::*; +use utils::{ + bin_ser::LeSer, + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +/// Persistent consensus state of the acceptor. +#[derive(Debug, Clone, Serialize, Deserialize)] +struct AcceptorStateV1 { + /// acceptor's last term it voted for (advanced in 1 phase) + term: Term, + /// acceptor's epoch (advanced, i.e. bumped to 'term' when VCL is reached). + epoch: Term, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct SafeKeeperStateV1 { + /// persistent acceptor state + acceptor_state: AcceptorStateV1, + /// information about server + server: ServerInfoV2, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. 
+ proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. + wal_start_lsn: Lsn, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfoV2 { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV2 { + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfoV2, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + pub proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + pub commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + pub truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. + pub wal_start_lsn: Lsn, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfoV3 { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV3 { + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfoV3, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + pub commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + pub truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. + pub wal_start_lsn: Lsn, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV4 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// First LSN not yet offloaded to s3. Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. 
+ pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + +pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { + // migrate to storing full term history + if version == 1 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?; + let ac = AcceptorState { + term: oldstate.acceptor_state.term, + term_history: TermHistory(vec![TermSwitchEntry { + term: oldstate.acceptor_state.epoch, + lsn: Lsn(0), + }]), + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: ac, + server: ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + backup_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers(vec![]), + }); + // migrate to hexing some ids + } else if version == 2 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + backup_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers(vec![]), + }); + // migrate to moving tenant_id/timeline_id to the top and adding some lsns + } else if version == 3 { + info!("reading safekeeper control file version {version}"); + let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + backup_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers(vec![]), + }); + // migrate to having timeline_start_lsn + } else if version == 4 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV4::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; 
+ return Ok(SafeKeeperState { + tenant_id: oldstate.tenant_id, + timeline_id: oldstate.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: Lsn(0), + local_start_lsn: Lsn(0), + commit_lsn: oldstate.commit_lsn, + backup_lsn: Lsn::INVALID, + peer_horizon_lsn: oldstate.peer_horizon_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers(vec![]), + }); + } else if version == 5 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.timeline_start_lsn != Lsn(0) { + return Ok(oldstate); + } + + // set special timeline_start_lsn because we don't know the real one + info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)"); + oldstate.timeline_start_lsn = Lsn(1); + oldstate.local_start_lsn = Lsn(1); + + return Ok(oldstate); + } else if version == 6 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.server.pg_version != 0 { + return Ok(oldstate); + } + + // set pg_version to the default v14 + info!("setting pg_version to 140005"); + oldstate.server.pg_version = 140005; + + return Ok(oldstate); + } + bail!("unsupported safekeeper control file version {}", version) +} diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs new file mode 100644 index 0000000000..a1e0bcbec0 --- /dev/null +++ b/safekeeper/src/handler.rs @@ -0,0 +1,198 @@ +//! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres +//! protocol commands. + +use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; +use crate::receive_wal::ReceiveWalConn; + +use crate::send_wal::ReplicationConn; + +use crate::{GlobalTimelines, SafeKeeperConf}; +use anyhow::{bail, Context, Result}; + +use postgres_ffi::PG_TLI; +use regex::Regex; + +use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; +use tracing::info; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, + postgres_backend::{self, PostgresBackend}, +}; + +/// Safekeeper handler of postgres commands +pub struct SafekeeperPostgresHandler { + pub conf: SafeKeeperConf, + /// assigned application name + pub appname: Option, + pub tenant_id: Option, + pub timeline_id: Option, + pub ttid: TenantTimelineId, +} + +/// Parsed Postgres command. +enum SafekeeperPostgresCommand { + StartWalPush, + StartReplication { start_lsn: Lsn }, + IdentifySystem, + JSONCtrl { cmd: AppendLogicalMessage }, +} + +fn parse_cmd(cmd: &str) -> Result { + if cmd.starts_with("START_WAL_PUSH") { + Ok(SafekeeperPostgresCommand::StartWalPush) + } else if cmd.starts_with("START_REPLICATION") { + let re = + Regex::new(r"START_REPLICATION(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); + let mut caps = re.captures_iter(cmd); + let start_lsn = caps + .next() + .map(|cap| cap[1].parse::()) + .context("failed to parse start LSN from START_REPLICATION command")??; + Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) + } else if cmd.starts_with("IDENTIFY_SYSTEM") { + Ok(SafekeeperPostgresCommand::IdentifySystem) + } else if cmd.starts_with("JSON_CTRL") { + let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; + Ok(SafekeeperPostgresCommand::JSONCtrl { + cmd: serde_json::from_str(cmd)?, + }) + } else { + bail!("unsupported command {}", cmd); + } +} + +impl postgres_backend::Handler for SafekeeperPostgresHandler { + // tenant_id and timeline_id are passed in connection string params + fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { + if let FeStartupPacket::StartupMessage { params, .. } = sm { + if let Some(options) = params.options_raw() { + for opt in options { + // FIXME `ztenantid` and `ztimelineid` left for compatibility during deploy, + // remove these after the PR gets deployed: + // https://github.com/neondatabase/neon/pull/2433#discussion_r970005064 + match opt.split_once('=') { + Some(("ztenantid", value)) | Some(("tenant_id", value)) => { + self.tenant_id = Some(value.parse()?); + } + Some(("ztimelineid", value)) | Some(("timeline_id", value)) => { + self.timeline_id = Some(value.parse()?); + } + _ => continue, + } + } + } + + if let Some(app_name) = params.get("application_name") { + self.appname = Some(app_name.to_owned()); + } + + Ok(()) + } else { + bail!("Safekeeper received unexpected initial message: {:?}", sm); + } + } + + fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { + let cmd = parse_cmd(query_string)?; + + info!( + "got query {:?} in timeline {:?}", + query_string, self.timeline_id + ); + + let tenant_id = self.tenant_id.context("tenantid is required")?; + let timeline_id = self.timeline_id.context("timelineid is required")?; + self.ttid = TenantTimelineId::new(tenant_id, timeline_id); + + match cmd { + SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb).run(self), + SafekeeperPostgresCommand::StartReplication { start_lsn } => { + ReplicationConn::new(pgb).run(self, pgb, start_lsn) + } + SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb), + SafekeeperPostgresCommand::JSONCtrl { ref cmd } => handle_json_ctrl(self, pgb, cmd), + } + .context(format!( + "Failed to process query for timeline {timeline_id}" + ))?; + + Ok(()) + } +} + +impl SafekeeperPostgresHandler { + pub fn new(conf: SafeKeeperConf) -> Self { + SafekeeperPostgresHandler { + conf, + appname: None, + tenant_id: None, + timeline_id: None, + ttid: TenantTimelineId::empty(), + } + } + + /// + /// Handle IDENTIFY_SYSTEM replication command + /// + fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { + let tli = GlobalTimelines::get(self.ttid)?; + + let lsn = if self.is_walproposer_recovery() { + // walproposer should get all local WAL until flush_lsn + tli.get_flush_lsn() + } else { + // other clients shouldn't get any uncommitted WAL + tli.get_state().0.commit_lsn + } + .to_string(); + + let sysid = tli.get_state().1.server.system_id.to_string(); + let lsn_bytes = lsn.as_bytes(); + let tli = PG_TLI.to_string(); + let tli_bytes = tli.as_bytes(); + let sysid_bytes = sysid.as_bytes(); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor { + name: b"systemid", + 
typoid: TEXT_OID, + typlen: -1, + ..Default::default() + }, + RowDescriptor { + name: b"timeline", + typoid: INT4_OID, + typlen: 4, + ..Default::default() + }, + RowDescriptor { + name: b"xlogpos", + typoid: TEXT_OID, + typlen: -1, + ..Default::default() + }, + RowDescriptor { + name: b"dbname", + typoid: TEXT_OID, + typlen: -1, + ..Default::default() + }, + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(sysid_bytes), + Some(tli_bytes), + Some(lsn_bytes), + None, + ]))? + .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; + Ok(()) + } + + /// Returns true if current connection is a replication connection, originating + /// from a walproposer recovery function. This connection gets a special handling: + /// safekeeper must stream all local WAL till the flush_lsn, whether committed or not. + pub fn is_walproposer_recovery(&self) -> bool { + self.appname == Some("wal_proposer_recovery".to_string()) + } +} diff --git a/walkeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs similarity index 57% rename from walkeeper/src/http/mod.rs rename to safekeeper/src/http/mod.rs index c82d1c0362..1831470007 100644 --- a/walkeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,2 +1,4 @@ pub mod routes; pub use routes::make_router; + +pub use safekeeper_api::models; diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml new file mode 100644 index 0000000000..da225f244b --- /dev/null +++ b/safekeeper/src/http/openapi_spec.yaml @@ -0,0 +1,365 @@ +openapi: "3.0.2" +info: + title: Safekeeper control API + version: "1.0" + + +servers: + - url: "http://localhost:7676" + + +paths: + /v1/status: + get: + tags: + - "Info" + summary: Get safekeeper status + description: "" + operationId: v1GetSafekeeperStatus + responses: + "200": + description: Safekeeper status + content: + application/json: + schema: + $ref: "#/components/schemas/SafekeeperStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + delete: + tags: + - "Tenant" + summary: Delete tenant and all its timelines + description: "Deletes tenant and returns a map of timelines that were deleted along with their statuses" + operationId: v1DeleteTenant + responses: + "200": + description: Tenant deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TenantDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Timeline" + summary: Register new timeline + description: "" + operationId: v1CreateTenantTimeline + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineCreateRequest" + responses: + "201": + description: Timeline created + # TODO: return timeline info? 
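            # Note: timeline_create_handler in routes.rs currently replies with
            # 200 and an empty JSON body rather than 201; keep the spec and the
            # handler in sync when this is settled.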
+ "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/tenant/{tenant_id}/timeline/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + get: + tags: + - "Timeline" + summary: Get timeline information and status + description: "" + operationId: v1GetTenantTimeline + responses: + "200": + description: Timeline status + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineStatus" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + delete: + tags: + - "Timeline" + summary: Delete timeline + description: "" + operationId: v1DeleteTenantTimeline + responses: + "200": + description: Timeline deleted + content: + application/json: + schema: + $ref: "#/components/schemas/TimelineDeleteResult" + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + + /v1/record_safekeeper_info/{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + + post: + tags: + - "Tests" + summary: Used only in tests to hand craft required data + description: "" + operationId: v1RecordSafekeeperInfo + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SkTimelineInfo" + responses: + "200": + description: Timeline info posted + # TODO: return timeline info? + "403": + $ref: "#/components/responses/ForbiddenError" + default: + $ref: "#/components/responses/GenericError" + + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + + + schemas: + + # + # Requests + # + + TimelineCreateRequest: + type: object + required: + - timeline_id + - peer_ids + properties: + timeline_id: + type: string + format: hex + peer_ids: + type: array + items: + type: integer + minimum: 0 + + SkTimelineInfo: + type: object + required: + - last_log_term + - flush_lsn + - commit_lsn + - backup_lsn + - remote_consistent_lsn + - peer_horizon_lsn + - safekeeper_connstr + properties: + last_log_term: + type: integer + minimum: 0 + flush_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + remote_consistent_lsn: + type: string + peer_horizon_lsn: + type: string + safekeeper_connstr: + type: string + + # + # Responses + # + + SafekeeperStatus: + type: object + required: + - id + properties: + id: + type: integer + minimum: 0 # kind of unsigned integer + + TimelineStatus: + type: object + required: + - timeline_id + - tenant_id + properties: + timeline_id: + type: string + format: hex + tenant_id: + type: string + format: hex + acceptor_state: + $ref: '#/components/schemas/AcceptorStateStatus' + flush_lsn: + type: string + timeline_start_lsn: + type: string + local_start_lsn: + type: string + commit_lsn: + type: string + backup_lsn: + type: string + peer_horizon_lsn: + type: string + remote_consistent_lsn: + type: string + + AcceptorStateStatus: + type: object + required: + - term + - epoch + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + epoch: + type: integer + minimum: 0 # kind of unsigned integer + term_history: + type: array + items: + $ref: '#/components/schemas/TermSwitchEntry' + + TermSwitchEntry: + 
type: object + required: + - term + - lsn + properties: + term: + type: integer + minimum: 0 # kind of unsigned integer + lsn: + type: string + + TimelineDeleteResult: + type: object + required: + - dir_existed + - was_active + properties: + dir_existed: + type: boolean + was_active: + type: boolean + + TenantDeleteResult: + type: object + additionalProperties: + $ref: "#/components/schemas/TimelineDeleteResult" + example: + 57fd1b39f23704a63423de0a8435d85c: + dir_existed: true + was_active: false + 67fd1b39f23704a63423gb8435d85c33: + dir_existed: false + was_active: false + + # + # Errors + # + + GenericErrorContent: + type: object + properties: + msg: + type: string + + responses: + + # + # Errors + # + + GenericError: + description: Generic error response + content: + application/json: + schema: + $ref: "#/components/schemas/GenericErrorContent" + + ForbiddenError: + description: Forbidden error response + content: + application/json: + schema: + type: object + required: + - msg + properties: + msg: + type: string + + +security: + - JWT: [] diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs new file mode 100644 index 0000000000..6efd09c7e2 --- /dev/null +++ b/safekeeper/src/http/routes.rs @@ -0,0 +1,279 @@ +use hyper::{Body, Request, Response, StatusCode, Uri}; + +use anyhow::Context; +use once_cell::sync::Lazy; +use postgres_ffi::WAL_SEGMENT_SIZE; +use serde::Serialize; +use serde::Serializer; +use std::collections::{HashMap, HashSet}; +use std::fmt::Display; +use std::sync::Arc; +use tokio::task::JoinError; + +use crate::safekeeper::ServerInfo; +use crate::safekeeper::Term; +use crate::safekeeper::TermHistory; + +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; +use etcd_broker::subscription_value::SkTimelineInfo; +use utils::{ + auth::JwtAuth, + http::{ + endpoint::{self, auth_middleware, check_permission}, + error::ApiError, + json::{json_request, json_response}, + request::{ensure_no_body, parse_request_param}, + RequestExt, RouterBuilder, + }, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use super::models::TimelineCreateRequest; + +#[derive(Debug, Serialize)] +struct SafekeeperStatus { + id: NodeId, +} + +/// Healthcheck handler. +async fn status_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let conf = get_conf(&request); + let status = SafekeeperStatus { id: conf.my_id }; + json_response(StatusCode::OK, status) +} + +fn get_conf(request: &Request) -> &SafeKeeperConf { + request + .data::>() + .expect("unknown state type") + .as_ref() +} + +/// Serialize through Display trait. +fn display_serialize(z: &F, s: S) -> Result +where + S: Serializer, + F: Display, +{ + s.serialize_str(&format!("{}", z)) +} + +/// Augment AcceptorState with epoch for convenience +#[derive(Debug, Serialize)] +struct AcceptorStateStatus { + term: Term, + epoch: Term, + term_history: TermHistory, +} + +/// Info about timeline on safekeeper ready for reporting. 
+#[derive(Debug, Serialize)] +struct TimelineStatus { + #[serde(serialize_with = "display_serialize")] + tenant_id: TenantId, + #[serde(serialize_with = "display_serialize")] + timeline_id: TimelineId, + acceptor_state: AcceptorStateStatus, + pg_info: ServerInfo, + #[serde(serialize_with = "display_serialize")] + flush_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + timeline_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + local_start_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + commit_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + backup_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + peer_horizon_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + remote_consistent_lsn: Lsn, +} + +/// Report info about timeline. +async fn timeline_status_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid) + // FIXME: Currently, the only errors from `GlobalTimelines::get` will be client errors + // because the provided timeline isn't there. However, the method can in theory change and + // fail from internal errors later. Remove this comment once it the method returns + // something other than `anyhow::Result`. + .map_err(ApiError::InternalServerError)?; + let (inmem, state) = tli.get_state(); + let flush_lsn = tli.get_flush_lsn(); + + let acc_state = AcceptorStateStatus { + term: state.acceptor_state.term, + epoch: state.acceptor_state.get_epoch(flush_lsn), + term_history: state.acceptor_state.term_history, + }; + + // Note: we report in memory values which can be lost. + let status = TimelineStatus { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + acceptor_state: acc_state, + pg_info: state.server, + flush_lsn, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: inmem.commit_lsn, + backup_lsn: inmem.backup_lsn, + peer_horizon_lsn: inmem.peer_horizon_lsn, + remote_consistent_lsn: inmem.remote_consistent_lsn, + }; + json_response(StatusCode::OK, status) +} + +async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + let ttid = TenantTimelineId { + tenant_id: request_data.tenant_id, + timeline_id: request_data.timeline_id, + }; + check_permission(&request, Some(ttid.tenant_id))?; + + let server_info = ServerInfo { + pg_version: request_data.pg_version, + system_id: request_data.system_id.unwrap_or(0), + wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32), + }; + let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| { + request_data + .commit_lsn + .segment_lsn(server_info.wal_seg_size as usize) + }); + tokio::task::spawn_blocking(move || { + GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn) + }) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))? + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +/// Deactivates the timeline and removes its data directory. 
+async fn timeline_delete_force_handler( + mut request: Request, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + ensure_no_body(&mut request).await?; + let resp = tokio::task::spawn_blocking(move || { + // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better + // error handling here when we're able to. + GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + json_response(StatusCode::OK, resp) +} + +/// Deactivates all timelines for the tenant and removes its data directory. +/// See `timeline_delete_force_handler`. +async fn tenant_delete_force_handler( + mut request: Request, +) -> Result, ApiError> { + let tenant_id = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + ensure_no_body(&mut request).await?; + let delete_info = tokio::task::spawn_blocking(move || { + // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; + // Using an `InternalServerError` should be fixed when the types support it + GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + .map_err(ApiError::InternalServerError) + }) + .await + .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??; + json_response( + StatusCode::OK, + delete_info + .iter() + .map(|(ttid, resp)| (format!("{}", ttid.timeline_id), *resp)) + .collect::>(), + ) +} + +/// Used only in tests to hand craft required data. +async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?; + + let tli = GlobalTimelines::get(ttid) + // `GlobalTimelines::get` returns an error when it can't find the timeline. + .with_context(|| { + format!( + "Couldn't get timeline {} for tenant {}", + ttid.timeline_id, ttid.tenant_id + ) + }) + .map_err(ApiError::NotFound)?; + tli.record_safekeeper_info(&safekeeper_info, NodeId(1)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +/// Safekeeper http router. +pub fn make_router( + conf: SafeKeeperConf, + auth: Option>, +) -> RouterBuilder { + let mut router = endpoint::make_router(); + if auth.is_some() { + router = router.middleware(auth_middleware(|request| { + #[allow(clippy::mutable_key_type)] + static ALLOWLIST_ROUTES: Lazy> = + Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect()); + if ALLOWLIST_ROUTES.contains(request.uri()) { + None + } else { + // Option> is always provided as data below, hence unwrap(). + request.data::>>().unwrap().as_deref() + } + })) + } + + // NB: on any changes do not forget to update the OpenAPI spec + // located nearby (/safekeeper/src/http/openapi_spec.yaml). 
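    // Note: the create route below is registered as POST /v1/tenant/timeline and
    // takes tenant_id from the request body, while the OpenAPI spec above describes
    // POST /v1/tenant/{tenant_id}/timeline, so the spec and the router are not yet
    // fully in sync.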
+ router + .data(Arc::new(conf)) + .data(auth) + .get("/v1/status", status_handler) + // Will be used in the future instead of implicit timeline creation + .post("/v1/tenant/timeline", timeline_create_handler) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_status_handler, + ) + .delete( + "/v1/tenant/:tenant_id/timeline/:timeline_id", + timeline_delete_force_handler, + ) + .delete("/v1/tenant/:tenant_id", tenant_delete_force_handler) + // for tests + .post( + "/v1/record_safekeeper_info/:tenant_id/:timeline_id", + record_safekeeper_info, + ) +} diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs new file mode 100644 index 0000000000..746b4461b7 --- /dev/null +++ b/safekeeper/src/json_ctrl.rs @@ -0,0 +1,175 @@ +//! +//! This module implements JSON_CTRL protocol, which allows exchange +//! JSON messages over psql for testing purposes. +//! +//! Currently supports AppendLogicalMessage, which is used for WAL +//! modifications in tests. +//! + +use std::sync::Arc; + +use anyhow::Result; +use bytes::Bytes; +use serde::{Deserialize, Serialize}; +use tracing::*; +use utils::id::TenantTimelineId; + +use crate::handler::SafekeeperPostgresHandler; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; +use crate::safekeeper::{ + AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, +}; +use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; +use crate::timeline::Timeline; +use crate::GlobalTimelines; +use postgres_ffi::encode_logical_message; +use postgres_ffi::WAL_SEGMENT_SIZE; +use pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; +use utils::{lsn::Lsn, postgres_backend::PostgresBackend}; + +#[derive(Serialize, Deserialize, Debug)] +pub struct AppendLogicalMessage { + // prefix and message to build LogicalMessage + lm_prefix: String, + lm_message: String, + + // if true, commit_lsn will match flush_lsn after append + set_commit_lsn: bool, + + // if true, ProposerElected will be sent before append + send_proposer_elected: bool, + + // fields from AppendRequestHeader + term: Term, + epoch_start_lsn: Lsn, + begin_lsn: Lsn, + truncate_lsn: Lsn, + pg_version: u32, +} + +#[derive(Serialize, Deserialize)] +struct AppendResult { + // safekeeper state after append + state: SafeKeeperState, + // info about new record in the WAL + inserted_wal: InsertedWAL, +} + +/// Handles command to craft logical message WAL record with given +/// content, and then append it with specified term and lsn. This +/// function is used to test safekeepers in different scenarios. +pub fn handle_json_ctrl( + spg: &SafekeeperPostgresHandler, + pgb: &mut PostgresBackend, + append_request: &AppendLogicalMessage, +) -> Result<()> { + info!("JSON_CTRL request: {:?}", append_request); + + // need to init safekeeper state before AppendRequest + let tli = prepare_safekeeper(spg.ttid, append_request.pg_version)?; + + // if send_proposer_elected is true, we need to update local history + if append_request.send_proposer_elected { + send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?; + } + + let inserted_wal = append_logical_message(&tli, append_request)?; + let response = AppendResult { + state: tli.get_state().1, + inserted_wal, + }; + let response_data = serde_json::to_vec(&response)?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { + name: b"json", + typoid: TEXT_OID, + typlen: -1, + ..Default::default() + }]))? 
+ .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? + .write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?; + Ok(()) +} + +/// Prepare safekeeper to process append requests without crashes, +/// by sending ProposerGreeting with default server.wal_seg_size. +fn prepare_safekeeper(ttid: TenantTimelineId, pg_version: u32) -> Result> { + GlobalTimelines::create( + ttid, + ServerInfo { + pg_version, + wal_seg_size: WAL_SEGMENT_SIZE as u32, + system_id: 0, + }, + Lsn::INVALID, + Lsn::INVALID, + ) +} + +fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> Result<()> { + // add new term to existing history + let history = tli.get_state().1.acceptor_state.term_history; + let history = history.up_to(lsn.checked_sub(1u64).unwrap()); + let mut history_entries = history.0; + history_entries.push(TermSwitchEntry { term, lsn }); + let history = TermHistory(history_entries); + + let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { + term, + start_streaming_at: lsn, + term_history: history, + timeline_start_lsn: lsn, + }); + + tli.process_msg(&proposer_elected_request)?; + Ok(()) +} + +#[derive(Serialize, Deserialize)] +struct InsertedWAL { + begin_lsn: Lsn, + end_lsn: Lsn, + append_response: AppendResponse, +} + +/// Extend local WAL with new LogicalMessage record. To do that, +/// create AppendRequest with new WAL and pass it to safekeeper. +fn append_logical_message(tli: &Arc, msg: &AppendLogicalMessage) -> Result { + let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); + let sk_state = tli.get_state().1; + + let begin_lsn = msg.begin_lsn; + let end_lsn = begin_lsn + wal_data.len() as u64; + + let commit_lsn = if msg.set_commit_lsn { + end_lsn + } else { + sk_state.commit_lsn + }; + + let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { + h: AppendRequestHeader { + term: msg.term, + epoch_start_lsn: begin_lsn, + begin_lsn, + end_lsn, + commit_lsn, + truncate_lsn: msg.truncate_lsn, + proposer_uuid: [0u8; 16], + }, + wal_data: Bytes::from(wal_data), + }); + + let response = tli.process_msg(&append_request)?; + + let append_response = match response { + Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, + _ => anyhow::bail!("not AppendResponse"), + }; + + Ok(InsertedWAL { + begin_lsn, + end_lsn, + append_response, + }) +} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs new file mode 100644 index 0000000000..395a29c9ed --- /dev/null +++ b/safekeeper/src/lib.rs @@ -0,0 +1,105 @@ +use defaults::{ + DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +// +use remote_storage::RemoteStorageConfig; +use std::path::PathBuf; +use std::time::Duration; +use url::Url; + +use utils::{ + id::{NodeId, TenantId, TenantTimelineId}, + logging::LogFormat, +}; + +pub mod broker; +pub mod control_file; +pub mod control_file_upgrade; +pub mod handler; +pub mod http; +pub mod json_ctrl; +pub mod metrics; +pub mod receive_wal; +pub mod remove_wal; +pub mod safekeeper; +pub mod send_wal; +pub mod timeline; +pub mod wal_backup; +pub mod wal_service; +pub mod wal_storage; + +mod timelines_global_map; +pub use timelines_global_map::GlobalTimelines; + +pub mod defaults { + use std::time::Duration; + + pub use safekeeper_api::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PG_LISTEN_PORT, + }; + + pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; + pub const DEFAULT_HEARTBEAT_TIMEOUT: Duration = 
Duration::from_secs(5); + pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); +} + +#[derive(Debug, Clone)] +pub struct SafeKeeperConf { + // Repository directory, relative to current working directory. + // Normally, the safekeeper changes the current working directory + // to the repository, and 'workdir' is always '.'. But we don't do + // that during unit testing, because the current directory is global + // to the process but different unit tests work on different + // data directories to avoid clashing with each other. + pub workdir: PathBuf, + + pub no_sync: bool, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub remote_storage: Option, + pub backup_runtime_threads: usize, + pub wal_backup_enabled: bool, + pub my_id: NodeId, + pub broker_endpoints: Vec, + pub broker_etcd_prefix: String, + pub auth_validation_public_key_path: Option, + pub heartbeat_timeout: Duration, + pub max_offloader_lag_bytes: u64, + pub log_format: LogFormat, +} + +impl SafeKeeperConf { + pub fn tenant_dir(&self, tenant_id: &TenantId) -> PathBuf { + self.workdir.join(tenant_id.to_string()) + } + + pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> PathBuf { + self.tenant_dir(&ttid.tenant_id) + .join(ttid.timeline_id.to_string()) + } +} + +impl Default for SafeKeeperConf { + fn default() -> Self { + SafeKeeperConf { + // Always set to './'. We will chdir into the directory specified on the + // command line, so that when the server is running, all paths are relative + // to that. + workdir: PathBuf::from("./"), + no_sync: false, + listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), + listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + remote_storage: None, + my_id: NodeId(0), + broker_endpoints: Vec::new(), + broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + wal_backup_enabled: true, + auth_validation_public_key_path: None, + heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT, + max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES, + log_format: LogFormat::Plain, + } + } +} diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs new file mode 100644 index 0000000000..d4d3d37737 --- /dev/null +++ b/safekeeper/src/metrics.rs @@ -0,0 +1,493 @@ +//! Global safekeeper mertics and per-timeline safekeeper metrics. + +use std::time::{Instant, SystemTime}; + +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use anyhow::Result; +use metrics::{ + core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + proto::MetricFamily, + Gauge, IntGaugeVec, +}; +use once_cell::sync::Lazy; +use postgres_ffi::XLogSegNo; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::{ + safekeeper::{SafeKeeperState, SafekeeperMemState}, + timeline::ReplicaState, + GlobalTimelines, +}; + +// Global metrics across all timelines. 
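For orientation, a small sketch of how the global histograms defined just below are used elsewhere in this patch: WalStorageMetrics observes write sizes and durations explicitly, and FileStorage::persist uses the scope-based timer. The wrapper function and the write callback here are hypothetical, only the metric calls are real:

use std::time::Instant;

fn record_one_wal_write(
    buf: &[u8],
    do_write: impl FnOnce(&[u8]) -> anyhow::Result<()>,
) -> anyhow::Result<()> {
    let start = Instant::now();
    do_write(buf)?; // placeholder for the actual disk write
    WRITE_WAL_BYTES.observe(buf.len() as f64);
    WRITE_WAL_SECONDS.observe(start.elapsed().as_secs_f64());
    Ok(())
}

// Scope-based variant, as FileStorage::persist does for the control file:
//     let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // observes on drop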
+pub static WRITE_WAL_BYTES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_bytes", + "Bytes written to WAL in a single request", + vec![ + 1.0, + 10.0, + 100.0, + 1024.0, + 8192.0, + 128.0 * 1024.0, + 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0 + ] + ) + .expect("Failed to register safekeeper_write_wal_bytes histogram") +}); +pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_write_wal_seconds", + "Seconds spent writing and syncing WAL to a disk in a single request", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_write_wal_seconds histogram") +}); +pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_flush_wal_seconds", + "Seconds spent syncing WAL to a disk", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_flush_wal_seconds histogram") +}); +pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_persist_control_file_seconds", + "Seconds to persist and sync control file", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") +}); + +/// Metrics for WalStorage in a single timeline. +#[derive(Clone, Default)] +pub struct WalStorageMetrics { + /// How much bytes were written in total. + write_wal_bytes: u64, + /// How much time spent writing WAL to disk, waiting for write(2). + write_wal_seconds: f64, + /// How much time spent syncing WAL to disk, waiting for fsync(2). + flush_wal_seconds: f64, +} + +impl WalStorageMetrics { + pub fn observe_write_bytes(&mut self, bytes: usize) { + self.write_wal_bytes += bytes as u64; + WRITE_WAL_BYTES.observe(bytes as f64); + } + + pub fn observe_write_seconds(&mut self, seconds: f64) { + self.write_wal_seconds += seconds; + WRITE_WAL_SECONDS.observe(seconds); + } + + pub fn observe_flush_seconds(&mut self, seconds: f64) { + self.flush_wal_seconds += seconds; + FLUSH_WAL_SECONDS.observe(seconds); + } +} + +/// Accepts a closure that returns a result, and returns the duration of the closure. +pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result { + let start = std::time::Instant::now(); + closure()?; + Ok(start.elapsed().as_secs_f64()) +} + +/// Metrics for a single timeline. +pub struct FullTimelineInfo { + pub ttid: TenantTimelineId, + pub replicas: Vec, + pub wal_backup_active: bool, + pub timeline_is_active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + pub persisted_state: SafeKeeperState, + + pub flush_lsn: Lsn, + + pub wal_storage: WalStorageMetrics, +} + +/// Collects metrics for all active timelines. 
+pub struct TimelineCollector { + descs: Vec, + commit_lsn: GenericGaugeVec, + backup_lsn: GenericGaugeVec, + flush_lsn: GenericGaugeVec, + epoch_start_lsn: GenericGaugeVec, + peer_horizon_lsn: GenericGaugeVec, + remote_consistent_lsn: GenericGaugeVec, + feedback_ps_write_lsn: GenericGaugeVec, + feedback_last_time_seconds: GenericGaugeVec, + timeline_active: GenericGaugeVec, + wal_backup_active: GenericGaugeVec, + connected_computes: IntGaugeVec, + disk_usage: GenericGaugeVec, + acceptor_term: GenericGaugeVec, + written_wal_bytes: GenericGaugeVec, + written_wal_seconds: GaugeVec, + flushed_wal_seconds: GaugeVec, + collect_timeline_metrics: Gauge, + timelines_count: IntGauge, +} + +impl Default for TimelineCollector { + fn default() -> Self { + Self::new() + } +} + +impl TimelineCollector { + pub fn new() -> TimelineCollector { + let mut descs = Vec::new(); + + let commit_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_commit_lsn", + "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(commit_lsn.desc().into_iter().cloned()); + + let backup_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_backup_lsn", + "Current backup_lsn, up to which WAL is backed up, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(backup_lsn.desc().into_iter().cloned()); + + let flush_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_flush_lsn", + "Current flush_lsn, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flush_lsn.desc().into_iter().cloned()); + + let epoch_start_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_epoch_start_lsn", + "Point since which compute generates new WAL in the current consensus term", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(epoch_start_lsn.desc().into_iter().cloned()); + + let peer_horizon_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_peer_horizon_lsn", + "LSN of the most lagging safekeeper", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(peer_horizon_lsn.desc().into_iter().cloned()); + + let remote_consistent_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_remote_consistent_lsn", + "LSN which is persisted to the remote storage in pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); + + let feedback_ps_write_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_ps_write_lsn", + "Last LSN received by the pageserver, acknowledged in the feedback", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + + let feedback_last_time_seconds = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_last_time_seconds", + "Timestamp of the last feedback from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + + let timeline_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_timeline_active", + "Reports 1 for active timelines, 0 for inactive", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(timeline_active.desc().into_iter().cloned()); + + let wal_backup_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_wal_backup_active", + "Reports 1 for timelines with active WAL backup, 0 otherwise", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + 
descs.extend(wal_backup_active.desc().into_iter().cloned()); + + let connected_computes = IntGaugeVec::new( + Opts::new( + "safekeeper_connected_computes", + "Number of active compute connections", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(connected_computes.desc().into_iter().cloned()); + + let disk_usage = GenericGaugeVec::new( + Opts::new( + "safekeeper_disk_usage_bytes", + "Estimated disk space used to store WAL segments", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(disk_usage.desc().into_iter().cloned()); + + let acceptor_term = GenericGaugeVec::new( + Opts::new("safekeeper_acceptor_term", "Current consensus term"), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(acceptor_term.desc().into_iter().cloned()); + + let written_wal_bytes = GenericGaugeVec::new( + Opts::new( + "safekeeper_written_wal_bytes_total", + "Number of WAL bytes written to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_bytes.desc().into_iter().cloned()); + + let written_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_written_wal_seconds_total", + "Total time spent in write(2) writing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(written_wal_seconds.desc().into_iter().cloned()); + + let flushed_wal_seconds = GaugeVec::new( + Opts::new( + "safekeeper_flushed_wal_seconds_total", + "Total time spent in fsync(2) flushing WAL to disk, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flushed_wal_seconds.desc().into_iter().cloned()); + + let collect_timeline_metrics = Gauge::new( + "safekeeper_collect_timeline_metrics_seconds", + "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", + ) + .unwrap(); + descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + + let timelines_count = IntGauge::new( + "safekeeper_timelines", + "Total number of timelines loaded in-memory", + ) + .unwrap(); + descs.extend(timelines_count.desc().into_iter().cloned()); + + TimelineCollector { + descs, + commit_lsn, + backup_lsn, + flush_lsn, + epoch_start_lsn, + peer_horizon_lsn, + remote_consistent_lsn, + feedback_ps_write_lsn, + feedback_last_time_seconds, + timeline_active, + wal_backup_active, + connected_computes, + disk_usage, + acceptor_term, + written_wal_bytes, + written_wal_seconds, + flushed_wal_seconds, + collect_timeline_metrics, + timelines_count, + } + } +} + +impl Collector for TimelineCollector { + fn desc(&self) -> Vec<&Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let start_collecting = Instant::now(); + + // reset all metrics to clean up inactive timelines + self.commit_lsn.reset(); + self.backup_lsn.reset(); + self.flush_lsn.reset(); + self.epoch_start_lsn.reset(); + self.peer_horizon_lsn.reset(); + self.remote_consistent_lsn.reset(); + self.feedback_ps_write_lsn.reset(); + self.feedback_last_time_seconds.reset(); + self.timeline_active.reset(); + self.wal_backup_active.reset(); + self.connected_computes.reset(); + self.disk_usage.reset(); + self.acceptor_term.reset(); + self.written_wal_bytes.reset(); + self.written_wal_seconds.reset(); + self.flushed_wal_seconds.reset(); + + let timelines = GlobalTimelines::get_all(); + let timelines_count = timelines.len(); + + for arc_tli in timelines { + let tli = arc_tli.info_for_metrics(); + if tli.is_none() { + continue; + } + let tli = tli.unwrap(); + + let tenant_id = 
tli.ttid.tenant_id.to_string(); + let timeline_id = tli.ttid.timeline_id.to_string(); + let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + + let mut most_advanced: Option = None; + for replica in tli.replicas.iter() { + if let Some(replica_feedback) = replica.pageserver_feedback { + if let Some(current) = most_advanced { + if current.ps_writelsn < replica_feedback.ps_writelsn { + most_advanced = Some(replica_feedback); + } + } else { + most_advanced = Some(replica_feedback); + } + } + } + + self.commit_lsn + .with_label_values(labels) + .set(tli.mem_state.commit_lsn.into()); + self.backup_lsn + .with_label_values(labels) + .set(tli.mem_state.backup_lsn.into()); + self.flush_lsn + .with_label_values(labels) + .set(tli.flush_lsn.into()); + self.epoch_start_lsn + .with_label_values(labels) + .set(tli.epoch_start_lsn.into()); + self.peer_horizon_lsn + .with_label_values(labels) + .set(tli.mem_state.peer_horizon_lsn.into()); + self.remote_consistent_lsn + .with_label_values(labels) + .set(tli.mem_state.remote_consistent_lsn.into()); + self.timeline_active + .with_label_values(labels) + .set(tli.timeline_is_active as u64); + self.wal_backup_active + .with_label_values(labels) + .set(tli.wal_backup_active as u64); + self.connected_computes + .with_label_values(labels) + .set(tli.num_computes as i64); + self.acceptor_term + .with_label_values(labels) + .set(tli.persisted_state.acceptor_state.term as u64); + self.written_wal_bytes + .with_label_values(labels) + .set(tli.wal_storage.write_wal_bytes); + self.written_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.write_wal_seconds); + self.flushed_wal_seconds + .with_label_values(labels) + .set(tli.wal_storage.flush_wal_seconds); + + if let Some(feedback) = most_advanced { + self.feedback_ps_write_lsn + .with_label_values(labels) + .set(feedback.ps_writelsn); + if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds + .with_label_values(labels) + .set(unix_time.as_secs()); + } + } + + if tli.last_removed_segno != 0 { + let segno_count = tli + .flush_lsn + .segment_number(tli.persisted_state.server.wal_seg_size as usize) + - tli.last_removed_segno; + let disk_usage_bytes = segno_count * tli.persisted_state.server.wal_seg_size as u64; + self.disk_usage + .with_label_values(labels) + .set(disk_usage_bytes); + } + } + + // collect MetricFamilys. 
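        // Note: every Prometheus scrape calls collect(), which is why all the
        // gauge vectors are reset above and recomputed from GlobalTimelines on
        // each call; label sets of timelines that disappeared are dropped that way.
        // The collector itself is assumed to be registered once at startup with
        // the registry that backs the metrics endpoint, e.g. with the plain
        // prometheus crate roughly:
        //     prometheus::default_registry().register(Box::new(TimelineCollector::new()))
        // (the actual wiring is outside this file).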
+ let mut mfs = Vec::new(); + mfs.extend(self.commit_lsn.collect()); + mfs.extend(self.backup_lsn.collect()); + mfs.extend(self.flush_lsn.collect()); + mfs.extend(self.epoch_start_lsn.collect()); + mfs.extend(self.peer_horizon_lsn.collect()); + mfs.extend(self.remote_consistent_lsn.collect()); + mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.timeline_active.collect()); + mfs.extend(self.wal_backup_active.collect()); + mfs.extend(self.connected_computes.collect()); + mfs.extend(self.disk_usage.collect()); + mfs.extend(self.acceptor_term.collect()); + mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.written_wal_seconds.collect()); + mfs.extend(self.flushed_wal_seconds.collect()); + + // report time it took to collect all info + let elapsed = start_collecting.elapsed().as_secs_f64(); + self.collect_timeline_metrics.set(elapsed); + mfs.extend(self.collect_timeline_metrics.collect()); + + // report total number of timelines + self.timelines_count.set(timelines_count as i64); + mfs.extend(self.timelines_count.collect()); + + mfs + } +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs new file mode 100644 index 0000000000..6577e8c4d6 --- /dev/null +++ b/safekeeper/src/receive_wal.rs @@ -0,0 +1,202 @@ +//! Safekeeper communication endpoint to WAL proposer (compute node). +//! Gets messages from the network, passes them down to consensus module and +//! sends replies back. + +use anyhow::{anyhow, bail, Result}; + +use bytes::BytesMut; +use tracing::*; +use utils::lsn::Lsn; + +use crate::safekeeper::ServerInfo; +use crate::timeline::Timeline; +use crate::GlobalTimelines; + +use std::net::SocketAddr; +use std::sync::mpsc::channel; +use std::sync::mpsc::Receiver; + +use std::sync::Arc; +use std::thread; + +use crate::safekeeper::AcceptorProposerMessage; +use crate::safekeeper::ProposerAcceptorMessage; + +use crate::handler::SafekeeperPostgresHandler; +use pq_proto::{BeMessage, FeMessage}; +use utils::{postgres_backend::PostgresBackend, sock_split::ReadStream}; + +pub struct ReceiveWalConn<'pg> { + /// Postgres connection + pg_backend: &'pg mut PostgresBackend, + /// The cached result of `pg_backend.socket().peer_addr()` (roughly) + peer_addr: SocketAddr, +} + +impl<'pg> ReceiveWalConn<'pg> { + pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> { + let peer_addr = *pg.get_peer_addr(); + ReceiveWalConn { + pg_backend: pg, + peer_addr, + } + } + + // Send message to the postgres + fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> { + let mut buf = BytesMut::with_capacity(128); + msg.serialize(&mut buf)?; + self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; + Ok(()) + } + + /// Receive WAL from wal_proposer + pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { + let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); + + // Notify the libpq client that it's allowed to send `CopyData` messages + self.pg_backend + .write_message(&BeMessage::CopyBothResponse)?; + + let r = self + .pg_backend + .take_stream_in() + .ok_or_else(|| anyhow!("failed to take read stream from pgbackend"))?; + let mut poll_reader = ProposerPollStream::new(r)?; + + // Receive information about server + let next_msg = poll_reader.recv_msg()?; + let tli = match next_msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + info!( + "start handshake with wal proposer {} sysid {} timeline {}", + self.peer_addr, 
greeting.system_id, greeting.tli, + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + GlobalTimelines::create(spg.ttid, server_info, Lsn::INVALID, Lsn::INVALID)? + } + _ => bail!("unexpected message {:?} instead of greeting", next_msg), + }; + + let mut next_msg = Some(next_msg); + + let mut first_time_through = true; + let mut _guard: Option = None; + loop { + if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { + // poll AppendRequest's without blocking and write WAL to disk without flushing, + // while it's readily available + while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg { + let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + + let reply = tli.process_msg(&msg)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } + + next_msg = poll_reader.poll_msg(); + } + + // flush all written WAL to the disk + let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } + } else if let Some(msg) = next_msg.take() { + // process other message + let reply = tli.process_msg(&msg)?; + if let Some(reply) = reply { + self.write_msg(&reply)?; + } + } + if first_time_through { + // Register the connection and defer unregister. Do that only + // after processing first message, as it sets wal_seg_size, + // wanted by many. + tli.on_compute_connect()?; + _guard = Some(ComputeConnectionGuard { + timeline: Arc::clone(&tli), + }); + first_time_through = false; + } + + // blocking wait for the next message + if next_msg.is_none() { + next_msg = Some(poll_reader.recv_msg()?); + } + } + } +} + +struct ProposerPollStream { + msg_rx: Receiver, + read_thread: Option>>, +} + +impl ProposerPollStream { + fn new(mut r: ReadStream) -> Result { + let (msg_tx, msg_rx) = channel(); + + let read_thread = thread::Builder::new() + .name("Read WAL thread".into()) + .spawn(move || -> Result<()> { + loop { + let copy_data = match FeMessage::read(&mut r)? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), + None => bail!("connection closed unexpectedly"), + }; + + let msg = ProposerAcceptorMessage::parse(copy_data)?; + msg_tx.send(msg)?; + } + // msg_tx will be dropped here, this will also close msg_rx + })?; + + Ok(Self { + msg_rx, + read_thread: Some(read_thread), + }) + } + + fn recv_msg(&mut self) -> Result { + self.msg_rx.recv().map_err(|_| { + // return error from the read thread + let res = match self.read_thread.take() { + Some(thread) => thread.join(), + None => return anyhow!("read thread is gone"), + }; + + match res { + Ok(Ok(())) => anyhow!("unexpected result from read thread"), + Err(err) => anyhow!("read thread panicked: {:?}", err), + Ok(Err(err)) => err, + } + }) + } + + fn poll_msg(&mut self) -> Option { + let res = self.msg_rx.try_recv(); + + match res { + Err(_) => None, + Ok(msg) => Some(msg), + } + } +} + +struct ComputeConnectionGuard { + timeline: Arc, +} + +impl Drop for ComputeConnectionGuard { + fn drop(&mut self) { + if let Err(e) = self.timeline.on_compute_disconnect() { + error!("failed to unregister compute connection: {}", e); + } + } +} diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs new file mode 100644 index 0000000000..b6d497f34e --- /dev/null +++ b/safekeeper/src/remove_wal.rs @@ -0,0 +1,26 @@ +//! Thread removing old WAL. 
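// Editorial sketch (not part of this patch), referring to ReceiveWalConn::run above: append
// requests are drained without blocking and written without an fsync per message, and one
// flush then covers the whole batch. The same shape with a plain std channel and two closures
// (names here are illustrative only):
fn drain_then_flush<T>(
    rx: &std::sync::mpsc::Receiver<T>,
    mut write_unflushed: impl FnMut(T),
    mut flush: impl FnMut(),
) {
    while let Ok(msg) = rx.try_recv() {
        write_unflushed(msg); // cheap append, no fsync yet
    }
    flush(); // a single flush acknowledges everything written in this batch
}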
+ +use std::{thread, time::Duration}; + +use tracing::*; + +use crate::{GlobalTimelines, SafeKeeperConf}; + +pub fn thread_main(conf: SafeKeeperConf) { + let wal_removal_interval = Duration::from_millis(5000); + loop { + let tlis = GlobalTimelines::get_all(); + for tli in &tlis { + if !tli.is_active() { + continue; + } + let ttid = tli.ttid; + let _enter = + info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered(); + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { + warn!("failed to remove WAL: {}", e); + } + } + thread::sleep(wal_removal_interval) + } +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs new file mode 100644 index 0000000000..7dfa6f636e --- /dev/null +++ b/safekeeper/src/safekeeper.rs @@ -0,0 +1,1107 @@ +//! Acceptor part of proposer-acceptor consensus algorithm. + +use anyhow::{bail, Context, Result}; +use byteorder::{LittleEndian, ReadBytesExt}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; + +use etcd_broker::subscription_value::SkTimelineInfo; +use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use serde::{Deserialize, Serialize}; +use std::cmp::max; +use std::cmp::min; +use std::fmt; +use std::io::Read; + +use tracing::*; + +use crate::control_file; +use crate::send_wal::HotStandbyFeedback; + +use crate::wal_storage; +use pq_proto::{ReplicationFeedback, SystemId}; +use utils::{ + bin_ser::LeSer, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +pub const SK_MAGIC: u32 = 0xcafeceefu32; +pub const SK_FORMAT_VERSION: u32 = 7; +const SK_PROTOCOL_VERSION: u32 = 2; +pub const UNKNOWN_SERVER_VERSION: u32 = 0; + +/// Consensus logical timestamp. +pub type Term = u64; +const INVALID_TERM: Term = 0; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct TermSwitchEntry { + pub term: Term, + pub lsn: Lsn, +} +#[derive(Clone, Serialize, Deserialize)] +pub struct TermHistory(pub Vec); + +impl TermHistory { + pub fn empty() -> TermHistory { + TermHistory(Vec::new()) + } + + // Parse TermHistory as n_entries followed by TermSwitchEntry pairs + pub fn from_bytes(bytes: &mut Bytes) -> Result { + if bytes.remaining() < 4 { + bail!("TermHistory misses len"); + } + let n_entries = bytes.get_u32_le(); + let mut res = Vec::with_capacity(n_entries as usize); + for _ in 0..n_entries { + if bytes.remaining() < 16 { + bail!("TermHistory is incomplete"); + } + res.push(TermSwitchEntry { + term: bytes.get_u64_le(), + lsn: bytes.get_u64_le().into(), + }) + } + Ok(TermHistory(res)) + } + + /// Return copy of self with switches happening strictly after up_to + /// truncated. + pub fn up_to(&self, up_to: Lsn) -> TermHistory { + let mut res = Vec::with_capacity(self.0.len()); + for e in &self.0 { + if e.lsn > up_to { + break; + } + res.push(*e); + } + TermHistory(res) + } +} + +/// Display only latest entries for Debug. +impl fmt::Debug for TermHistory { + fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { + let n_printed = 20; + write!( + fmt, + "{}{:?}", + if self.0.len() > n_printed { "... " } else { "" }, + self.0 + .iter() + .rev() + .take(n_printed) + .map(|&e| (e.term, e.lsn)) // omit TermSwitchEntry + .collect::>() + ) + } +} + +/// Unique id of proposer. Not needed for correctness, used for monitoring. +pub type PgUuid = [u8; 16]; + +/// Persistent consensus state of the acceptor. 
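// Editorial sketch (not part of this patch): TermHistory::up_to above keeps only the term
// switches at or before a given LSN, and the acceptor's "epoch" (see get_epoch below) is the
// term of the last surviving entry. Worked example with plain (term, lsn) tuples:
#[test]
fn epoch_from_term_history_sketch() {
    let history = [(1u64, 0x10u64), (2, 0x40), (3, 0x90)];
    let epoch_at = |flush_lsn: u64| {
        history
            .iter()
            .take_while(|&&(_, lsn)| lsn <= flush_lsn)
            .last()
            .map(|&(term, _)| term)
            .unwrap_or(0)
    };
    assert_eq!(epoch_at(0x50), 2); // last switch not past 0x50 is (2, 0x40)
    assert_eq!(epoch_at(0x90), 3); // a switch exactly at flush_lsn counts, only strictly-after is cut
    assert_eq!(epoch_at(0x05), 0); // no applicable history yet
}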
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AcceptorState { + /// acceptor's last term it voted for (advanced in 1 phase) + pub term: Term, + /// History of term switches for safekeeper's WAL. + /// Actually it often goes *beyond* WAL contents as we adopt term history + /// from the proposer before recovery. + pub term_history: TermHistory, +} + +impl AcceptorState { + /// acceptor's epoch is the term of the highest entry in the log + pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + let th = self.term_history.up_to(flush_lsn); + match th.0.last() { + Some(e) => e.term, + None => 0, + } + } +} + +/// Information about Postgres. Safekeeper gets it once and then verifies +/// all further connections from computes match. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfo { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + backup_lsn: Lsn, + /// Term of the last entry. + term: Term, + /// LSN of the last record. + flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: INVALID_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +/// Persistent information stored on safekeeper node +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperState { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. 
(Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + +#[derive(Debug, Clone)] +// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values +// are not flushed yet. +pub struct SafekeeperMemState { + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub proposer_uuid: PgUuid, +} + +impl SafeKeeperState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + commit_lsn: Lsn, + local_start_lsn: Lsn, + ) -> SafeKeeperState { + SafeKeeperState { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + acceptor_state: AcceptorState { + term: 0, + term_history: TermHistory::empty(), + }, + server: server_info, + proposer_uuid: [0; 16], + timeline_start_lsn: Lsn(0), + local_start_lsn, + commit_lsn, + backup_lsn: local_start_lsn, + peer_horizon_lsn: local_start_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers( + peers + .iter() + .map(|p| (*p, PersistedPeerInfo::new())) + .collect(), + ), + } + } + + #[cfg(test)] + pub fn empty() -> Self { + SafeKeeperState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + Lsn::INVALID, + Lsn::INVALID, + ) + } +} + +// protocol messages + +/// Initial Proposer -> Acceptor message +#[derive(Debug, Deserialize)] +pub struct ProposerGreeting { + /// proposer-acceptor protocol version + pub protocol_version: u32, + /// Postgres server version + pub pg_version: u32, + pub proposer_id: PgUuid, + pub system_id: SystemId, + pub timeline_id: TimelineId, + pub tenant_id: TenantId, + pub tli: TimeLineID, + pub wal_seg_size: u32, +} + +/// Acceptor -> Proposer initial response: the highest term known to me +/// (acceptor voted for). +#[derive(Debug, Serialize)] +pub struct AcceptorGreeting { + term: u64, + node_id: NodeId, +} + +/// Vote request sent from proposer to safekeepers +#[derive(Debug, Deserialize)] +pub struct VoteRequest { + term: Term, +} + +/// Vote itself, sent from safekeeper to proposer +#[derive(Debug, Serialize)] +pub struct VoteResponse { + term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + vote_given: u64, // fixme u64 due to padding + // Safekeeper flush_lsn (end of WAL) + history of term switches allow + // proposer to choose the most advanced one. + flush_lsn: Lsn, + truncate_lsn: Lsn, + term_history: TermHistory, + timeline_start_lsn: Lsn, +} + +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * term history to it. + */ +#[derive(Debug)] +pub struct ProposerElected { + pub term: Term, + pub start_streaming_at: Lsn, + pub term_history: TermHistory, + pub timeline_start_lsn: Lsn, +} + +/// Request with WAL message sent from proposer to safekeeper. Along the way it +/// communicates commit_lsn. +#[derive(Debug)] +pub struct AppendRequest { + pub h: AppendRequestHeader, + pub wal_data: Bytes, +} +#[derive(Debug, Clone, Deserialize)] +pub struct AppendRequestHeader { + // safekeeper's current term; if it is higher than proposer's, the compute is out of date. + pub term: Term, + // LSN since the proposer appends WAL; determines epoch switch point. 
+ pub epoch_start_lsn: Lsn, + /// start position of message in WAL + pub begin_lsn: Lsn, + /// end position of message in WAL + pub end_lsn: Lsn, + /// LSN committed by quorum of safekeepers + pub commit_lsn: Lsn, + /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper + pub truncate_lsn: Lsn, + // only for logging/debugging + pub proposer_uuid: PgUuid, +} + +/// Report safekeeper state to proposer +#[derive(Debug, Serialize, Deserialize)] +pub struct AppendResponse { + // Current term of the safekeeper; if it is higher than proposer's, the + // compute is out of date. + pub term: Term, + // NOTE: this is physical end of wal on safekeeper; currently it doesn't + // make much sense without taking epoch into account, as history can be + // diverged. + pub flush_lsn: Lsn, + // We report back our awareness about which WAL is committed, as this is + // a criterion for walproposer --sync mode exit + pub commit_lsn: Lsn, + pub hs_feedback: HotStandbyFeedback, + pub pageserver_feedback: ReplicationFeedback, +} + +impl AppendResponse { + fn term_only(term: Term) -> AppendResponse { + AppendResponse { + term, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + hs_feedback: HotStandbyFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), + } + } +} + +/// Proposer -> Acceptor messages +#[derive(Debug)] +pub enum ProposerAcceptorMessage { + Greeting(ProposerGreeting), + VoteRequest(VoteRequest), + Elected(ProposerElected), + AppendRequest(AppendRequest), + NoFlushAppendRequest(AppendRequest), + FlushWAL, +} + +impl ProposerAcceptorMessage { + /// Parse proposer message. + pub fn parse(msg_bytes: Bytes) -> Result { + // xxx using Reader is inefficient but easy to work with bincode + let mut stream = msg_bytes.reader(); + // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is + let tag = stream.read_u64::()? as u8 as char; + match tag { + 'g' => { + let msg = ProposerGreeting::des_from(&mut stream)?; + Ok(ProposerAcceptorMessage::Greeting(msg)) + } + 'v' => { + let msg = VoteRequest::des_from(&mut stream)?; + Ok(ProposerAcceptorMessage::VoteRequest(msg)) + } + 'e' => { + let mut msg_bytes = stream.into_inner(); + if msg_bytes.remaining() < 16 { + bail!("ProposerElected message is not complete"); + } + let term = msg_bytes.get_u64_le(); + let start_streaming_at = msg_bytes.get_u64_le().into(); + let term_history = TermHistory::from_bytes(&mut msg_bytes)?; + if msg_bytes.remaining() < 8 { + bail!("ProposerElected message is not complete"); + } + let timeline_start_lsn = msg_bytes.get_u64_le().into(); + let msg = ProposerElected { + term, + start_streaming_at, + timeline_start_lsn, + term_history, + }; + Ok(ProposerAcceptorMessage::Elected(msg)) + } + 'a' => { + // read header followed by wal data + let hdr = AppendRequestHeader::des_from(&mut stream)?; + let rec_size = hdr + .end_lsn + .checked_sub(hdr.begin_lsn) + .context("begin_lsn > end_lsn in AppendRequest")? 
+ .0 as usize; + if rec_size > MAX_SEND_SIZE { + bail!( + "AppendRequest is longer than MAX_SEND_SIZE ({})", + MAX_SEND_SIZE + ); + } + + let mut wal_data_vec: Vec = vec![0; rec_size]; + stream.read_exact(&mut wal_data_vec)?; + let wal_data = Bytes::from(wal_data_vec); + let msg = AppendRequest { h: hdr, wal_data }; + + Ok(ProposerAcceptorMessage::AppendRequest(msg)) + } + _ => bail!("unknown proposer-acceptor message tag: {}", tag,), + } + } +} + +/// Acceptor -> Proposer messages +#[derive(Debug)] +pub enum AcceptorProposerMessage { + Greeting(AcceptorGreeting), + VoteResponse(VoteResponse), + AppendResponse(AppendResponse), +} + +impl AcceptorProposerMessage { + /// Serialize acceptor -> proposer message. + pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { + match self { + AcceptorProposerMessage::Greeting(msg) => { + buf.put_u64_le('g' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.node_id.0); + } + AcceptorProposerMessage::VoteResponse(msg) => { + buf.put_u64_le('v' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.vote_given); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.truncate_lsn.into()); + buf.put_u32_le(msg.term_history.0.len() as u32); + for e in &msg.term_history.0 { + buf.put_u64_le(e.term); + buf.put_u64_le(e.lsn.into()); + } + buf.put_u64_le(msg.timeline_start_lsn.into()); + } + AcceptorProposerMessage::AppendResponse(msg) => { + buf.put_u64_le('a' as u64); + buf.put_u64_le(msg.term); + buf.put_u64_le(msg.flush_lsn.into()); + buf.put_u64_le(msg.commit_lsn.into()); + buf.put_i64_le(msg.hs_feedback.ts); + buf.put_u64_le(msg.hs_feedback.xmin); + buf.put_u64_le(msg.hs_feedback.catalog_xmin); + + msg.pageserver_feedback.serialize(buf)? + } + } + + Ok(()) + } +} + +/// Safekeeper implements consensus to reliably persist WAL across nodes. +/// It controls all WAL disk writes and updates of control file. +/// +/// Currently safekeeper processes: +/// - messages from compute (proposers) and provides replies +/// - messages from broker peers +pub struct SafeKeeper { + /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. + /// Note: be careful to set only if we are sure our WAL (term history) matches + /// committed one. + pub global_commit_lsn: Lsn, + /// LSN since the proposer safekeeper currently talking to appends WAL; + /// determines epoch switch point. + pub epoch_start_lsn: Lsn, + + pub inmem: SafekeeperMemState, // in memory part + pub state: CTRL, // persistent state storage + + pub wal_store: WAL, + + node_id: NodeId, // safekeeper's node id +} + +impl SafeKeeper +where + CTRL: control_file::Storage, + WAL: wal_storage::Storage, +{ + /// Accepts a control file storage containing the safekeeper state. + /// State must be initialized, i.e. contain filled `tenant_id`, `timeline_id` + /// and `server` (`wal_seg_size` inside it) fields. 
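// Editorial sketch (not part of this patch): parse() and serialize() above frame each message
// with its tag character widened to a little-endian u64 (a padding artifact of packing the
// original C structs onto the wire). Round-tripping a tag with plain std:
#[test]
fn tag_roundtrip_sketch() {
    let on_wire = (b'g' as u64).to_le_bytes(); // how serialize() emits the greeting tag
    let decoded = u64::from_le_bytes(on_wire) as u8 as char; // how parse() reads it back
    assert_eq!(decoded, 'g');
}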
+ pub fn new(state: CTRL, wal_store: WAL, node_id: NodeId) -> Result> { + if state.tenant_id == TenantId::from([0u8; 16]) + || state.timeline_id == TimelineId::from([0u8; 16]) + { + bail!( + "Calling SafeKeeper::new with empty tenant_id ({}) or timeline_id ({})", + state.tenant_id, + state.timeline_id + ); + } + + Ok(SafeKeeper { + global_commit_lsn: state.commit_lsn, + epoch_start_lsn: Lsn(0), + inmem: SafekeeperMemState { + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + proposer_uuid: state.proposer_uuid, + }, + state, + wal_store, + node_id, + }) + } + + /// Get history of term switches for the available WAL + fn get_term_history(&self) -> TermHistory { + self.state + .acceptor_state + .term_history + .up_to(self.flush_lsn()) + } + + pub fn get_epoch(&self) -> Term { + self.state.acceptor_state.get_epoch(self.flush_lsn()) + } + + /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. + fn flush_lsn(&self) -> Lsn { + max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn) + } + + /// Process message from proposer and possibly form reply. Concurrent + /// callers must exclude each other. + pub fn process_msg( + &mut self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + match msg { + ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg), + ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg), + ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg), + ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true), + ProposerAcceptorMessage::NoFlushAppendRequest(msg) => { + self.handle_append_request(msg, false) + } + ProposerAcceptorMessage::FlushWAL => self.handle_flush(), + } + } + + /// Handle initial message from proposer: check its sanity and send my + /// current term. + fn handle_greeting( + &mut self, + msg: &ProposerGreeting, + ) -> Result> { + // Check protocol compatibility + if msg.protocol_version != SK_PROTOCOL_VERSION { + bail!( + "incompatible protocol version {}, expected {}", + msg.protocol_version, + SK_PROTOCOL_VERSION + ); + } + /* Postgres major version mismatch is treated as fatal error + * because safekeepers parse WAL headers and the format + * may change between versions. 
+ */ + if msg.pg_version / 10000 != self.state.server.pg_version / 10000 + && self.state.server.pg_version != UNKNOWN_SERVER_VERSION + { + bail!( + "incompatible server version {}, expected {}", + msg.pg_version, + self.state.server.pg_version + ); + } + + if msg.tenant_id != self.state.tenant_id { + bail!( + "invalid tenant ID, got {}, expected {}", + msg.tenant_id, + self.state.tenant_id + ); + } + if msg.timeline_id != self.state.timeline_id { + bail!( + "invalid timeline ID, got {}, expected {}", + msg.timeline_id, + self.state.timeline_id + ); + } + if self.state.server.wal_seg_size != msg.wal_seg_size { + bail!( + "invalid wal_seg_size, got {}, expected {}", + msg.wal_seg_size, + self.state.server.wal_seg_size + ); + } + + // system_id will be updated on mismatch + if self.state.server.system_id != msg.system_id { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + + let mut state = self.state.clone(); + state.server.system_id = msg.system_id; + if msg.pg_version != UNKNOWN_SERVER_VERSION { + state.server.pg_version = msg.pg_version; + } + self.state.persist(&state)?; + } + + info!( + "processed greeting from proposer {:?}, sending term {:?}", + msg.proposer_id, self.state.acceptor_state.term + ); + Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { + term: self.state.acceptor_state.term, + node_id: self.node_id, + }))) + } + + /// Give vote for the given term, if we haven't done that previously. + fn handle_vote_request( + &mut self, + msg: &VoteRequest, + ) -> Result> { + // Once voted, we won't accept data from older proposers; flush + // everything we've already received so that new proposer starts + // streaming at end of our WAL, without overlap. Currently we truncate + // WAL at streaming point, so this avoids truncating already committed + // WAL. + // + // TODO: it would be smoother to not truncate committed piece at + // handle_elected instead. Currently not a big deal, as proposer is the + // only source of WAL; with peer2peer recovery it would be more + // important. + self.wal_store.flush_wal()?; + // initialize with refusal + let mut resp = VoteResponse { + term: self.state.acceptor_state.term, + vote_given: false as u64, + flush_lsn: self.flush_lsn(), + truncate_lsn: self.state.peer_horizon_lsn, + term_history: self.get_term_history(), + timeline_start_lsn: self.state.timeline_start_lsn, + }; + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + // persist vote before sending it out + self.state.persist(&state)?; + + resp.term = self.state.acceptor_state.term; + resp.vote_given = true as u64; + } + info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); + Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) + } + + /// Form AppendResponse from current state. 
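// Editorial sketch (not part of this patch): handle_vote_request above persists the new term
// before the vote response leaves the node. A generalized version of that ordering invariant
// (illustrative names, any durable persist() callback assumed): if the vote were sent first and
// the process crashed before persisting, a restarted safekeeper could grant a second,
// conflicting vote for the same term.
fn vote_once(
    current_term: &mut u64,
    requested_term: u64,
    mut persist: impl FnMut(u64) -> std::io::Result<()>,
) -> std::io::Result<bool> {
    if *current_term >= requested_term {
        return Ok(false); // already voted in this or a newer term
    }
    persist(requested_term)?; // 1) make the vote durable
    *current_term = requested_term;
    Ok(true) // 2) only now is it safe to reply "vote granted"
}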
+ fn append_response(&self) -> AppendResponse { + let ar = AppendResponse { + term: self.state.acceptor_state.term, + flush_lsn: self.flush_lsn(), + commit_lsn: self.state.commit_lsn, + // will be filled by the upper code to avoid bothering safekeeper + hs_feedback: HotStandbyFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), + }; + trace!("formed AppendResponse {:?}", ar); + ar + } + + fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { + info!("received ProposerElected {:?}", msg); + if self.state.acceptor_state.term < msg.term { + let mut state = self.state.clone(); + state.acceptor_state.term = msg.term; + self.state.persist(&state)?; + } + + // If our term is higher, ignore the message (next feedback will inform the compute) + if self.state.acceptor_state.term > msg.term { + return Ok(None); + } + + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to + // intersection of our history and history from msg + + // truncate wal, update the LSNs + self.wal_store.truncate_wal(msg.start_streaming_at)?; + + // and now adopt term history from proposer + { + let mut state = self.state.clone(); + + // Here we learn initial LSN for the first time, set fields + // interested in that. + + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. + state.timeline_start_lsn = msg.timeline_start_lsn; + info!( + "setting timeline_start_lsn to {:?}", + state.timeline_start_lsn + ); + } + if state.local_start_lsn == Lsn(0) { + state.local_start_lsn = msg.start_streaming_at; + info!("setting local_start_lsn to {:?}", state.local_start_lsn); + } + // Initializing commit_lsn before acking first flushed record is + // important to let find_end_of_wal skip the hole in the beginning + // of the first segment. + // + // NB: on new clusters, this happens at the same time as + // timeline_start_lsn initialization, it is taken outside to provide + // upgrade. + self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); + self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + + // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. + self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); + // Initializing remote_consistent_lsn sets that we have nothing to + // stream to pageserver(s) immediately after creation. + self.inmem.remote_consistent_lsn = + max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn); + + state.acceptor_state.term_history = msg.term_history.clone(); + self.persist_control_file(state)?; + } + + info!("start receiving WAL since {:?}", msg.start_streaming_at); + + Ok(None) + } + + /// Advance commit_lsn taking into account what we have locally + fn update_commit_lsn(&mut self) -> Result<()> { + let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); + assert!(commit_lsn >= self.inmem.commit_lsn); + + self.inmem.commit_lsn = commit_lsn; + + // If new commit_lsn reached epoch switch, force sync of control + // file: walproposer in sync mode is very interested when this + // happens. Note: this is for sync-safekeepers mode only, as + // otherwise commit_lsn might jump over epoch_start_lsn. + // Also note that commit_lsn can reach epoch_start_lsn earlier + // that we receive new epoch_start_lsn, and we still need to sync + // control file in this case. 
+ if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { + self.persist_control_file(self.state.clone())?; + } + + Ok(()) + } + + /// Persist control file to disk, called only after timeline creation (bootstrap). + pub fn persist(&mut self) -> Result<()> { + self.persist_control_file(self.state.clone()) + } + + /// Persist in-memory state to the disk, taking other data from state. + fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { + state.commit_lsn = self.inmem.commit_lsn; + state.backup_lsn = self.inmem.backup_lsn; + state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; + state.proposer_uuid = self.inmem.proposer_uuid; + self.state.persist(&state) + } + + /// Handle request to append WAL. + #[allow(clippy::comparison_chain)] + fn handle_append_request( + &mut self, + msg: &AppendRequest, + require_flush: bool, + ) -> Result> { + if self.state.acceptor_state.term < msg.h.term { + bail!("got AppendRequest before ProposerElected"); + } + + // If our term is higher, immediately refuse the message. + if self.state.acceptor_state.term > msg.h.term { + let resp = AppendResponse::term_only(self.state.acceptor_state.term); + return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); + } + + // Now we know that we are in the same term as the proposer, + // processing the message. + + self.epoch_start_lsn = msg.h.epoch_start_lsn; + self.inmem.proposer_uuid = msg.h.proposer_uuid; + + // do the job + if !msg.wal_data.is_empty() { + self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; + } + + // flush wal to the disk, if required + if require_flush { + self.wal_store.flush_wal()?; + } + + // Update global_commit_lsn + if msg.h.commit_lsn != Lsn(0) { + // We also obtain commit lsn from peers, so value arrived here might be stale (less) + self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn); + } + + self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; + self.update_commit_lsn()?; + + // Update truncate and commit LSN in control file. + // To avoid negative impact on performance of extra fsync, do it only + // when truncate_lsn delta exceeds WAL segment size. + if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) + < self.inmem.peer_horizon_lsn + { + self.persist_control_file(self.state.clone())?; + } + + trace!( + "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + msg.wal_data.len(), + msg.h.end_lsn, + msg.h.commit_lsn, + msg.h.truncate_lsn, + require_flush, + ); + + // If flush_lsn hasn't updated, AppendResponse is not very useful. + if !require_flush { + return Ok(None); + } + + let resp = self.append_response(); + Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) + } + + /// Flush WAL to disk. Return AppendResponse with latest LSNs. + fn handle_flush(&mut self) -> Result> { + self.wal_store.flush_wal()?; + + // commit_lsn can be updated because we have new flushed data locally. + self.update_commit_lsn()?; + + Ok(Some(AcceptorProposerMessage::AppendResponse( + self.append_response(), + ))) + } + + /// Update timeline state with peer safekeeper data. 
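// Editorial sketch (not part of this patch): handle_append_request above (and
// record_safekeeper_info below) rewrite the control file only once the in-memory LSN has moved
// more than one WAL segment past the persisted value, trading a little durability lag for far
// fewer fsyncs. Standalone form of that check, with illustrative names:
fn control_file_update_due(persisted_lsn: u64, inmem_lsn: u64, wal_seg_size: u64) -> bool {
    persisted_lsn + wal_seg_size < inmem_lsn
}

#[test]
fn control_file_hysteresis_sketch() {
    let seg = 16 * 1024 * 1024u64;
    assert!(!control_file_update_due(0, seg, seg)); // advanced exactly one segment: not yet
    assert!(control_file_update_due(0, seg + 1, seg)); // strictly more than a segment: persist
}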
+ pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { + let mut sync_control_file = false; + if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) + { + // Note: the check is too restrictive, generally we can update local + // commit_lsn if our history matches (is part of) history of advanced + // commit_lsn provider. + if last_log_term == self.get_epoch() { + self.global_commit_lsn = max(commit_lsn, self.global_commit_lsn); + self.update_commit_lsn()?; + } + } + if let Some(backup_lsn) = sk_info.backup_lsn { + let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); + sync_control_file |= + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; + } + if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { + let new_remote_consistent_lsn = + max(remote_consistent_lsn, self.inmem.remote_consistent_lsn); + sync_control_file |= self.state.remote_consistent_lsn + + (self.state.server.wal_seg_size as u64) + < new_remote_consistent_lsn; + self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; + } + if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { + let new_peer_horizon_lsn = max(peer_horizon_lsn, self.inmem.peer_horizon_lsn); + sync_control_file |= self.state.peer_horizon_lsn + + (self.state.server.wal_seg_size as u64) + < new_peer_horizon_lsn; + self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; + } + if sync_control_file { + self.persist_control_file(self.state.clone())?; + } + Ok(()) + } + + /// Get oldest segno we still need to keep. We hold WAL till it is consumed + /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 + /// offloading. + /// While it is safe to use inmem values for determining horizon, + /// we use persistent to make possible normal states less surprising. 
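// Editorial sketch (not part of this patch): the horizon computed below is the oldest LSN still
// needed by anyone (pageserver via remote_consistent_lsn, peers via peer_horizon_lsn, and s3
// offloading via backup_lsn when enabled), expressed as a segment number so whole segments can
// be removed. Standalone form with illustrative names:
fn horizon_segno_sketch(
    remote_consistent_lsn: u64,
    peer_horizon_lsn: u64,
    backup_lsn: u64,
    wal_backup_enabled: bool,
    wal_seg_size: u64,
) -> u64 {
    let mut horizon = remote_consistent_lsn.min(peer_horizon_lsn);
    if wal_backup_enabled {
        horizon = horizon.min(backup_lsn);
    }
    horizon / wal_seg_size // counterpart of Lsn::segment_number()
}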
+ pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { + let mut horizon_lsn = min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, + ); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, self.state.backup_lsn); + } + horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) + } +} + +#[cfg(test)] +mod tests { + use postgres_ffi::WAL_SEGMENT_SIZE; + + use super::*; + use crate::wal_storage::Storage; + use std::ops::Deref; + + // fake storage for tests + struct InMemoryState { + persisted_state: SafeKeeperState, + } + + impl control_file::Storage for InMemoryState { + fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + self.persisted_state = s.clone(); + Ok(()) + } + } + + impl Deref for InMemoryState { + type Target = SafeKeeperState; + + fn deref(&self) -> &Self::Target { + &self.persisted_state + } + } + + fn test_sk_state() -> SafeKeeperState { + let mut state = SafeKeeperState::empty(); + state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; + state.tenant_id = TenantId::from([1u8; 16]); + state.timeline_id = TimelineId::from([1u8; 16]); + state + } + + struct DummyWalStore { + lsn: Lsn, + } + + impl wal_storage::Storage for DummyWalStore { + fn flush_lsn(&self) -> Lsn { + self.lsn + } + + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + self.lsn = startpos + buf.len() as u64; + Ok(()) + } + + fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + self.lsn = end_pos; + Ok(()) + } + + fn flush_wal(&mut self) -> Result<()> { + Ok(()) + } + + fn remove_up_to(&self) -> Box Result<()>> { + Box::new(move |_segno_up_to: XLogSegNo| Ok(())) + } + + fn get_metrics(&self) -> crate::metrics::WalStorageMetrics { + crate::metrics::WalStorageMetrics::default() + } + } + + #[test] + fn test_voting() { + let storage = InMemoryState { + persisted_state: test_sk_state(), + }; + let wal_store = DummyWalStore { lsn: Lsn(0) }; + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + + // check voting for 1 is ok + let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); + let mut vote_resp = sk.process_msg(&vote_request); + match vote_resp.unwrap() { + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), + r => panic!("unexpected response: {:?}", r), + } + + // reboot... 
+ let state = sk.state.persisted_state.clone(); + let storage = InMemoryState { + persisted_state: state, + }; + + sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap(); + + // and ensure voting second time for 1 is not ok + vote_resp = sk.process_msg(&vote_request); + match vote_resp.unwrap() { + Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), + r => panic!("unexpected response: {:?}", r), + } + } + + #[test] + fn test_epoch_switch() { + let storage = InMemoryState { + persisted_state: test_sk_state(), + }; + let wal_store = DummyWalStore { lsn: Lsn(0) }; + + let mut sk = SafeKeeper::new(storage, wal_store, NodeId(0)).unwrap(); + + let mut ar_hdr = AppendRequestHeader { + term: 1, + epoch_start_lsn: Lsn(3), + begin_lsn: Lsn(1), + end_lsn: Lsn(2), + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }; + let mut append_request = AppendRequest { + h: ar_hdr.clone(), + wal_data: Bytes::from_static(b"b"), + }; + + let pem = ProposerElected { + term: 1, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![TermSwitchEntry { + term: 1, + lsn: Lsn(3), + }]), + timeline_start_lsn: Lsn(0), + }; + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .unwrap(); + + // check that AppendRequest before epochStartLsn doesn't switch epoch + let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + assert!(resp.is_ok()); + assert_eq!(sk.get_epoch(), 0); + + // but record at epochStartLsn does the switch + ar_hdr.begin_lsn = Lsn(2); + ar_hdr.end_lsn = Lsn(3); + append_request = AppendRequest { + h: ar_hdr, + wal_data: Bytes::from_static(b"b"), + }; + let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); + assert!(resp.is_ok()); + sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %) + assert_eq!(sk.get_epoch(), 1); + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs new file mode 100644 index 0000000000..576a02c686 --- /dev/null +++ b/safekeeper/src/send_wal.rs @@ -0,0 +1,331 @@ +//! This module implements the streaming side of replication protocol, starting +//! with the "START_REPLICATION" message. 
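// Editorial sketch (not part of this patch): each XLogData frame produced by the sending loop
// further down carries at most MAX_SEND_SIZE bytes, and never more than the distance between
// the current send position and the commit/stop horizon. Standalone form with illustrative names:
fn next_chunk_len(start_pos: u64, end_pos: u64, max_send_size: usize) -> usize {
    let available = end_pos.saturating_sub(start_pos) as usize;
    available.min(max_send_size)
}

#[test]
fn next_chunk_len_sketch() {
    assert_eq!(next_chunk_len(100, 150, 32), 32); // capped by the send buffer
    assert_eq!(next_chunk_len(100, 110, 32), 10); // capped by available WAL
}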
+ +use crate::handler::SafekeeperPostgresHandler; +use crate::timeline::{ReplicaState, Timeline}; +use crate::wal_storage::WalReader; +use crate::GlobalTimelines; +use anyhow::{bail, Context, Result}; + +use bytes::Bytes; +use postgres_ffi::get_current_timestamp; +use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; +use serde::{Deserialize, Serialize}; +use std::cmp::min; +use std::net::Shutdown; +use std::sync::Arc; +use std::time::Duration; +use std::{str, thread}; + +use pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; +use tokio::sync::watch::Receiver; +use tokio::time::timeout; +use tracing::*; +use utils::{bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, sock_split::ReadStream}; + +// See: https://www.postgresql.org/docs/13/protocol-replication.html +const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; +const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; +// neon extension of replication protocol +const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; + +type FullTransactionId = u64; + +/// Hot standby feedback received from replica +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct HotStandbyFeedback { + pub ts: TimestampTz, + pub xmin: FullTransactionId, + pub catalog_xmin: FullTransactionId, +} + +impl HotStandbyFeedback { + pub fn empty() -> HotStandbyFeedback { + HotStandbyFeedback { + ts: 0, + xmin: 0, + catalog_xmin: 0, + } + } +} + +/// Standby status update +#[derive(Debug, Clone, Deserialize)] +pub struct StandbyReply { + pub write_lsn: Lsn, // last lsn received by pageserver + pub flush_lsn: Lsn, // pageserver's disk consistent lSN + pub apply_lsn: Lsn, // pageserver's remote consistent lSN + pub reply_ts: TimestampTz, + pub reply_requested: bool, +} + +/// A network connection that's speaking the replication protocol. +pub struct ReplicationConn { + /// This is an `Option` because we will spawn a background thread that will + /// `take` it from us. + stream_in: Option, +} + +/// Scope guard to unregister replication connection from timeline +struct ReplicationConnGuard { + replica: usize, // replica internal ID assigned by timeline + timeline: Arc, +} + +impl Drop for ReplicationConnGuard { + fn drop(&mut self) { + self.timeline.remove_replica(self.replica); + } +} + +impl ReplicationConn { + /// Create a new `ReplicationConn` + pub fn new(pgb: &mut PostgresBackend) -> Self { + Self { + stream_in: pgb.take_stream_in(), + } + } + + /// Handle incoming messages from the network. + /// This is spawned into the background by `handle_start_replication`. + fn background_thread( + mut stream_in: ReadStream, + replica_guard: Arc, + ) -> Result<()> { + let replica_id = replica_guard.replica; + let timeline = &replica_guard.timeline; + + let mut state = ReplicaState::new(); + // Wait for replica's feedback. + while let Some(msg) = FeMessage::read(&mut stream_in)? { + match &msg { + FeMessage::CopyData(m) => { + // There's three possible data messages that the client is supposed to send here: + // `HotStandbyFeedback` and `StandbyStatusUpdate` and `NeonStandbyFeedback`. + + match m.first().cloned() { + Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { + // Note: deserializing is on m[1..] because we skip the tag byte. 
+ state.hs_feedback = HotStandbyFeedback::des(&m[1..]) + .context("failed to deserialize HotStandbyFeedback")?; + timeline.update_replica_state(replica_id, state); + } + Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { + let _reply = StandbyReply::des(&m[1..]) + .context("failed to deserialize StandbyReply")?; + // This must be a regular postgres replica, + // because pageserver doesn't send this type of messages to safekeeper. + // Currently this is not implemented, so this message is ignored. + + warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet."); + // timeline.update_replica_state(replica_id, Some(state)); + } + Some(NEON_STATUS_UPDATE_TAG_BYTE) => { + // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. + let buf = Bytes::copy_from_slice(&m[9..]); + let reply = ReplicationFeedback::parse(buf); + + trace!("ReplicationFeedback is {:?}", reply); + // Only pageserver sends ReplicationFeedback, so set the flag. + // This replica is the source of information to resend to compute. + state.pageserver_feedback = Some(reply); + + timeline.update_replica_state(replica_id, state); + } + _ => warn!("unexpected message {:?}", msg), + } + } + FeMessage::Sync => {} + FeMessage::CopyFail => { + // Shutdown the connection, because rust-postgres client cannot be dropped + // when connection is alive. + let _ = stream_in.shutdown(Shutdown::Both); + bail!("Copy failed"); + } + _ => { + // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored. + info!("unexpected message {:?}", msg); + } + } + } + + Ok(()) + } + + /// + /// Handle START_REPLICATION replication command + /// + pub fn run( + &mut self, + spg: &mut SafekeeperPostgresHandler, + pgb: &mut PostgresBackend, + mut start_pos: Lsn, + ) -> Result<()> { + let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + + let tli = GlobalTimelines::get(spg.ttid)?; + + // spawn the background thread which receives HotStandbyFeedback messages. + let bg_timeline = Arc::clone(&tli); + let bg_stream_in = self.stream_in.take().unwrap(); + let bg_timeline_id = spg.timeline_id.unwrap(); + + let state = ReplicaState::new(); + // This replica_id is used below to check if it's time to stop replication. + let replica_id = bg_timeline.add_replica(state); + + // Use a guard object to remove our entry from the timeline, when the background + // thread and us have both finished using it. + let replica_guard = Arc::new(ReplicationConnGuard { + replica: replica_id, + timeline: bg_timeline, + }); + let bg_replica_guard = Arc::clone(&replica_guard); + + // TODO: here we got two threads, one for writing WAL and one for receiving + // feedback. If one of them fails, we should shutdown the other one too. + let _ = thread::Builder::new() + .name("HotStandbyFeedback thread".into()) + .spawn(move || { + let _enter = + info_span!("HotStandbyFeedback thread", timeline = %bg_timeline_id).entered(); + if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { + error!("Replication background thread failed: {}", err); + } + })?; + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + let (inmem_state, persisted_state) = tli.get_state(); + // add persisted_state.timeline_start_lsn == Lsn(0) check + + // Walproposer gets special handling: safekeeper must give proposer all + // local WAL till the end, whether committed or not (walproposer will + // hang otherwise). 
That's because walproposer runs the consensus and + // synchronizes safekeepers on the most advanced one. + // + // There is a small risk of this WAL getting concurrently garbaged if + // another compute rises which collects majority and starts fixing log + // on this safekeeper itself. That's ok as (old) proposer will never be + // able to commit such WAL. + let stop_pos: Option = if spg.is_walproposer_recovery() { + let wal_end = tli.get_flush_lsn(); + Some(wal_end) + } else { + None + }; + + info!("Start replication from {:?} till {:?}", start_pos, stop_pos); + + // switch to copy + pgb.write_message(&BeMessage::CopyBothResponse)?; + + let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); + + let mut wal_reader = WalReader::new( + spg.conf.timeline_dir(&tli.ttid), + &persisted_state, + start_pos, + spg.conf.wal_backup_enabled, + )?; + + // buffer for wal sending, limited by MAX_SEND_SIZE + let mut send_buf = vec![0u8; MAX_SEND_SIZE]; + + // watcher for commit_lsn updates + let mut commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx(); + + loop { + if let Some(stop_pos) = stop_pos { + if start_pos >= stop_pos { + break; /* recovery finished */ + } + end_pos = stop_pos; + } else { + /* Wait until we have some data to stream */ + let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?; + + if let Some(lsn) = lsn { + end_pos = lsn; + } else { + // TODO: also check once in a while whether we are walsender + // to right pageserver. + if tli.should_walsender_stop(replica_id) { + // Shut down, timeline is suspended. + // TODO create proper error type for this + bail!("end streaming to {:?}", spg.appname); + } + + // timeout expired: request pageserver status + pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + sent_ptr: end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .context("Failed to send KeepAlive message")?; + continue; + } + } + + let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; + let send_size = min(send_size, send_buf.len()); + + let send_buf = &mut send_buf[..send_size]; + + // read wal into buffer + let send_size = wal_reader.read(send_buf).await?; + let send_buf = &send_buf[..send_size]; + + // Write some data to the network socket. + pgb.write_message(&BeMessage::XLogData(XLogDataBody { + wal_start: start_pos.0, + wal_end: end_pos.0, + timestamp: get_current_timestamp(), + data: send_buf, + })) + .context("Failed to send XLogData")?; + + start_pos += send_size as u64; + trace!("sent WAL up to {}", start_pos); + } + + Ok(()) + }) + } +} + +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + +// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. +async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> Result> { + let commit_lsn: Lsn = *rx.borrow(); + if commit_lsn > lsn { + return Ok(Some(commit_lsn)); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + let mut commit_lsn; + loop { + rx.changed().await?; + commit_lsn = *rx.borrow(); + if commit_lsn > lsn { + break; + } + } + + Ok(commit_lsn) + }) + .await; + + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), + } +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs new file mode 100644 index 0000000000..132a926203 --- /dev/null +++ b/safekeeper/src/timeline.rs @@ -0,0 +1,789 @@ +//! This module implements Timeline lifecycle management and has all neccessary code +//! 
to glue together SafeKeeper and all other background services. + +use anyhow::{bail, Result}; +use etcd_broker::subscription_value::SkTimelineInfo; +use parking_lot::{Mutex, MutexGuard}; +use postgres_ffi::XLogSegNo; +use pq_proto::ReplicationFeedback; +use std::cmp::{max, min}; +use std::path::PathBuf; +use tokio::{ + sync::{mpsc::Sender, watch}, + time::Instant, +}; +use tracing::*; +use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, +}; + +use crate::safekeeper::{ + AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, + SafekeeperMemState, ServerInfo, Term, +}; +use crate::send_wal::HotStandbyFeedback; +use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; + +use crate::metrics::FullTimelineInfo; +use crate::wal_storage; +use crate::wal_storage::Storage as wal_storage_iface; +use crate::SafeKeeperConf; + +/// Things safekeeper should know about timeline state on peers. +#[derive(Debug, Clone)] +pub struct PeerInfo { + pub sk_id: NodeId, + /// Term of the last entry. + _last_log_term: Term, + /// LSN of the last record. + _flush_lsn: Lsn, + pub commit_lsn: Lsn, + /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new + /// sk since backup_lsn. + pub local_start_lsn: Lsn, + /// When info was received. + ts: Instant, +} + +impl PeerInfo { + fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id, + _last_log_term: sk_info.last_log_term.unwrap_or(0), + _flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID), + commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID), + local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID), + ts, + } + } +} + +// vector-based node id -> peer state map with very limited functionality we +// need. +#[derive(Debug, Clone, Default)] +pub struct PeersInfo(pub Vec); + +impl PeersInfo { + fn get(&mut self, id: NodeId) -> Option<&mut PeerInfo> { + self.0.iter_mut().find(|p| p.sk_id == id) + } + + fn upsert(&mut self, p: &PeerInfo) { + match self.get(p.sk_id) { + Some(rp) => *rp = p.clone(), + None => self.0.push(p.clone()), + } + } +} + +/// Replica status update + hot standby feedback +#[derive(Debug, Clone, Copy)] +pub struct ReplicaState { + /// last known lsn received by replica + pub last_received_lsn: Lsn, // None means we don't know + /// combined remote consistent lsn of pageservers + pub remote_consistent_lsn: Lsn, + /// combined hot standby feedback from all replicas + pub hs_feedback: HotStandbyFeedback, + /// Replication specific feedback received from pageserver, if any + pub pageserver_feedback: Option, +} + +impl Default for ReplicaState { + fn default() -> Self { + Self::new() + } +} + +impl ReplicaState { + pub fn new() -> ReplicaState { + ReplicaState { + last_received_lsn: Lsn::MAX, + remote_consistent_lsn: Lsn(0), + hs_feedback: HotStandbyFeedback { + ts: 0, + xmin: u64::MAX, + catalog_xmin: u64::MAX, + }, + pageserver_feedback: None, + } + } +} + +/// Shared state associated with database instance +pub struct SharedState { + /// Safekeeper object + sk: SafeKeeper, + /// In memory list containing state of peers sent in latest messages from them. + peers_info: PeersInfo, + /// State of replicas + replicas: Vec>, + /// True when WAL backup launcher oversees the timeline, making sure WAL is + /// offloaded, allows to bother launcher less. 
+ wal_backup_active: bool, + /// True whenever there is at least some pending activity on timeline: live + /// compute connection, pageserver is not caughtup (it must have latest WAL + /// for new compute start) or WAL backuping is not finished. Practically it + /// means safekeepers broadcast info to peers about the timeline, old WAL is + /// trimmed. + /// + /// TODO: it might be better to remove tli completely from GlobalTimelines + /// when tli is inactive instead of having this flag. + active: bool, + num_computes: u32, + last_removed_segno: XLogSegNo, +} + +impl SharedState { + /// Initialize fresh timeline state without persisting anything to disk. + fn create_new( + conf: &SafeKeeperConf, + ttid: &TenantTimelineId, + state: SafeKeeperState, + ) -> Result { + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + // We don't want to write anything to disk, because we may have existing timeline there. + // These functions should not change anything on disk. + let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + + Ok(Self { + sk, + peers_info: PeersInfo(vec![]), + replicas: vec![], + wal_backup_active: false, + active: false, + num_computes: 0, + last_removed_segno: 0, + }) + } + + /// Restore SharedState from control file. If file doesn't exist, bails out. + fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + if control_store.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } + + let wal_store = wal_storage::PhysicalStorage::new(ttid, conf, &control_store)?; + + Ok(Self { + sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, + peers_info: PeersInfo(vec![]), + replicas: Vec::new(), + wal_backup_active: false, + active: false, + num_computes: 0, + last_removed_segno: 0, + }) + } + + fn is_active(&self) -> bool { + self.is_wal_backup_required() + // FIXME: add tracking of relevant pageservers and check them here individually, + // otherwise migration won't work (we suspend too early). + || self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn + } + + /// Mark timeline active/inactive and return whether s3 offloading requires + /// start/stop action. + fn update_status(&mut self, ttid: TenantTimelineId) -> bool { + let is_active = self.is_active(); + if self.active != is_active { + info!("timeline {} active={} now", ttid, is_active); + } + self.active = is_active; + self.is_wal_backup_action_pending() + } + + /// Should we run s3 offloading in current state? + fn is_wal_backup_required(&self) -> bool { + let seg_size = self.get_wal_seg_size(); + self.num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (self.sk.inmem.commit_lsn.segment_number(seg_size) > + self.sk.inmem.backup_lsn.segment_number(seg_size)) + } + + /// Is current state of s3 offloading is not what it ought to be? 
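// Editorial sketch (not part of this patch): is_wal_backup_required above asks for offloading
// while any compute is attached, or once commit_lsn has entered a segment that backup_lsn has
// not reached yet (only whole segments are offloaded). Standalone form with illustrative names:
fn wal_backup_required_sketch(num_computes: u32, commit_lsn: u64, backup_lsn: u64, seg: u64) -> bool {
    num_computes > 0 || (commit_lsn / seg > backup_lsn / seg)
}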
+ fn is_wal_backup_action_pending(&self) -> bool { + let res = self.wal_backup_active != self.is_wal_backup_required(); + if res { + let action_pending = if self.is_wal_backup_required() { + "start" + } else { + "stop" + }; + trace!( + "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", + self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn + ); + } + res + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching. + fn wal_backup_attend(&mut self) -> bool { + self.wal_backup_active = self.is_wal_backup_required(); + self.wal_backup_active + } + + fn get_wal_seg_size(&self) -> usize { + self.sk.state.server.wal_seg_size as usize + } + + /// Get combined state of all alive replicas + pub fn get_replicas_state(&self) -> ReplicaState { + let mut acc = ReplicaState::new(); + for state in self.replicas.iter().flatten() { + acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts); + acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin); + acc.hs_feedback.catalog_xmin = + min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin); + + // FIXME + // If multiple pageservers are streaming WAL and send feedback for the same timeline simultaneously, + // this code is not correct. + // Now the most advanced feedback is used. + // If one pageserver lags when another doesn't, the backpressure won't be activated on compute and lagging + // pageserver is prone to timeout errors. + // + // To choose what feedback to use and resend to compute node, + // we need to know which pageserver compute node considers to be main. + // See https://github.com/neondatabase/neon/issues/1171 + // + if let Some(pageserver_feedback) = state.pageserver_feedback { + if let Some(acc_feedback) = acc.pageserver_feedback { + if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { + warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); + acc.pageserver_feedback = Some(pageserver_feedback); + } + } else { + acc.pageserver_feedback = Some(pageserver_feedback); + } + + // last lsn received by pageserver + // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. + // See https://github.com/neondatabase/neon/issues/1171 + acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); + + // When at least one pageserver has preserved data up to remote_consistent_lsn, + // safekeeper is free to delete it, so choose max of all pageservers. + acc.remote_consistent_lsn = max( + Lsn::from(pageserver_feedback.ps_applylsn), + acc.remote_consistent_lsn, + ); + } + } + acc + } + + /// Assign new replica ID. We choose first empty cell in the replicas vector + /// or extend the vector if there are no free slots. 
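// Editorial sketch (not part of this patch): get_replicas_state above folds replica feedback
// with max for timestamps and pageserver write/apply positions and with min for the xmin
// horizons, which is why ReplicaState::new seeds xmin/catalog_xmin with u64::MAX and the
// timestamp with 0, the identity elements of min and max respectively.
#[test]
fn feedback_fold_identities_sketch() {
    let xmins = [300u64, 250, 400];
    let folded = xmins.iter().fold(u64::MAX, |acc, &x| acc.min(x));
    assert_eq!(folded, 250); // the most conservative horizon wins
}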
+ pub fn add_replica(&mut self, state: ReplicaState) -> usize { + if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) { + self.replicas[pos] = Some(state); + return pos; + } + let pos = self.replicas.len(); + self.replicas.push(Some(state)); + pos + } + + fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + SkTimelineInfo { + last_log_term: Some(self.sk.get_epoch()), + flush_lsn: Some(self.sk.wal_store.flush_lsn()), + // note: this value is not flushed to control file yet and can be lost + commit_lsn: Some(self.sk.inmem.commit_lsn), + // TODO: rework feedbacks to avoid max here + remote_consistent_lsn: Some(max( + self.get_replicas_state().remote_consistent_lsn, + self.sk.inmem.remote_consistent_lsn, + )), + peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn), + safekeeper_connstr: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(self.sk.inmem.backup_lsn), + local_start_lsn: Some(self.sk.state.local_start_lsn), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum TimelineError { + #[error("Timeline {0} was cancelled and cannot be used anymore")] + Cancelled(TenantTimelineId), + #[error("Timeline {0} was not found in global map")] + NotFound(TenantTimelineId), + #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] + Invalid(TenantTimelineId), + #[error("Timeline {0} is already exists")] + AlreadyExists(TenantTimelineId), + #[error("Timeline {0} is not initialized, wal_seg_size is zero")] + UninitializedWalSegSize(TenantTimelineId), + #[error("Timeline {0} is not initialized, pg_version is unknown")] + UninitialinzedPgVersion(TenantTimelineId), +} + +/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline. +/// It also holds SharedState and provides mutually exclusive access to it. +pub struct Timeline { + pub ttid: TenantTimelineId, + + /// Sending here asks for wal backup launcher attention (start/stop + /// offloading). Sending ttid instead of concrete command allows to do + /// sending without timeline lock. + pub wal_backup_launcher_tx: Sender, + + /// Used to broadcast commit_lsn updates to all background jobs. + commit_lsn_watch_tx: watch::Sender, + commit_lsn_watch_rx: watch::Receiver, + + /// Safekeeper and other state, that should remain consistent and synchronized + /// with the disk. + mutex: Mutex, + + /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal. + cancellation_tx: watch::Sender, + + /// Timeline should not be used after cancellation. Background tasks should + /// monitor this channel and stop eventually after receiving `true` from this channel. + cancellation_rx: watch::Receiver, + + /// Directory where timeline state is stored. + timeline_dir: PathBuf, +} + +impl Timeline { + /// Load existing timeline from disk. + pub fn load_timeline( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + ) -> Result { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + + let shared_state = SharedState::restore(&conf, &ttid)?; + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = + watch::channel(shared_state.sk.state.commit_lsn); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(shared_state), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Create a new timeline, which is not yet persisted to disk. 
+ pub fn create_empty( + conf: SafeKeeperConf, + ttid: TenantTimelineId, + wal_backup_launcher_tx: Sender, + server_info: ServerInfo, + commit_lsn: Lsn, + local_start_lsn: Lsn, + ) -> Result { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); + let (cancellation_tx, cancellation_rx) = watch::channel(false); + let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + + Ok(Timeline { + ttid, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, + mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?), + cancellation_rx, + cancellation_tx, + timeline_dir: conf.timeline_dir(&ttid), + }) + } + + /// Initialize fresh timeline on disk and start background tasks. If bootstrap + /// fails, timeline is cancelled and cannot be used anymore. + /// + /// Bootstrap is transactional, so if it fails, created files will be deleted, + /// and state on disk should remain unchanged. + pub fn bootstrap(&self, shared_state: &mut MutexGuard) -> Result<()> { + match std::fs::metadata(&self.timeline_dir) { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(self.ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } + } + + // Create timeline directory. + std::fs::create_dir_all(&self.timeline_dir)?; + + // Write timeline to disk and TODO: start background tasks. + match || -> Result<()> { + shared_state.sk.persist()?; + // TODO: add more initialization steps here + shared_state.update_status(self.ttid); + Ok(()) + }() { + Ok(_) => Ok(()), + Err(e) => { + // Bootstrap failed, cancel timeline and remove timeline directory. + self.cancel(); + + if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) { + warn!( + "failed to remove timeline {} directory after bootstrap failure: {}", + self.ttid, fs_err + ); + } + + Err(e) + } + } + } + + /// Delete timeline from disk completely, by removing timeline directory. Background + /// timeline activities will stop eventually. + pub fn delete_from_disk( + &self, + shared_state: &mut MutexGuard, + ) -> Result<(bool, bool)> { + let was_active = shared_state.active; + self.cancel(); + let dir_existed = delete_dir(&self.timeline_dir)?; + Ok((dir_existed, was_active)) + } + + /// Cancel timeline to prevent further usage. Background tasks will stop + /// eventually after receiving cancellation signal. + fn cancel(&self) { + info!("Timeline {} is cancelled", self.ttid); + let _ = self.cancellation_tx.send(true); + let res = self.wal_backup_launcher_tx.blocking_send(self.ttid); + if let Err(e) = res { + error!("Failed to send stop signal to wal_backup_launcher: {}", e); + } + } + + /// Returns if timeline is cancelled. + pub fn is_cancelled(&self) -> bool { + *self.cancellation_rx.borrow() + } + + /// Take a writing mutual exclusive lock on timeline shared_state. + pub fn write_shared_state(&self) -> MutexGuard { + self.mutex.lock() + } + + /// Register compute connection, starting timeline-related activity if it is + /// not running yet. + pub fn on_compute_connect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.write_shared_state(); + shared_state.num_computes += 1; + is_wal_backup_action_pending = shared_state.update_status(self.ttid); + } + // Wake up wal backup launcher, if offloading not started yet. 
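+        // The status is updated while holding the shared_state lock, but the guard
+        // is dropped before the potentially blocking channel send below, so the
+        // timeline mutex is never held across a blocking_send to the launcher.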
+ if is_wal_backup_action_pending { + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; + } + Ok(()) + } + + /// De-register compute connection, shutting down timeline activity if + /// pageserver doesn't need catchup. + pub fn on_compute_disconnect(&self) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.write_shared_state(); + shared_state.num_computes -= 1; + is_wal_backup_action_pending = shared_state.update_status(self.ttid); + } + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + // Can fail only if channel to a static thread got closed, which is not normal at all. + self.wal_backup_launcher_tx.blocking_send(self.ttid)?; + } + Ok(()) + } + + /// Returns true if walsender should stop sending WAL to pageserver. + /// TODO: check this pageserver is actually interested in this timeline. + pub fn should_walsender_stop(&self, replica_id: usize) -> bool { + if self.is_cancelled() { + return true; + } + let mut shared_state = self.write_shared_state(); + if shared_state.num_computes == 0 { + let replica_state = shared_state.replicas[replica_id].unwrap(); + let reported_remote_consistent_lsn = replica_state + .pageserver_feedback + .map(|f| Lsn(f.ps_applylsn)) + .unwrap_or(Lsn::INVALID); + let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet + (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. + reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + if stop { + shared_state.update_status(self.ttid); + return true; + } + } + false + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching it. + pub fn wal_backup_attend(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().wal_backup_attend() + } + + /// Returns full timeline info, required for the metrics. If the timeline is + /// not active, returns None instead. + pub fn info_for_metrics(&self) -> Option { + if self.is_cancelled() { + return None; + } + + let state = self.write_shared_state(); + if state.active { + Some(FullTimelineInfo { + ttid: self.ttid, + replicas: state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: state.wal_backup_active, + timeline_is_active: state.active, + num_computes: state.num_computes, + last_removed_segno: state.last_removed_segno, + epoch_start_lsn: state.sk.epoch_start_lsn, + mem_state: state.sk.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) + } else { + None + } + } + + /// Returns commit_lsn watch channel. + pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { + self.commit_lsn_watch_rx.clone() + } + + /// Pass arrived message to the safekeeper. 
+ pub fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + let commit_lsn: Lsn; + { + let mut shared_state = self.write_shared_state(); + rmsg = shared_state.sk.process_msg(msg)?; + + // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn + if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + let state = shared_state.get_replicas_state(); + resp.hs_feedback = state.hs_feedback; + if let Some(pageserver_feedback) = state.pageserver_feedback { + resp.pageserver_feedback = pageserver_feedback; + } + } + + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + Ok(rmsg) + } + + /// Returns wal_seg_size. + pub fn get_wal_seg_size(&self) -> usize { + self.write_shared_state().get_wal_seg_size() + } + + /// Returns true only if the timeline is loaded and active. + pub fn is_active(&self) -> bool { + if self.is_cancelled() { + return false; + } + + self.write_shared_state().active + } + + /// Returns state of the timeline. + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { + let state = self.write_shared_state(); + (state.sk.inmem.clone(), state.sk.state.clone()) + } + + /// Returns latest backup_lsn. + pub fn get_wal_backup_lsn(&self) -> Lsn { + self.write_shared_state().sk.inmem.backup_lsn + } + + /// Sets backup_lsn to the given value. + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; + // we should check whether to shut down offloader, but this will be done + // soon by peer communication anyway. + Ok(()) + } + + /// Get safekeeper info for broadcasting to broker and other peers. + pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + let shared_state = self.write_shared_state(); + shared_state.get_safekeeper_info(conf) + } + + /// Update timeline state with peer safekeeper data. + pub async fn record_safekeeper_info( + &self, + sk_info: &SkTimelineInfo, + sk_id: NodeId, + ) -> Result<()> { + let is_wal_backup_action_pending: bool; + let commit_lsn: Lsn; + { + let mut shared_state = self.write_shared_state(); + shared_state.sk.record_safekeeper_info(sk_info)?; + let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now()); + shared_state.peers_info.upsert(&peer_info); + is_wal_backup_action_pending = shared_state.update_status(self.ttid); + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.send(self.ttid).await?; + } + Ok(()) + } + + /// Get our latest view of alive peers status on the timeline. + /// We pass our own info through the broker as well, so when we don't have connection + /// to the broker returned vec is empty. + pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { + let shared_state = self.write_shared_state(); + let now = Instant::now(); + shared_state + .peers_info + .0 + .iter() + // Regard peer as absent if we haven't heard from it within heartbeat_timeout. + .filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout) + .cloned() + .collect() + } + + /// Add send_wal replica to the in-memory vector of replicas. 
+ pub fn add_replica(&self, state: ReplicaState) -> usize { + self.write_shared_state().add_replica(state) + } + + /// Update replication replica state. + pub fn update_replica_state(&self, id: usize, state: ReplicaState) { + let mut shared_state = self.write_shared_state(); + shared_state.replicas[id] = Some(state); + } + + /// Remove send_wal replica from the in-memory vector of replicas. + pub fn remove_replica(&self, id: usize) { + let mut shared_state = self.write_shared_state(); + assert!(shared_state.replicas[id].is_some()); + shared_state.replicas[id] = None; + } + + /// Returns flush_lsn. + pub fn get_flush_lsn(&self) -> Lsn { + self.write_shared_state().sk.wal_store.flush_lsn() + } + + /// Delete WAL segments from disk that are no longer needed. This is determined + /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. + pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let horizon_segno: XLogSegNo; + let remover: Box Result<(), anyhow::Error>>; + { + let shared_state = self.write_shared_state(); + horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); + remover = shared_state.sk.wal_store.remove_up_to(); + if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { + return Ok(()); + } + // release the lock before removing + } + + // delete old WAL files + remover(horizon_segno - 1)?; + + // update last_removed_segno + let mut shared_state = self.write_shared_state(); + shared_state.last_removed_segno = horizon_segno; + Ok(()) + } +} + +/// Deletes directory and it's contents. Returns false if directory does not exist. +fn delete_dir(path: &PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), + } +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs new file mode 100644 index 0000000000..a5d373a1da --- /dev/null +++ b/safekeeper/src/timelines_global_map.rs @@ -0,0 +1,359 @@ +//! This module contains global (tenant_id, timeline_id) -> Arc mapping. +//! All timelines should always be present in this map, this is done by loading them +//! all from the disk on startup and keeping them in memory. + +use crate::safekeeper::ServerInfo; +use crate::timeline::{Timeline, TimelineError}; +use crate::SafeKeeperConf; +use anyhow::{anyhow, bail, Context, Result}; +use once_cell::sync::Lazy; +use serde::Serialize; +use std::collections::HashMap; +use std::path::PathBuf; +use std::str::FromStr; +use std::sync::{Arc, Mutex, MutexGuard}; +use tokio::sync::mpsc::Sender; +use tracing::*; +use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::lsn::Lsn; + +struct GlobalTimelinesState { + timelines: HashMap>, + wal_backup_launcher_tx: Option>, + conf: SafeKeeperConf, +} + +impl GlobalTimelinesState { + /// Get dependencies for a timeline constructor. + fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { + ( + self.conf.clone(), + self.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) + } + + /// Insert timeline into the map. Returns error if timeline with the same id already exists. + fn try_insert(&mut self, timeline: Arc) -> Result<()> { + let ttid = timeline.ttid; + if self.timelines.contains_key(&ttid) { + bail!(TimelineError::AlreadyExists(ttid)); + } + self.timelines.insert(ttid, timeline); + Ok(()) + } + + /// Get timeline from the map. 
Returns error if timeline doesn't exist. + fn get(&self, ttid: &TenantTimelineId) -> Result> { + self.timelines + .get(ttid) + .cloned() + .ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid))) + } +} + +static TIMELINES_STATE: Lazy> = Lazy::new(|| { + Mutex::new(GlobalTimelinesState { + timelines: HashMap::new(), + wal_backup_launcher_tx: None, + conf: SafeKeeperConf::default(), + }) +}); + +/// A zero-sized struct used to manage access to the global timelines map. +pub struct GlobalTimelines; + +impl GlobalTimelines { + /// Inject dependencies needed for the timeline constructors and load all timelines to memory. + pub fn init( + conf: SafeKeeperConf, + wal_backup_launcher_tx: Sender, + ) -> Result<()> { + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); + state.conf = conf; + + // Iterate through all directories and load tenants for all directories + // named as a valid tenant_id. + let mut tenant_count = 0; + let tenants_dir = state.conf.workdir.clone(); + for tenants_dir_entry in std::fs::read_dir(&tenants_dir) + .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? + { + match &tenants_dir_entry { + Ok(tenants_dir_entry) => { + if let Ok(tenant_id) = + TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or("")) + { + tenant_count += 1; + GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?; + } + } + Err(e) => error!( + "failed to list tenants dir entry {:?} in directory {}, reason: {:?}", + tenants_dir_entry, + tenants_dir.display(), + e + ), + } + } + + info!( + "found {} tenants directories, successfully loaded {} timelines", + tenant_count, + state.timelines.len() + ); + Ok(()) + } + + /// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any. + fn load_tenant_timelines( + state: &mut MutexGuard, + tenant_id: TenantId, + ) -> Result<()> { + let timelines_dir = state.conf.tenant_dir(&tenant_id); + for timelines_dir_entry in std::fs::read_dir(&timelines_dir) + .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? + { + match &timelines_dir_entry { + Ok(timeline_dir_entry) => { + if let Ok(timeline_id) = + TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) + { + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + match Timeline::load_timeline( + state.conf.clone(), + ttid, + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), + ) { + Ok(timeline) => { + state.timelines.insert(ttid, Arc::new(timeline)); + } + // If we can't load a timeline, it's most likely because of a corrupted + // directory. We will log an error and won't allow to delete/recreate + // this timeline. The only way to fix this timeline is to repair manually + // and restart the safekeeper. + Err(e) => error!( + "failed to load timeline {} for tenant {}, reason: {:?}", + timeline_id, tenant_id, e + ), + } + } + } + Err(e) => error!( + "failed to list timelines dir entry {:?} in directory {}, reason: {:?}", + timelines_dir_entry, + timelines_dir.display(), + e + ), + } + } + + Ok(()) + } + + /// Create a new timeline with the given id. If the timeline already exists, returns + /// an existing timeline. 
+ pub fn create( + ttid: TenantTimelineId, + server_info: ServerInfo, + commit_lsn: Lsn, + local_start_lsn: Lsn, + ) -> Result> { + let (conf, wal_backup_launcher_tx) = { + let state = TIMELINES_STATE.lock().unwrap(); + if let Ok(timeline) = state.get(&ttid) { + // Timeline already exists, return it. + return Ok(timeline); + } + state.get_dependencies() + }; + + info!("creating new timeline {}", ttid); + + let timeline = Arc::new(Timeline::create_empty( + conf, + ttid, + wal_backup_launcher_tx, + server_info, + commit_lsn, + local_start_lsn, + )?); + + // Take a lock and finish the initialization holding this mutex. No other threads + // can interfere with creation after we will insert timeline into the map. + let mut shared_state = timeline.write_shared_state(); + + // We can get a race condition here in case of concurrent create calls, but only + // in theory. create() will return valid timeline on the next try. + TIMELINES_STATE + .lock() + .unwrap() + .try_insert(timeline.clone())?; + + // Write the new timeline to the disk and start background workers. + // Bootstrap is transactional, so if it fails, the timeline will be deleted, + // and the state on disk should remain unchanged. + match timeline.bootstrap(&mut shared_state) { + Ok(_) => { + // We are done with bootstrap, release the lock, return the timeline. + drop(shared_state); + timeline + .wal_backup_launcher_tx + .blocking_send(timeline.ttid)?; + Ok(timeline) + } + Err(e) => { + // Note: the most likely reason for bootstrap failure is that the timeline + // directory already exists on disk. This happens when timeline is corrupted + // and wasn't loaded from disk on startup because of that. We want to preserve + // the timeline directory in this case, for further inspection. + + // TODO: this is an unusual error, perhaps we should send it to sentry + // TODO: compute will try to create timeline every second, we should add backoff + error!("failed to bootstrap timeline {}: {}", ttid, e); + + // Timeline failed to bootstrap, it cannot be used. Remove it from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); + Err(e) + } + } + } + + /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, + /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, + /// i.e. loaded in memory and not cancelled. + pub fn get(ttid: TenantTimelineId) -> Result> { + let res = TIMELINES_STATE.lock().unwrap().get(&ttid); + + match res { + Ok(tli) => { + if tli.is_cancelled() { + anyhow::bail!(TimelineError::Cancelled(ttid)); + } + Ok(tli) + } + Err(e) => Err(e), + } + } + + /// Returns all timelines. This is used for background timeline proccesses. + pub fn get_all() -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .cloned() + .filter(|t| !t.is_cancelled()) + .collect() + } + + /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant, + /// and that's why it can return cancelled timelines, to retry deleting them. + fn get_all_for_tenant(tenant_id: TenantId) -> Vec> { + let global_lock = TIMELINES_STATE.lock().unwrap(); + global_lock + .timelines + .values() + .filter(|t| t.ttid.tenant_id == tenant_id) + .cloned() + .collect() + } + + /// Cancels timeline, then deletes the corresponding data directory. 
+ pub fn delete_force(ttid: &TenantTimelineId) -> Result { + let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); + match tli_res { + Ok(timeline) => { + // Take a lock and finish the deletion holding this mutex. + let mut shared_state = timeline.write_shared_state(); + + info!("deleting timeline {}", ttid); + let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?; + + // Remove timeline from the map. + TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active, + }) + } + Err(_) => { + // Timeline is not memory, but it may still exist on disk in broken state. + let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_existed = delete_dir(dir_path)?; + + Ok(TimelineDeleteForceResult { + dir_existed, + was_active: false, + }) + } + } + } + + /// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which + /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are + /// created simultaneously. In that case the function will return error and the caller should + /// retry tenant deletion again later. + pub fn delete_force_all_for_tenant( + tenant_id: &TenantId, + ) -> Result> { + info!("deleting all timelines for tenant {}", tenant_id); + let to_delete = Self::get_all_for_tenant(*tenant_id); + + let mut err = None; + + let mut deleted = HashMap::new(); + for tli in &to_delete { + match Self::delete_force(&tli.ttid) { + Ok(result) => { + deleted.insert(tli.ttid, result); + } + Err(e) => { + error!("failed to delete timeline {}: {}", tli.ttid, e); + // Save error to return later. + err = Some(e); + } + } + } + + // If there was an error, return it. + if let Some(e) = err { + return Err(e); + } + + // There may be broken timelines on disk, so delete the whole tenant dir as well. + // Note that we could concurrently create new timelines while we were deleting them, + // so the directory may be not empty. In this case timelines will have bad state + // and timeline background jobs can panic. + delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + + let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); + if !tlis_after_delete.is_empty() { + // Some timelines were created while we were deleting them, returning error + // to the caller, so it can retry later. + bail!( + "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", + tenant_id + ); + } + + Ok(deleted) + } +} + +#[derive(Clone, Copy, Serialize)] +pub struct TimelineDeleteForceResult { + pub dir_existed: bool, + pub was_active: bool, +} + +/// Deletes directory and it's contents. Returns false if directory does not exist. 
+fn delete_dir(path: PathBuf) -> Result { + match std::fs::remove_dir_all(path) { + Ok(_) => Ok(true), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false), + Err(e) => Err(e.into()), + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs new file mode 100644 index 0000000000..0a43d6085c --- /dev/null +++ b/safekeeper/src/wal_backup.rs @@ -0,0 +1,474 @@ +use anyhow::{Context, Result}; + +use tokio::task::JoinHandle; +use utils::id::NodeId; + +use std::cmp::min; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; +use postgres_ffi::XLogFileName; +use postgres_ffi::{XLogSegNo, PG_TLI}; +use remote_storage::GenericRemoteStorage; +use tokio::fs::File; +use tokio::runtime::Builder; + +use tokio::select; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::sync::watch; +use tokio::time::sleep; +use tracing::*; + +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::timeline::{PeerInfo, Timeline}; +use crate::{GlobalTimelines, SafeKeeperConf}; + +use once_cell::sync::OnceCell; + +const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; +const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; + +pub fn wal_backup_launcher_thread_main( + conf: SafeKeeperConf, + wal_backup_launcher_rx: Receiver, +) { + let rt = Builder::new_multi_thread() + .worker_threads(conf.backup_runtime_threads) + .enable_all() + .build() + .expect("failed to create wal backup runtime"); + + rt.block_on(async { + wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; + }); +} + +/// Check whether wal backup is required for timeline. If yes, mark that launcher is +/// aware of current status and return the timeline. +fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { + GlobalTimelines::get(ttid) + .ok() + .filter(|tli| tli.wal_backup_attend()) +} + +struct WalBackupTaskHandle { + shutdown_tx: Sender<()>, + handle: JoinHandle<()>, +} + +struct WalBackupTimelineEntry { + timeline: Arc, + handle: Option, +} + +async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { + if let Some(wb_handle) = entry.handle.take() { + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", ttid, e); + } + } +} + +/// The goal is to ensure that normally only one safekeepers offloads. However, +/// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short +/// time we have several ones as they PUT the same files. Also, +/// - frequently changing the offloader would be bad; +/// - electing seriously lagging safekeeper is undesirable; +/// So we deterministically choose among the reasonably caught up candidates. +/// TODO: take into account failed attempts to deal with hypothetical situation +/// where s3 is unreachable only for some sks. +fn determine_offloader( + alive_peers: &[PeerInfo], + wal_backup_lsn: Lsn, + ttid: TenantTimelineId, + conf: &SafeKeeperConf, +) -> (Option, String) { + // TODO: remove this once we fill newly joined safekeepers since backup_lsn. 
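+    // A peer can only take over offloading if it locally has all the WAL that
+    // still needs to be uploaded, i.e. its local_start_lsn is not past the
+    // current backup position.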
+ let capable_peers = alive_peers + .iter() + .filter(|p| p.local_start_lsn <= wal_backup_lsn); + match capable_peers.clone().map(|p| p.commit_lsn).max() { + None => (None, "no connected peers to elect from".to_string()), + Some(max_commit_lsn) => { + let threshold = max_commit_lsn + .checked_sub(conf.max_offloader_lag_bytes) + .unwrap_or(Lsn(0)); + let mut caughtup_peers = capable_peers + .clone() + .filter(|p| p.commit_lsn >= threshold) + .collect::>(); + caughtup_peers.sort_by(|p1, p2| p1.sk_id.cmp(&p2.sk_id)); + + // To distribute the load, shift by timeline_id. + let offloader = caughtup_peers + [(u128::from(ttid.timeline_id) % caughtup_peers.len() as u128) as usize] + .sk_id; + + let mut capable_peers_dbg = capable_peers + .map(|p| (p.sk_id, p.commit_lsn)) + .collect::>(); + capable_peers_dbg.sort_by(|p1, p2| p1.0.cmp(&p2.0)); + ( + Some(offloader), + format!( + "elected {} among {:?} peers, with {} of them being caughtup", + offloader, + capable_peers_dbg, + caughtup_peers.len() + ), + ) + } + } +} + +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. +async fn update_task( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + entry: &mut WalBackupTimelineEntry, +) { + let alive_peers = entry.timeline.get_peers(conf); + let wal_backup_lsn = entry.timeline.get_wal_backup_lsn(); + let (offloader, election_dbg_str) = + determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + if elected_me != (entry.handle.is_some()) { + if elected_me { + info!("elected for backup {}: {}", ttid, election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&ttid); + + let handle = tokio::spawn( + backup_task_main(ttid, timeline_dir, shutdown_rx) + .instrument(info_span!("WAL backup task", ttid = %ttid)), + ); + + entry.handle = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + info!("stepping down from backup {}: {}", ttid, election_dbg_str); + shut_down_task(ttid, entry).await; + } + } +} + +const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; + +/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup +/// tasks. Having this in separate task simplifies locking, allows to reap +/// panics and separate elections from offloading itself. +async fn wal_backup_launcher_main_loop( + conf: SafeKeeperConf, + mut wal_backup_launcher_rx: Receiver, +) { + info!( + "WAL backup launcher started, remote config {:?}", + conf.remote_storage + ); + + let conf_ = conf.clone(); + REMOTE_STORAGE.get_or_init(|| { + conf_.remote_storage.as_ref().map(|c| { + GenericRemoteStorage::from_config(conf_.workdir, c) + .expect("failed to create remote storage") + }) + }); + + // Presense in this map means launcher is aware s3 offloading is needed for + // the timeline, but task is started only if it makes sense for to offload + // from this safekeeper. + let mut tasks: HashMap = HashMap::new(); + + let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); + loop { + tokio::select! { + ttid = wal_backup_launcher_rx.recv() => { + // channel is never expected to get closed + let ttid = ttid.unwrap(); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ + } + let timeline = is_wal_backup_required(ttid); + // do we need to do anything at all? 
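+                // `timeline.is_some()` says whether offloading is currently wanted for
+                // this ttid, while `tasks.contains_key()` says whether the launcher is
+                // already tracking it; only a mismatch between the two requires action.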
+ if timeline.is_some() != tasks.contains_key(&ttid) { + if let Some(timeline) = timeline { + // need to start the task + let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { + timeline, + handle: None, + }); + update_task(&conf, ttid, entry).await; + } else { + // need to stop the task + info!("stopping WAL backup task for {}", ttid); + let mut entry = tasks.remove(&ttid).unwrap(); + shut_down_task(ttid, &mut entry).await; + } + } + } + // For each timeline needing offloading, check if this safekeeper + // should do the job and start/stop the task accordingly. + _ = ticker.tick() => { + for (ttid, entry) in tasks.iter_mut() { + update_task(&conf, *ttid, entry).await; + } + } + } + } +} + +struct WalBackupTask { + timeline: Arc, + timeline_dir: PathBuf, + wal_seg_size: usize, + commit_lsn_watch_rx: watch::Receiver, +} + +/// Offload single timeline. +async fn backup_task_main( + ttid: TenantTimelineId, + timeline_dir: PathBuf, + mut shutdown_rx: Receiver<()>, +) { + info!("started"); + let res = GlobalTimelines::get(ttid); + if let Err(e) = res { + error!("backup error for timeline {}: {}", ttid, e); + return; + } + let tli = res.unwrap(); + + let mut wb = WalBackupTask { + wal_seg_size: tli.get_wal_seg_size(), + commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline: tli, + timeline_dir, + }; + + // task is spinned up only when wal_seg_size already initialized + assert!(wb.wal_seg_size > 0); + + let mut canceled = false; + select! { + _ = wb.run() => {} + _ = shutdown_rx.recv() => { + canceled = true; + } + } + info!("task {}", if canceled { "canceled" } else { "terminated" }); +} + +impl WalBackupTask { + async fn run(&mut self) { + let mut backup_lsn = Lsn(0); + + let mut retry_attempt = 0u32; + // offload loop + loop { + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. + error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } + + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + retry_attempt = 0; + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. 
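+            // Re-read the shared in-memory backup_lsn: another safekeeper may have
+            // already uploaded these segments, in which case there is nothing to do.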
+ backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + retry_attempt = 0; + continue; + } + + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); + if let Err(e) = res { + error!("failed to set wal_backup_lsn: {}", e); + return; + } + retry_attempt = 0; + } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); + + if retry_attempt < u32::MAX { + retry_attempt += 1; + } + } + } + } + } +} + +pub async fn backup_lsn_range( + start_lsn: Lsn, + end_lsn: Lsn, + wal_seg_size: usize, + timeline_dir: &Path, +) -> Result { + let mut res = start_lsn; + let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + for s in &segments { + backup_single_segment(s, timeline_dir) + .await + .with_context(|| format!("offloading segno {}", s.seg_no))?; + + res = s.end_lsn; + } + info!( + "offloaded segnos {:?} up to {}, previous backup_lsn {}", + segments.iter().map(|&s| s.seg_no).collect::>(), + end_lsn, + start_lsn, + ); + Ok(res) +} + +async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { + let segment_file_name = seg.file_path(timeline_dir)?; + + backup_object(&segment_file_name, seg.size()).await?; + debug!("Backup of {} done", segment_file_name.display()); + + Ok(()) +} + +#[derive(Debug, Copy, Clone)] +pub struct Segment { + seg_no: XLogSegNo, + start_lsn: Lsn, + end_lsn: Lsn, +} + +impl Segment { + pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self { + Self { + seg_no, + start_lsn, + end_lsn, + } + } + + pub fn object_name(self) -> String { + XLogFileName(PG_TLI, self.seg_no, self.size()) + } + + pub fn file_path(self, timeline_dir: &Path) -> Result { + Ok(timeline_dir.join(self.object_name())) + } + + pub fn size(self) -> usize { + (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize + } +} + +fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { + let first_seg = start.segment_number(seg_size); + let last_seg = end.segment_number(seg_size); + + let res: Vec = (first_seg..last_seg) + .map(|s| { + let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size); + let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size); + Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn)) + }) + .collect(); + res +} + +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); + +async fn backup_object(source_file: &Path, size: usize) -> Result<()> { + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); + + let file = tokio::io::BufReader::new(File::open(&source_file).await.with_context(|| { + format!( + "Failed to open file {} for wal backup", + source_file.display() + ) + })?); + + storage + .upload_storage_object(Box::new(file), size, source_file) + .await +} + +pub async fn read_object( + file_path: PathBuf, + offset: u64, +) -> anyhow::Result>> { + let storage = REMOTE_STORAGE + .get() + .context("Failed to get remote storage")? 
+ .as_ref() + .context("No remote storage configured")?; + + info!( + "segment download about to start for local path {} at offset {}", + file_path.display(), + offset + ); + let download = storage + .download_storage_object(Some((offset, None)), &file_path) + .await + .with_context(|| { + format!( + "Failed to open WAL segment download stream for local path {}", + file_path.display() + ) + })?; + + Ok(download.download_stream) +} diff --git a/walkeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs similarity index 76% rename from walkeeper/src/wal_service.rs rename to safekeeper/src/wal_service.rs index 305e59bcd3..5980160788 100644 --- a/walkeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -8,29 +8,22 @@ use std::net::{TcpListener, TcpStream}; use std::thread; use tracing::*; -use crate::callmemaybe::CallmeEvent; use crate::handler::SafekeeperPostgresHandler; use crate::SafeKeeperConf; -use tokio::sync::mpsc::UnboundedSender; -use zenith_utils::postgres_backend::{AuthType, PostgresBackend}; +use utils::postgres_backend::{AuthType, PostgresBackend}; /// Accept incoming TCP connections and spawn them into a background thread. -pub fn thread_main( - conf: SafeKeeperConf, - listener: TcpListener, - tx: UnboundedSender, -) -> Result<()> { +pub fn thread_main(conf: SafeKeeperConf, listener: TcpListener) -> Result<()> { loop { match listener.accept() { Ok((socket, peer_addr)) => { debug!("accepted connection from {}", peer_addr); let conf = conf.clone(); - let tx_clone = tx.clone(); let _ = thread::Builder::new() .name("WAL service thread".into()) .spawn(move || { - if let Err(err) = handle_socket(socket, conf, tx_clone) { + if let Err(err) = handle_socket(socket, conf) { error!("connection handler exited: {}", err); } }) @@ -51,16 +44,12 @@ fn get_tid() -> u64 { /// This is run by `thread_main` above, inside a background thread. /// -fn handle_socket( - socket: TcpStream, - conf: SafeKeeperConf, - tx: UnboundedSender, -) -> Result<()> { +fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<()> { let _enter = info_span!("", tid = ?get_tid()).entered(); socket.set_nodelay(true)?; - let mut conn_handler = SafekeeperPostgresHandler::new(conf, tx); + let mut conn_handler = SafekeeperPostgresHandler::new(conf); let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?; // libpq replication protocol between safekeeper and replicas/pagers pgbackend.run(&mut conn_handler)?; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs new file mode 100644 index 0000000000..bc5e2d7b24 --- /dev/null +++ b/safekeeper/src/wal_storage.rs @@ -0,0 +1,597 @@ +//! This module has everything to deal with WAL -- reading and writing to disk. +//! +//! Safekeeper WAL is stored in the timeline directory, in format similar to pg_wal. +//! PG timeline is always 1, so WAL segments are usually have names like this: +//! - 000000010000000000000001 +//! - 000000010000000000000002.partial +//! +//! Note that last file has `.partial` suffix, that's different from postgres. 
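+//!
+//! For example, with the default 16 MiB segment size, an LSN of 0/5000028 falls
+//! into segment number 5, which is stored as 000000010000000000000005 (or as
+//! 000000010000000000000005.partial while the segment is still being written).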
+ +use anyhow::{bail, Context, Result}; + +use std::io::{self, Seek, SeekFrom}; +use std::pin::Pin; +use tokio::io::AsyncRead; + +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; +use postgres_ffi::{XLogSegNo, PG_TLI}; +use std::cmp::{max, min}; + +use std::fs::{self, remove_file, File, OpenOptions}; +use std::io::Write; +use std::path::{Path, PathBuf}; + +use tracing::*; + +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::metrics::{time_io_closure, WalStorageMetrics}; +use crate::safekeeper::SafeKeeperState; + +use crate::wal_backup::read_object; +use crate::SafeKeeperConf; +use postgres_ffi::XLogFileName; +use postgres_ffi::XLOG_BLCKSZ; + +use postgres_ffi::waldecoder::WalStreamDecoder; + +use tokio::io::{AsyncReadExt, AsyncSeekExt}; + +pub trait Storage { + /// LSN of last durably stored WAL record. + fn flush_lsn(&self) -> Lsn; + + /// Write piece of WAL from buf to disk, but not necessarily sync it. + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + + /// Truncate WAL at specified LSN, which must be the end of WAL record. + fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + + /// Durably store WAL on disk, up to the last written WAL record. + fn flush_wal(&mut self) -> Result<()>; + + /// Remove all segments <= given segno. Returns closure as we want to do + /// that without timeline lock. + fn remove_up_to(&self) -> Box Result<()>>; + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics; +} + +/// PhysicalStorage is a storage that stores WAL on disk. Writes are separated from flushes +/// for better performance. Storage is initialized in the constructor. +/// +/// WAL is stored in segments, each segment is a file. Last segment has ".partial" suffix in +/// its filename and may be not fully flushed. +/// +/// Relationship of LSNs: +/// `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` +/// +/// When storage is created first time, all LSNs are zeroes and there are no segments on disk. +pub struct PhysicalStorage { + metrics: WalStorageMetrics, + timeline_dir: PathBuf, + conf: SafeKeeperConf, + + /// Size of WAL segment in bytes. + wal_seg_size: usize, + + /// Written to disk, but possibly still in the cache and not fully persisted. + /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. + write_lsn: Lsn, + + /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + write_record_lsn: Lsn, + + /// The LSN of the last WAL record flushed to disk. + flush_record_lsn: Lsn, + + /// Decoder is required for detecting boundaries of WAL records. + decoder: WalStreamDecoder, + + /// Cached open file for the last segment. + /// + /// If Some(file) is open, then it always: + /// - has ".partial" suffix + /// - points to write_lsn, so no seek is needed for writing + /// - doesn't point to the end of the segment + file: Option, +} + +impl PhysicalStorage { + /// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from + /// the disk. Otherwise, all LSNs are set to zero. + pub fn new( + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + state: &SafeKeeperState, + ) -> Result { + let timeline_dir = conf.timeline_dir(ttid); + let wal_seg_size = state.server.wal_seg_size as usize; + + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). 
+ // + // NB: find_end_of_wal MUST be backwards compatible with the previously + // written WAL. If find_end_of_wal fails to read any WAL written by an + // older version of the code, we could lose data forever. + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + match state.server.pg_version / 10000 { + 14 => postgres_ffi::v14::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + 15 => postgres_ffi::v15::xlog_utils::find_end_of_wal( + &timeline_dir, + wal_seg_size, + state.commit_lsn, + )?, + _ => bail!("unsupported postgres version: {}", state.server.pg_version), + } + }; + + // TODO: do we really know that write_lsn is fully flushed to disk? + // If not, maybe it's better to call fsync() here to be sure? + let flush_lsn = write_lsn; + + debug!( + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, + ); + if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + } + + Ok(PhysicalStorage { + metrics: WalStorageMetrics::default(), + timeline_dir, + conf: conf.clone(), + wal_seg_size, + write_lsn, + write_record_lsn: write_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), + file: None, + }) + } + + /// Call fdatasync if config requires so. + fn fdatasync_file(&mut self, file: &mut File) -> Result<()> { + if !self.conf.no_sync { + self.metrics + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?); + } + Ok(()) + } + + /// Call fsync if config requires so. + fn fsync_file(&mut self, file: &mut File) -> Result<()> { + if !self.conf.no_sync { + self.metrics + .observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?); + } + Ok(()) + } + + /// Open or create WAL segment file. Caller must call seek to the wanted position. + /// Returns `file` and `is_partial`. + fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + + // Try to open already completed segment + if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { + Ok((file, false)) + } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) { + // Try to open existing partial file + Ok((file, true)) + } else { + // Create and fill new partial file + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&wal_file_partial_path) + .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; + + write_zeroes(&mut file, self.wal_seg_size)?; + self.fsync_file(&mut file)?; + Ok((file, true)) + } + } + + /// Write WAL bytes, which are known to be located in a single WAL segment. + fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> { + let mut file = if let Some(file) = self.file.take() { + file + } else { + let (mut file, is_partial) = self.open_or_create(segno)?; + assert!(is_partial, "unexpected write into non-partial segment file"); + file.seek(SeekFrom::Start(xlogoff as u64))?; + file + }; + + file.write_all(buf)?; + + if xlogoff + buf.len() == self.wal_seg_size { + // If we reached the end of a WAL segment, flush and close it. 
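+            // The fdatasync happens before the rename below, so a segment is only
+            // ever visible under its final (non-.partial) name once its contents
+            // have been flushed to disk.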
+ self.fdatasync_file(&mut file)?; + + // Rename partial file to completed file + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + fs::rename(&wal_file_partial_path, &wal_file_path)?; + } else { + // otherwise, file can be reused later + self.file = Some(file); + } + + Ok(()) + } + + /// Writes WAL to the segment files, until everything is writed. If some segments + /// are fully written, they are flushed to disk. The last (partial) segment can + /// be flushed separately later. + /// + /// Updates `write_lsn`. + fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> { + if self.write_lsn != pos { + // need to flush the file before discarding it + if let Some(mut file) = self.file.take() { + self.fdatasync_file(&mut file)?; + } + + self.write_lsn = pos; + } + + while !buf.is_empty() { + // Extract WAL location for this block + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let segno = self.write_lsn.segment_number(self.wal_seg_size); + + // If crossing a WAL boundary, only write up until we reach wal segment size. + let bytes_write = if xlogoff + buf.len() > self.wal_seg_size { + self.wal_seg_size - xlogoff + } else { + buf.len() + }; + + self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?; + self.write_lsn += bytes_write as u64; + buf = &buf[bytes_write..]; + } + + Ok(()) + } +} + +impl Storage for PhysicalStorage { + /// flush_lsn returns LSN of last durably stored WAL record. + fn flush_lsn(&self) -> Lsn { + self.flush_record_lsn + } + + /// Write WAL to disk. + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + // Disallow any non-sequential writes, which can result in gaps or overwrites. + // If we need to move the pointer, use truncate_wal() instead. + if self.write_lsn > startpos { + bail!( + "write_wal rewrites WAL written before, write_lsn={}, startpos={}", + self.write_lsn, + startpos + ); + } + if self.write_lsn < startpos && self.write_lsn != Lsn(0) { + bail!( + "write_wal creates gap in written WAL, write_lsn={}, startpos={}", + self.write_lsn, + startpos + ); + } + + let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?; + // WAL is written, updating write metrics + self.metrics.observe_write_seconds(write_seconds); + self.metrics.observe_write_bytes(buf.len()); + + // figure out last record's end lsn for reporting (if we got the + // whole record) + if self.decoder.available() != startpos { + info!( + "restart decoder from {} to {}", + self.decoder.available(), + startpos, + ); + let pg_version = self.decoder.pg_version; + self.decoder = WalStreamDecoder::new(startpos, pg_version); + } + self.decoder.feed_bytes(buf); + loop { + match self.decoder.poll_decode()? { + None => break, // no full record yet + Some((lsn, _rec)) => { + self.write_record_lsn = lsn; + } + } + } + + Ok(()) + } + + fn flush_wal(&mut self) -> Result<()> { + if self.flush_record_lsn == self.write_record_lsn { + // no need to do extra flush + return Ok(()); + } + + if let Some(mut unflushed_file) = self.file.take() { + self.fdatasync_file(&mut unflushed_file)?; + self.file = Some(unflushed_file); + } else { + // We have unflushed data (write_lsn != flush_lsn), but no file. + // This should only happen if last file was fully written and flushed, + // but haven't updated flush_lsn yet. 
+ if self.write_lsn.segment_offset(self.wal_seg_size) != 0 { + bail!( + "unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}", + self.write_lsn, + self.flush_record_lsn + ); + } + } + + // everything is flushed now, let's update flush_lsn + self.flush_record_lsn = self.write_record_lsn; + Ok(()) + } + + /// Truncate written WAL by removing all WAL segments after the given LSN. + /// end_pos must point to the end of the WAL record. + fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + // Streaming must not create a hole, so truncate cannot be called on non-written lsn + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + bail!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, + end_pos + ); + } + + // Close previously opened file, if any + if let Some(mut unflushed_file) = self.file.take() { + self.fdatasync_file(&mut unflushed_file)?; + } + + let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let segno = end_pos.segment_number(self.wal_seg_size); + + // Remove all segments after the given LSN. + remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?; + + let (mut file, is_partial) = self.open_or_create(segno)?; + + // Fill end with zeroes + file.seek(SeekFrom::Start(xlogoff as u64))?; + write_zeroes(&mut file, self.wal_seg_size - xlogoff)?; + self.fdatasync_file(&mut file)?; + + if !is_partial { + // Make segment partial once again + let (wal_file_path, wal_file_partial_path) = + wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; + fs::rename(&wal_file_path, &wal_file_partial_path)?; + } + + // Update LSNs + self.write_lsn = end_pos; + self.write_record_lsn = end_pos; + self.flush_record_lsn = end_pos; + Ok(()) + } + + fn remove_up_to(&self) -> Box Result<()>> { + let timeline_dir = self.timeline_dir.clone(); + let wal_seg_size = self.wal_seg_size; + Box::new(move |segno_up_to: XLogSegNo| { + remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to) + }) + } + + fn get_metrics(&self) -> WalStorageMetrics { + self.metrics.clone() + } +} + +/// Remove all WAL segments in timeline_dir that match the given predicate. +fn remove_segments_from_disk( + timeline_dir: &Path, + wal_seg_size: usize, + remove_predicate: impl Fn(XLogSegNo) -> bool, +) -> Result<()> { + let mut n_removed = 0; + let mut min_removed = u64::MAX; + let mut max_removed = u64::MIN; + + for entry in fs::read_dir(&timeline_dir)? 
{ + let entry = entry?; + let entry_path = entry.path(); + let fname = entry_path.file_name().unwrap(); + + if let Some(fname_str) = fname.to_str() { + /* Ignore files that are not XLOG segments */ + if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) { + continue; + } + let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); + if remove_predicate(segno) { + remove_file(entry_path)?; + n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); + } + } + } + + if n_removed > 0 { + info!( + "removed {} WAL segments [{}; {}]", + n_removed, min_removed, max_removed + ); + } + Ok(()) +} + +pub struct WalReader { + timeline_dir: PathBuf, + wal_seg_size: usize, + pos: Lsn, + wal_segment: Option>>, + + // S3 will be used to read WAL if LSN is not available locally + enable_remote_read: bool, + + // We don't have WAL locally if LSN is less than local_start_lsn + local_start_lsn: Lsn, +} + +impl WalReader { + pub fn new( + timeline_dir: PathBuf, + state: &SafeKeeperState, + start_pos: Lsn, + enable_remote_read: bool, + ) -> Result { + if start_pos < state.timeline_start_lsn { + bail!( + "Requested streaming from {}, which is before the start of the timeline {}", + start_pos, + state.timeline_start_lsn + ); + } + + // TODO: add state.timeline_start_lsn == Lsn(0) check + if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { + bail!("state uninitialized, no data to read"); + } + + Ok(Self { + timeline_dir, + wal_seg_size: state.server.wal_seg_size as usize, + pos: start_pos, + wal_segment: None, + enable_remote_read, + local_start_lsn: state.local_start_lsn, + }) + } + + pub async fn read(&mut self, buf: &mut [u8]) -> Result { + let mut wal_segment = match self.wal_segment.take() { + Some(reader) => reader, + None => self.open_segment().await?, + }; + + // How much to read and send in message? We cannot cross the WAL file + // boundary, and we don't want send more than provided buffer. + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let send_size = min(buf.len(), self.wal_seg_size - xlogoff); + + // Read some data from the file. + let buf = &mut buf[0..send_size]; + let send_size = wal_segment.read_exact(buf).await?; + self.pos += send_size as u64; + + // Decide whether to reuse this file. If we don't set wal_segment here + // a new reader will be opened next time. + if self.pos.segment_offset(self.wal_seg_size) != 0 { + self.wal_segment = Some(wal_segment); + } + + Ok(send_size) + } + + /// Open WAL segment at the current position of the reader. 
+ async fn open_segment(&self) -> Result<Pin<Box<dyn AsyncRead + Send + Sync>>> { + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let segno = self.pos.segment_number(self.wal_seg_size); + let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); + let wal_file_path = self.timeline_dir.join(wal_file_name); + + // Try to open local file, if we may have WAL locally + if self.pos >= self.local_start_lsn { + let res = Self::open_wal_file(&wal_file_path).await; + match res { + Ok(mut file) => { + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + return Ok(Box::pin(file)); + } + Err(e) => { + let is_not_found = e.chain().any(|e| { + if let Some(e) = e.downcast_ref::<io::Error>() { + e.kind() == io::ErrorKind::NotFound + } else { + false + } + }); + if !is_not_found { + return Err(e); + } + // NotFound is expected, fall through to remote read + } + }; + } + + // Try to open remote file, if remote reads are enabled + if self.enable_remote_read { + return read_object(wal_file_path, xlogoff as u64).await; + } + + bail!("WAL segment is not found") + } + + /// Helper function for opening a wal file. + async fn open_wal_file(wal_file_path: &Path) -> Result<tokio::fs::File> { + // First try to open the .partial file. + let mut partial_path = wal_file_path.to_owned(); + partial_path.set_extension("partial"); + if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await { + return Ok(opened_file); + } + + // If that failed, try it without the .partial extension. + tokio::fs::File::open(&wal_file_path) + .await + .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) + .map_err(|e| { + warn!("{}", e); + e + }) + } +} + +/// Zero block for filling created WAL segments. +const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; + +/// Helper for filling file with zeroes. +fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { + while count >= XLOG_BLCKSZ { + file.write_all(ZERO_BLOCK)?; + count -= XLOG_BLCKSZ; + } + file.write_all(&ZERO_BLOCK[0..count])?; + Ok(()) +} + +/// Helper returning full path to WAL segment file and its .partial brother.
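+/// For example, with 16 MB segments, segment number 1 on timeline 1 would map to +/// `000000010000000000000001` and `000000010000000000000001.partial` inside `timeline_dir` +/// (illustrative only; the exact name is produced by `XLogFileName`).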
+fn wal_file_paths( + timeline_dir: &Path, + segno: XLogSegNo, + wal_seg_size: usize, +) -> Result<(PathBuf, PathBuf)> { + let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); + let wal_file_path = timeline_dir.join(wal_file_name.clone()); + let wal_file_partial_path = timeline_dir.join(wal_file_name + ".partial"); + Ok((wal_file_path, wal_file_partial_path)) +} diff --git a/scripts/coverage b/scripts/coverage index f2c46d9ae9..1dc92e57cc 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -9,13 +9,6 @@ # * https://github.com/taiki-e/cargo-llvm-cov # * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import Path -from tempfile import TemporaryDirectory -from textwrap import dedent -from typing import Any, Dict, Iterator, Iterable, List, Optional - import argparse import hashlib import json @@ -24,6 +17,12 @@ import shutil import socket import subprocess import sys +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import Any, Dict, Iterable, Iterator, List, Optional def file_mtime_or_zero(path: Path) -> int: @@ -76,8 +75,6 @@ class Cargo: def rustlib_dir(self) -> Path: if not self._rustlib_dir: cmd = [ - 'cargo', - '-Zunstable-options', 'rustc', '--print=target-libdir', ] @@ -398,7 +395,7 @@ class State: # Enable LLVM's source-based coverage # see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html # see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html - '-Zinstrument-coverage', + '-Cinstrument-coverage', # Link every bit of code to prevent "holes" in coverage report # see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code '-Clink-dead-code', @@ -411,10 +408,6 @@ class State: f'--remap-path-prefix {self.cwd}=', ]) - # XXX: God, have mercy on our souls... - # see: https://github.com/rust-lang/rust/pull/90132 - os.environ['RUSTC_BOOTSTRAP'] = '1' - def _merge_profraw(self) -> bool: profdata_path = self.profdata_dir / '-'.join([ self.profraw_prefix, diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py new file mode 100755 index 0000000000..1734038661 --- /dev/null +++ b/scripts/export_import_between_pageservers.py @@ -0,0 +1,746 @@ +# +# Script to export tenants from one pageserver and import them into another page server. +# +# Outline of steps: +# 1. Get `(last_lsn, prev_lsn)` from old pageserver +# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file +# 3. This tar file might be missing relation files for empty relations, if the pageserver +# is old enough (we didn't always store those). So to recreate them, we start a local +# vanilla postgres on this basebackup and ask it what relations should exist, then touch +# any missing files and re-pack the tar. +# TODO This functionality is no longer needed, so we can delete it later if we don't +# end up using the same utils for the pg 15 upgrade. Not sure. +# 4. We import the patched basebackup into a new pageserver +# 5. We export again via fullbackup, now from the new pageserver and compare the returned +# tar file with the one we imported. This confirms that we imported everything that was +# exported, but doesn't guarantee correctness (what if we didn't **export** everything +# initially?) +# 6. 
We wait for the new pageserver's remote_consistent_lsn to catch up +# +# For more context on how to use this, see: +# https://github.com/neondatabase/cloud/wiki/Storage-format-migration + +import argparse +import os +import shutil +import subprocess +import tempfile +import time +import uuid +from contextlib import closing +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, cast + +import psycopg2 +import requests +from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import parse_dsn + +############################################### +### client-side utils copied from test fixtures +############################################### + +Env = Dict[str, str] + +_global_counter = 0 + + +def global_counter() -> int: + """A really dumb global counter. + This is useful for giving output files a unique number, so if we run the + same command multiple times we can keep their output separate. + """ + global _global_counter + _global_counter += 1 + return _global_counter + + +def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: + """Run a process and capture its output + Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" + where "cmd" is the name of the program and NNN is an incrementing + counter. + If those files already exist, we will overwrite them. + Returns basepath for files with captured output. + """ + assert type(cmd) is list + base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) + basepath = os.path.join(capture_dir, base) + stdout_filename = basepath + ".stdout" + stderr_filename = basepath + ".stderr" + + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: + print('(capturing output to "{}.stdout")'.format(base)) + subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + + return basepath + + +class PgBin: + """A helper class for executing postgres binaries""" + + def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): + self.log_dir = log_dir + self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.env = os.environ.copy() + self.env["LD_LIBRARY_PATH"] = os.path.join( + str(pg_distrib_dir), "v{}".format(pg_version), "lib" + ) + + def _fixpath(self, command: List[str]): + if "/" not in command[0]: + command[0] = os.path.join(self.pg_bin_path, command[0]) + + def _build_env(self, env_add: Optional[Env]) -> Env: + if env_add is None: + return self.env + env = self.env.copy() + env.update(env_add) + return env + + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + """ + Run one of the postgres binaries. + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + All the necessary environment variables will be set. + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + + self._fixpath(command) + print('Running command "{}"'.format(" ".join(command))) + env = self._build_env(env) + subprocess.run(command, env=env, cwd=cwd, check=True) + + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: + """ + Run one of the postgres binaries, with stderr and stdout redirected to a file. + This is just like `run`, but for chatty programs. Returns basepath for files + with captured output. 
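+        Example (illustrative): `run_capture(["pgbench", "-i"])` writes pgbench_NNN.stdout +        and pgbench_NNN.stderr under `self.log_dir` and returns their common basepath.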
+ """ + + self._fixpath(command) + print('Running command "{}"'.format(" ".join(command))) + env = self._build_env(env) + return subprocess_capture( + str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs + ) + + +class PgProtocol: + """Reusable connection logic""" + + def __init__(self, **kwargs): + self.default_options = kwargs + + def conn_options(self, **kwargs): + conn_options = self.default_options.copy() + if "dsn" in kwargs: + conn_options.update(parse_dsn(kwargs["dsn"])) + conn_options.update(kwargs) + + # Individual statement timeout in seconds. 2 minutes should be + # enough for our tests, but if you need a longer, you can + # change it by calling "SET statement_timeout" after + # connecting. + conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" + + return conn_options + + # autocommit=True here by default because that's what we need most of the time + def connect(self, autocommit=True, **kwargs) -> PgConnection: + """ + Connect to the node. + Returns psycopg2's connection object. + This method passes all extra params to connstr. + """ + conn = psycopg2.connect(**self.conn_options(**kwargs)) + + # WARNING: this setting affects *all* tests! + conn.autocommit = autocommit + return conn + + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: + """ + Execute query against the node and return all rows. + This method passes all extra params to connstr. + """ + return self.safe_psql_many([query], **kwargs)[0] + + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. + """ + result: List[List[Any]] = [] + with closing(self.connect(**kwargs)) as conn: + with conn.cursor() as cur: + for query in queries: + print(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cast(List[Any], cur.fetchall())) + return result + + +class VanillaPostgres(PgProtocol): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): + super().__init__(host="localhost", port=port, dbname="postgres") + self.pgdatadir = pgdatadir + self.pg_bin = pg_bin + self.running = False + if init: + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) + self.configure([f"port = {port}\n"]) + + def configure(self, options: List[str]): + """Append lines into postgresql.conf file.""" + assert not self.running + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: + conf_file.write("\n".join(options)) + + def start(self, log_path: Optional[str] = None): + assert not self.running + self.running = True + + if log_path is None: + log_path = os.path.join(self.pgdatadir, "pg.log") + + self.pg_bin.run_capture( + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) + + def stop(self): + assert self.running + self.running = False + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + if self.running: + self.stop() + + +class NeonPageserverApiException(Exception): + pass + + +class NeonPageserverHttpClient(requests.Session): + def __init__(self, host, port): + super().__init__() + self.host = host + self.port = port + + def verbose_error(self, res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = 
res.json()["msg"] + except: # noqa: E722 + msg = "" + raise NeonPageserverApiException(msg) from e + + def check_status(self): + self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() + + def tenant_list(self): + res = self.get(f"http://{self.host}:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): + res = self.post( + f"http://{self.host}:{self.port}/v1/tenant", + json={ + "new_tenant_id": new_tenant_id.hex, + }, + ) + + if res.status_code == 409: + if ok_if_exists: + print(f"could not create tenant: already exists for id {new_tenant_id}") + else: + res.raise_for_status() + elif res.status_code == 201: + print(f"created tenant {new_tenant_id}") + else: + self.verbose_error(res) + + return new_tenant_id + + def timeline_list(self, tenant_id: uuid.UUID): + res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + +def lsn_to_hex(num: int) -> str: + """Convert lsn from int to standard hex notation.""" + return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) + + +def lsn_from_hex(lsn_hex: str) -> int: + """Convert lsn from hex notation to int.""" + l, r = lsn_hex.split("/") + return (int(l, 16) << 32) + int(r, 16) + + +def remote_consistent_lsn( + pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID +) -> int: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + lsn_str = detail["remote_consistent_lsn"] + if lsn_str is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it has been uploaded to remote + # storage yet. + return 0 + else: + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) + + +def wait_for_upload( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID, + lsn: int, +): + """waits for local timeline upload up to specified lsn""" + for i in range(10): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + print( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 + ) + ) + time.sleep(1) + + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn_to_hex(lsn), lsn_to_hex(current_lsn) + ) + ) + + +############## +# End of utils +############## + + +def pack_base(log_dir, restored_dir, output_tar): + """Create tar file from basebackup, being careful to produce relative filenames.""" + tmp_tar_name = "tmp.tar" + tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) + cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) + # We actually cd into the dir and call tar from there. If we call tar from + # outside we won't encode filenames as relative, and they won't parse well + # on import. 
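+    # Roughly the shell equivalent of the two steps below (illustrative): +    #   (cd "$restored_dir" && tar -cf tmp.tar *) && mv "$restored_dir/tmp.tar" "$output_tar"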
+ subprocess_capture(log_dir, cmd, cwd=restored_dir) + shutil.move(tmp_tar_path, output_tar) + + +def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): + """Reconstruct what relation files should exist in the datadir by querying postgres.""" + with tempfile.TemporaryDirectory() as restored_dir: + # Unpack the base tar + subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) + + # Start a vanilla postgres from the given datadir and query it to find + # what relfiles should exist, but possibly don't. + with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) + + # Create database based on template0 because we can't connect to template0 + query = "create database template0copy template template0" + vanilla_pg.safe_psql(query, user="cloud_admin") + vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") + + # Get all databases + query = "select oid, datname from pg_database" + oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") + template0_oid = [ + oid for (oid, database) in oid_dbname_pairs if database == "template0" + ][0] + + # Get rel paths for each database + for oid, database in oid_dbname_pairs: + if database == "template0": + # We can't connect to template0 + continue + + query = "select relname, pg_relation_filepath(oid) from pg_class" + result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) + for relname, filepath in result: + if filepath is not None: + + if database == "template0copy": + # Add all template0copy paths to template0 + prefix = f"base/{oid}/" + if filepath.startswith(prefix): + suffix = filepath[len(prefix) :] + yield f"base/{template0_oid}/{suffix}" + elif filepath.startswith("global"): + print(f"skipping {database} global file {filepath}") + else: + raise AssertionError + else: + yield filepath + + +def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths): + """Add the appropriate empty files to a basebackup tar.""" + with tempfile.TemporaryDirectory() as restored_dir: + # Unpack the base tar + subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) + + # Touch files that don't exist + for path in paths: + absolute_path = os.path.join(restored_dir, path) + exists = os.path.exists(absolute_path) + if not exists: + print(f"File {absolute_path} didn't exist. Creating..") + Path(absolute_path).touch() + + # Repackage + pack_base(log_dir, restored_dir, output_tar) + + +# HACK This is a workaround for exporting from old pageservers that +# can't export empty relations. In this case we need to start +# a vanilla postgres from the exported datadir, and query it +# to see what empty relations are missing, and then create +# those empty files before importing.
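+# +# The flow, using the helpers defined above (this is what add_missing_rels() below does): +# +#   paths = reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)  # ask vanilla postgres what should exist +#   touch_missing_rels(log_dir, base_tar, output_tar, set(paths))      # touch the missing files and re-pack the tar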
+def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): + reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) + touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) + + +def get_rlsn(pageserver_connstr, tenant_id, timeline_id): + conn = psycopg2.connect(pageserver_connstr) + conn.autocommit = True + with conn.cursor() as cur: + cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" + cur.execute(cmd) + res = cur.fetchone() + prev_lsn = res[0] + last_lsn = res[1] + conn.close() + + return last_lsn, prev_lsn + + +def import_timeline( + args, + psql_path, + pageserver_connstr, + pageserver_http, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, +): + # Import timelines to new pageserver + import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" + full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ + + stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") + stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout") + + print(f"Running: {full_cmd}") + + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename2, "w") as stderr_f: + print(f"(capturing output to {stdout_filename})") + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) + subprocess.run( + full_cmd, + stdout=stdout_f, + stderr=stderr_f, + env=pg_bin._build_env(None), + shell=True, + check=True, + ) + + print("Done import") + + # Wait until pageserver persists the files + wait_for_upload( + pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) + ) + + +def export_timeline( + args, + psql_path, + pageserver_connstr, + tenant_id, + timeline_id, + last_lsn, + prev_lsn, + tar_filename, + pg_version, +): + # Choose filenames + incomplete_filename = tar_filename + ".incomplete" + stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") + + # Construct export command + query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" + cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] + + # Run export command + print(f"Running: {cmd}") + with open(incomplete_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: + print(f"(capturing output to {incomplete_filename})") + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) + subprocess.run( + cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True + ) + + # Add missing rels + pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) + add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) + + # Log more info + file_size = os.path.getsize(tar_filename) + print(f"Done export: {tar_filename}, size {file_size}") + + +def main(args: argparse.Namespace): + # any psql version will do here. 
use current DEFAULT_PG_VERSION = 14 + psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") + + old_pageserver_host = args.old_pageserver_host + new_pageserver_host = args.new_pageserver_host + + old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) + old_http_client.check_status() + old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" + + new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) + new_http_client.check_status() + new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" + + for tenant_id in args.tenants: + print(f"Tenant: {tenant_id}") + timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) + print(f"Timelines: {timelines}") + + # Create tenant in new pageserver + if args.only_import is False and not args.timelines: + new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) + + for timeline in timelines: + # Skip timelines we don't need to export + if args.timelines and timeline["timeline_id"] not in args.timelines: + print(f"Skipping timeline {timeline['timeline_id']}") + continue + + # Choose filenames + tar_filename = os.path.join( + args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" + ) + + pg_version = timeline["pg_version"] + + # Export timeline from old pageserver + if args.only_import is False: + last_lsn, prev_lsn = get_rlsn( + old_pageserver_connstr, + timeline["tenant_id"], + timeline["timeline_id"], + ) + export_timeline( + args, + psql_path, + old_pageserver_connstr, + timeline["tenant_id"], + timeline["timeline_id"], + last_lsn, + prev_lsn, + tar_filename, + pg_version, + ) + + # Import into new pageserver + import_timeline( + args, + psql_path, + new_pageserver_connstr, + new_http_client, + timeline["tenant_id"], + timeline["timeline_id"], + last_lsn, + prev_lsn, + tar_filename, + pg_version, + ) + + # Re-export and compare + re_export_filename = tar_filename + ".reexport" + export_timeline( + args, + psql_path, + new_pageserver_connstr, + timeline["tenant_id"], + timeline["timeline_id"], + last_lsn, + prev_lsn, + re_export_filename, + pg_version, + ) + + # Check the size is the same + old_size = (os.path.getsize(tar_filename),) + new_size = (os.path.getsize(re_export_filename),) + if old_size != new_size: + raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") + + +def non_zero_tcp_port(arg: Any): + port = int(arg) + if port < 1 or port > 65535: + raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") + return port + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tenant-id", + dest="tenants", + required=True, + nargs="+", + help="Id of the tenant to migrate. You can pass multiple arguments", + ) + parser.add_argument( + "--timeline-id", + dest="timelines", + required=False, + nargs="+", + help="Id of the timeline to migrate. You can pass multiple arguments", + ) + parser.add_argument( + "--from-host", + dest="old_pageserver_host", + required=True, + help="Host of the pageserver to migrate data from", + ) + parser.add_argument( + "--from-http-port", + dest="old_pageserver_http_port", + required=False, + type=int, + default=9898, + help="HTTP port of the pageserver to migrate data from. 
Default: 9898", + ) + parser.add_argument( + "--from-pg-port", + dest="old_pageserver_pg_port", + required=False, + type=int, + default=6400, + help="pg port of the pageserver to migrate data from. Default: 6400", + ) + parser.add_argument( + "--to-host", + dest="new_pageserver_host", + required=True, + help="Host of the pageserver to migrate data to", + ) + parser.add_argument( + "--to-http-port", + dest="new_pageserver_http_port", + required=False, + default=9898, + type=int, + help="HTTP port of the pageserver to migrate data to. Default: 9898", + ) + parser.add_argument( + "--to-pg-port", + dest="new_pageserver_pg_port", + required=False, + default=6400, + type=int, + help="pg port of the pageserver to migrate data to. Default: 6400", + ) + parser.add_argument( + "--ignore-tenant-exists", + dest="ok_if_exists", + required=False, + help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", + ) + parser.add_argument( + "--pg-distrib-dir", + dest="pg_distrib_dir", + required=False, + default="/usr/local/", + help="Path where postgres binaries are installed. Default: /usr/local/", + ) + parser.add_argument( + "--psql-path", + dest="psql_path", + required=False, + default="/usr/local/v14/bin/psql", + help="Path to the psql binary. Default: /usr/local/v14/bin/psql", + ) + parser.add_argument( + "--only-import", + dest="only_import", + required=False, + default=False, + action="store_true", + help="Skip export and tenant creation part", + ) + parser.add_argument( + "--work-dir", + dest="work_dir", + required=True, + default=False, + help="directory where temporary tar files are stored", + ) + parser.add_argument( + "--tmp-pg-port", + dest="tmp_pg_port", + required=False, + default=55439, + type=non_zero_tcp_port, + help="localhost port to use for temporary postgres instance", + ) + args = parser.parse_args() + main(args) diff --git a/scripts/generate_and_push_perf_report.sh b/scripts/generate_and_push_perf_report.sh index df84fa0dd8..9e03302b0f 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -5,8 +5,8 @@ set -eux -o pipefail SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -echo "Uploading perf report to zenith pg" -# ingest per test results data into zenith backed postgres running in staging to build grafana reports on that data +echo "Uploading perf report to neon pg" +# ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" # Activate poetry's venv. 
Needed because git upload does not run in a project dir (it uses tmp to store the repository) @@ -16,8 +16,8 @@ DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_ echo "Uploading perf result to zenith-perf-data" scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/zenithdb/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA zenith revision" \ + --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ + --message="add performance test result for $GITHUB_SHA neon revision" \ --branch=master \ copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ --merge \ diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py index a15d04e056..b5b49bb600 100755 --- a/scripts/generate_perf_report_page.py +++ b/scripts/generate_perf_report_page.py @@ -1,31 +1,37 @@ #!/usr/bin/env python3 import argparse +import json from dataclasses import dataclass from pathlib import Path -import json from typing import Any, Dict, List, Optional, Tuple, cast + from jinja2 import Template # skip 'input' columns. They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset({ - 'scale', - 'duration', - 'number_of_clients', - 'number_of_threads', - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) +EXCLUDE_COLUMNS = frozenset( + { + "scale", + "duration", + "number_of_clients", + "number_of_threads", + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) -KEY_EXCLUDE_FIELDS = frozenset({ - 'init_start_timestamp', - 'init_end_timestamp', - 'run_start_timestamp', - 'run_end_timestamp', -}) -NEGATIVE_COLOR = 'negative' -POSITIVE_COLOR = 'positive' +KEY_EXCLUDE_FIELDS = frozenset( + { + "init_start_timestamp", + "init_end_timestamp", + "run_start_timestamp", + "run_end_timestamp", + } +) +NEGATIVE_COLOR = "negative" +POSITIVE_COLOR = "positive" +EPS = 1e-6 @dataclass @@ -54,74 +60,76 @@ def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], Li value_columns = [] common_columns = [] for item in values: - if item['name'] in KEY_EXCLUDE_FIELDS: + if item["name"] in KEY_EXCLUDE_FIELDS: continue - if item['report'] != 'test_param': - value_columns.append(cast(str, item['name'])) + if item["report"] != "test_param": + value_columns.append(cast(str, item["name"])) else: - common_columns.append((cast(str, item['name']), cast(str, item['value']))) + common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) value_columns.sort() common_columns.sort(key=lambda x: x[0]) # sort by name return common_columns, value_columns def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = '' - sign = '+' if ratio > 0 else '' + color = "" + sign = "+" if ratio > 0 else "" if abs(ratio) < 0.05: - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color - if report not in {'test_param', 'higher_is_better', 'lower_is_better'}: - raise ValueError(f'Unknown report type: {report}') + if report not in {"test_param", "higher_is_better", "lower_is_better"}: + raise ValueError(f"Unknown report type: {report}") - if report == 'test_param': - return f'{ratio:.2f}', color + if report == "test_param": + return f"{ratio:.2f}", color if ratio > 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = POSITIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = 
NEGATIVE_COLOR elif ratio < 0: - if report == 'higher_is_better': + if report == "higher_is_better": color = NEGATIVE_COLOR - elif report == 'lower_is_better': + elif report == "lower_is_better": color = POSITIVE_COLOR - return f' ({sign}{ratio:.2f})', color + return f" ({sign}{ratio:.2f})", color def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values['data']: - if item['name'] == name: + for item in suit_run.values["data"]: + if item["name"] == name: return cast(Dict[str, Any], item) return None -def get_row_values(columns: List[str], run_result: SuitRun, - prev_result: Optional[SuitRun]) -> List[RowValue]: +def get_row_values( + columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] +) -> List[RowValue]: row_values = [] for column in columns: current_value = extract_value(column, run_result) if current_value is None: # should never happen - raise ValueError(f'{column} not found in {run_result.values}') + raise ValueError(f"{column} not found in {run_result.values}") value = current_value["value"] if isinstance(value, float): - value = f'{value:.2f}' + value = f"{value:.2f}" if prev_result is None: - row_values.append(RowValue(value, '', '')) + row_values.append(RowValue(value, "", "")) continue prev_value = extract_value(column, prev_result) if prev_value is None: # this might happen when new metric is added and there is no value for it in previous run # let this be here, TODO add proper handling when this actually happens - raise ValueError(f'{column} not found in previous result') - ratio = float(value) / float(prev_value['value']) - 1 - ratio_display, color = format_ratio(ratio, current_value['report']) + raise ValueError(f"{column} not found in previous result") + # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero + ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 + ratio_display, color = format_ratio(ratio, current_value["report"]) row_values.append(RowValue(value, color, ratio_display)) return row_values @@ -137,8 +145,10 @@ def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> Lis prev_run = None for run in runs: rows.append( - SuiteRunTableRow(revision=run.revision, - values=get_row_values(value_columns, run, prev_run))) + SuiteRunTableRow( + revision=run.revision, values=get_row_values(value_columns, run, prev_run) + ) + ) prev_run = run return rows @@ -150,27 +160,29 @@ def main(args: argparse.Namespace) -> None: # we have files in form: _.json # fill them in the hashmap so we have grouped items for the # same run configuration (scale, duration etc.) ordered by counter. 
- for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])): + for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): run_data = json.loads(item.read_text()) - revision = run_data['revision'] + revision = run_data["revision"] - for suit_result in run_data['result']: - key = "{}{}".format(run_data['platform'], suit_result['suit']) + for suit_result in run_data["result"]: + key = "{}{}".format(run_data["platform"], suit_result["suit"]) # pack total duration as a synthetic value - total_duration = suit_result['total_duration'] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) - common_columns, value_columns = get_columns(suit_result['data']) + total_duration = suit_result["total_duration"] + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) + common_columns, value_columns = get_columns(suit_result["data"]) grouped_runs.setdefault( key, SuitRuns( - platform=run_data['platform'], - suit=suit_result['suit'], + platform=run_data["platform"], + suit=suit_result["suit"], common_columns=common_columns, value_columns=value_columns, runs=[], @@ -182,26 +194,26 @@ def main(args: argparse.Namespace) -> None: for result in grouped_runs.values(): suit = result.suit context[suit] = { - 'common_columns': result.common_columns, - 'value_columns': result.value_columns, - 'platform': result.platform, + "common_columns": result.common_columns, + "value_columns": result.value_columns, + "platform": result.platform, # reverse the order so newest results are on top of the table - 'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)), + "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), } - template = Template((Path(__file__).parent / 'perf_report_template.html').read_text()) + template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) Path(args.out).write_text(template.render(context=context)) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--input-dir', - dest='input_dir', + "--input-dir", + dest="input_dir", required=True, - help='Directory with jsons generated by the test suite', + help="Directory with jsons generated by the test suite", ) - parser.add_argument('--out', required=True, help='Output html file path') + parser.add_argument("--out", required=True, help="Output html file path") args = parser.parse_args() main(args) diff --git a/scripts/git-upload b/scripts/git-upload index 4649f6998d..d56c0f8e94 100755 --- a/scripts/git-upload +++ b/scripts/git-upload @@ -1,17 +1,16 @@ #!/usr/bin/env python3 -from contextlib import contextmanager -import shlex -from tempfile import TemporaryDirectory -from distutils.dir_util import copy_tree -from pathlib import Path - import argparse import os +import shlex import shutil import subprocess import sys import textwrap +from contextlib import contextmanager +from distutils.dir_util import copy_tree +from pathlib import Path +from tempfile import TemporaryDirectory from typing import Optional @@ -80,12 +79,14 @@ class GitRepo: print('No changes detected, quitting') return - run([ + git_with_user = [ 'git', '-c', 'user.name=vipvap', '-c', 'user.email=vipvap@zenith.tech', + ] + run(git_with_user + [ 'commit', '--author="vipvap "', f'--message={message}', @@ -94,7 +95,7 @@ class GitRepo: for _ in range(5): 
try: run(['git', 'fetch', 'origin', branch]) - run(['git', 'rebase', f'origin/{branch}']) + run(git_with_user + ['rebase', f'origin/{branch}']) run(['git', 'push', 'origin', branch]) return diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 89463c986a..7f2af290a2 100644 --- a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -1,12 +1,14 @@ #!/usr/bin/env python3 import argparse -from contextlib import contextmanager import json import os +import sys +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path + import psycopg2 import psycopg2.extras -from pathlib import Path -from datetime import datetime CREATE_TABLE = """ CREATE TABLE IF NOT EXISTS perf_test_results ( @@ -24,15 +26,15 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( def err(msg): - print(f'error: {msg}') - exit(1) + print(f"error: {msg}") + sys.exit(1) @contextmanager def get_connection_cursor(): - connstr = os.getenv('DATABASE_URL') + connstr = os.getenv("DATABASE_URL") if not connstr: - err('DATABASE_URL environment variable is not set') + err("DATABASE_URL environment variable is not set") with psycopg2.connect(connstr) as conn: with conn.cursor() as cur: yield cur @@ -42,35 +44,37 @@ def create_table(cur): cur.execute(CREATE_TABLE) -def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) -> int: - run_data = json.loads(data_dile.read_text()) - revision = run_data['revision'] - platform = run_data['platform'] +def ingest_perf_test_result(cursor, data_file: Path, recorded_at_timestamp: int) -> int: + run_data = json.loads(data_file.read_text()) + revision = run_data["revision"] + platform = run_data["platform"] - run_result = run_data['result'] + run_result = run_data["result"] args_list = [] for suit_result in run_result: - suit = suit_result['suit'] - total_duration = suit_result['total_duration'] + suit = suit_result["suit"] + total_duration = suit_result["total_duration"] - suit_result['data'].append({ - 'name': 'total_duration', - 'value': total_duration, - 'unit': 's', - 'report': 'lower_is_better', - }) + suit_result["data"].append( + { + "name": "total_duration", + "value": total_duration, + "unit": "s", + "report": "lower_is_better", + } + ) - for metric in suit_result['data']: + for metric in suit_result["data"]: values = { - 'suit': suit, - 'revision': revision, - 'platform': platform, - 'metric_name': metric['name'], - 'metric_value': metric['value'], - 'metric_unit': metric['unit'], - 'metric_report_type': metric['report'], - 'recorded_at_timestamp': datetime.utcfromtimestamp(recorded_at_timestamp), + "suit": suit, + "revision": revision, + "platform": platform, + "metric_name": metric["name"], + "metric_value": metric["value"], + "metric_unit": metric["unit"], + "metric_report_type": metric["report"], + "recorded_at_timestamp": datetime.utcfromtimestamp(recorded_at_timestamp), } args_list.append(values) @@ -104,13 +108,16 @@ def ingest_perf_test_result(cursor, data_dile: Path, recorded_at_timestamp: int) def main(): - parser = argparse.ArgumentParser(description='Perf test result uploader. \ - Database connection string should be provided via DATABASE_URL environment variable', ) + parser = argparse.ArgumentParser( + description="Perf test result uploader. 
\ + Database connection string should be provided via DATABASE_URL environment variable", + ) parser.add_argument( - '--ingest', + "--ingest", type=Path, - help='Path to perf test result file, or directory with perf test result files') - parser.add_argument('--initdb', action='store_true', help='Initialuze database') + help="Path to perf test result file, or directory with perf test result files", + ) + parser.add_argument("--initdb", action="store_true", help="Initialuze database") args = parser.parse_args() with get_connection_cursor() as cur: @@ -118,19 +125,19 @@ def main(): create_table(cur) if not args.ingest.exists(): - err(f'ingest path {args.ingest} does not exist') + err(f"ingest path {args.ingest} does not exist") if args.ingest: if args.ingest.is_dir(): - for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split('_')[0])): - recorded_at_timestamp = int(item.name.split('_')[0]) + for item in sorted(args.ingest.iterdir(), key=lambda x: int(x.name.split("_")[0])): + recorded_at_timestamp = int(item.name.split("_")[0]) ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {item}') + print(f"Ingested {ingested} metric values from {item}") else: - recorded_at_timestamp = int(args.ingest.name.split('_')[0]) + recorded_at_timestamp = int(args.ingest.name.split("_")[0]) ingested = ingest_perf_test_result(cur, args.ingest, recorded_at_timestamp) - print(f'Ingested {ingested} metric values from {args.ingest}') + print(f"Ingested {ingested} metric values from {args.ingest}") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py new file mode 100644 index 0000000000..974167483a --- /dev/null +++ b/scripts/ingest_regress_test_result.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +import argparse +import os +import re +import sys +from contextlib import contextmanager +from pathlib import Path + +import psycopg2 + +CREATE_TABLE = """ +CREATE TABLE IF NOT EXISTS regress_test_results ( + id SERIAL PRIMARY KEY, + reference CHAR(255), + revision CHAR(40), + build_type CHAR(16), + data JSONB +) +""" + + +def err(msg): + print(f"error: {msg}") + sys.exit(1) + + +@contextmanager +def get_connection_cursor(): + connstr = os.getenv("DATABASE_URL") + if not connstr: + err("DATABASE_URL environment variable is not set") + with psycopg2.connect(connstr, connect_timeout=30) as conn: + with conn.cursor() as cur: + yield cur + + +def create_table(cur): + cur.execute(CREATE_TABLE) + + +def ingest_regress_test_result( + cursor, reference: str, revision: str, build_type: str, data_file: Path +): + data = data_file.read_text() + # In the JSON report we can have lines related to LazyFixture with escaped double-quote + # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us + # + # "" -> "" + data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) + values = ( + reference, + revision, + build_type, + data, + ) + cursor.execute( + """ + INSERT INTO regress_test_results ( + reference, + revision, + build_type, + data + ) VALUES (%s, %s, %s, %s) + """, + values, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Regress test result uploader. 
\ + Database connection string should be provided via DATABASE_URL environment variable", + ) + parser.add_argument("--initdb", action="store_true", help="Initialize database") + parser.add_argument( + "--reference", type=str, required=True, help="git reference, for example refs/heads/main" + ) + parser.add_argument("--revision", type=str, required=True, help="git revision") + parser.add_argument( + "--build-type", type=str, required=True, help="build type: release, debug or remote" + ) + parser.add_argument( + "--ingest", type=Path, required=True, help="Path to regress test result file" + ) + + args = parser.parse_args() + with get_connection_cursor() as cur: + if args.initdb: + create_table(cur) + + if not args.ingest.exists(): + err(f"ingest path {args.ingest} does not exist") + + ingest_regress_test_result( + cur, + reference=args.reference, + revision=args.revision, + build_type=args.build_type, + data_file=args.ingest, + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/ninstall.sh b/scripts/ninstall.sh new file mode 100755 index 0000000000..3554e3e4df --- /dev/null +++ b/scripts/ninstall.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail +# GNU coreutils' `install -C` always overwrites the destination if the source +# is not a regular file, which is the case with lots of headers symlinked into +# the build directory by `./configure`. That causes Rust's Cargo to think that +# Postgres headers have been updated after a `make` call even if no files have been +# touched. That causes long recompilation of `postgres_ffi` and all dependent +# packages. To counter that, we handle a special case here: do not copy the file +# if its content did not change. We only handle a single case where `install` +# installs a single file with a specific set of arguments, the rest does not +# matter in our configuration. +# +# Such behavior may be incorrect if e.g. permissions have changed, but it should +# not happen during normal Neon development that often, and rebuild should help. +# +# See https://github.com/neondatabase/neon/issues/1873 +if [ "$#" == "5" ]; then + if [ "$1" == "-C" ] && [ "$2" == "-m" ] && [ "$3" == "644" ]; then + if [ -e "$5" ] && diff -q "$4" "$5" >/dev/null 2>&1; then + exit 0 + fi + fi +fi +install "$@" diff --git a/scripts/perf_report_template.html b/scripts/perf_report_template.html index 2847e75a00..c86ab37c2d 100644 --- a/scripts/perf_report_template.html +++ b/scripts/perf_report_template.html @@ -19,7 +19,7 @@ } -

Zenith Performance Tests

+

Neon Performance Tests

{% for suit_name, suit_data in context.items() %}

Runs for {{ suit_name }}

@@ -38,7 +38,7 @@ {% for row in suit_data.rows %} - {{ row.revision[:6] }} + {{ row.revision[:6] }} {% for column_value in row.values %} {{ column_value.value }}{{column_value.ratio}} {% endfor %} diff --git a/scripts/pysync b/scripts/pysync index e548973dea..12fa08beca 100755 --- a/scripts/pysync +++ b/scripts/pysync @@ -4,4 +4,10 @@ # It is intended to be a primary endpoint for all the people who want to # just setup test environment without going into details of python package management -poetry install --no-root # this installs dev dependencies by default +poetry config --list + +if [ -z "${CI}" ]; then + poetry install --no-root --no-interaction --ansi +else + poetry install --no-root +fi diff --git a/scripts/reformat b/scripts/reformat new file mode 100755 index 0000000000..5346c78ead --- /dev/null +++ b/scripts/reformat @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -euox pipefail + +# Runs all formatting tools to ensure the project is up to date +echo 'Reformatting Rust code' +cargo fmt +echo 'Reformatting Python code' +poetry run isort test_runner scripts +poetry run flake8 test_runner scripts +poetry run black test_runner scripts diff --git a/setup.cfg b/setup.cfg index b3b39fadd7..a067ee731d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,35 +1,8 @@ -# Just trying to gather linter settings in one file. -# I wonder if there's a way to de-duplicate them... - [flake8] -max-line-length = 100 - -[pycodestyle] -max-line-length = 100 - -[yapf] -based_on_style = pep8 -column_limit = 100 -split_all_top_level_comma_separated_values = true - -[mypy] -# mypy uses regex -exclude = ^vendor/ -# some tests don't typecheck when this flag is set -check_untyped_defs = false - -disallow_incomplete_defs = false -disallow_untyped_calls = false -disallow_untyped_decorators = false -disallow_untyped_defs = false -strict = true - -[mypy-asyncpg.*] -# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577 -ignore_missing_imports = true - -[mypy-cached_property.*] -ignore_missing_imports = true - -[mypy-pytest.*] -ignore_missing_imports = true +# Move config to pyproject.toml as soon as flake8 supports it +# https://github.com/PyCQA/flake8/issues/234 +extend-ignore = + E203, # Whitespace before ':' -- conflicts with black + E266, # Too many leading '#' for block comment -- we use it for formatting sometimes + E501 # Line too long -- black sorts it out +extend-exclude = vendor/ diff --git a/test_runner/README.md b/test_runner/README.md index 514c5f1e3a..e066ac3235 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -1,24 +1,39 @@ -## Zenith test runner +## Neon test runner This directory contains integration tests. Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) -- Zenith and Postgres binaries +- Neon and Postgres binaries - See the root [README.md](/README.md) for build directions + If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. + Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables below to run from other directories. -- The zenith git repo, including the postgres submodule +- The neon git repo, including the postgres submodule (for some tests, e.g. 
`pg_regress`) +- Some tests (involving storage nodes coordination) require etcd to be installed. Follow + [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. ### Test Organization -The tests are divided into a few batches, such that each batch takes roughly -the same amount of time. The batches can be run in parallel, to minimize total -runtime. Currently, there are only two batches: +Regression tests are in the 'regress' directory. They can be run in +parallel to minimize total runtime. Most regression tests set up their +environment with their own pageservers and safekeepers (but see +`TEST_SHARED_FIXTURES`). -- test_batch_pg_regress: Runs PostgreSQL regression tests -- test_others: All other tests +'pg_clients' contains tests for connecting with various client +libraries. Each client test uses a Dockerfile that pulls an image that +contains the client, and connects to PostgreSQL with it. The client +tests can be run against an existing PostgreSQL or Neon installation. + +'performance' contains performance regression tests. Each test +exercises a particular scenario or workload, and outputs +measurements. They should be run serially, to avoid the tests +interfering with the performance of each other. Some performance tests +set up their own Neon environment, while others can be run against an +existing PostgreSQL or Neon environment. ### Running the tests @@ -41,17 +56,30 @@ If you want to run all tests that have the string "bench" in their names: `./scripts/pytest -k bench` +To run tests in parallel we use the `pytest-xdist` plugin. By default everything runs single-threaded. The number of workers can be specified with the `-n` argument: + +`./scripts/pytest -n4` + +By default performance tests are excluded. To run them, explicitly pass the performance test selection to the script: + +`./scripts/pytest test_runner/performance` + Useful environment variables: -`ZENITH_BIN`: The directory where zenith binaries can be found. +`NEON_BIN`: The directory where neon binaries can be found. `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. +Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain +a subdirectory for each version with the naming convention `v{PG_VERSION}/`. +Inside that dir, a `bin/postgres` binary should be present. +`DEFAULT_PG_VERSION`: The version of Postgres to use. +This is used to construct the full path to the postgres binaries. +The format is the 2-digit major version number, e.g. `DEFAULT_PG_VERSION="14"` `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`FORCE_MOCK_S3`: inits every test's pageserver with a mock S3 used as a remote storage. -`--pageserver-config-override=${value}` parameter values when zenith cli is invoked -`RUST_LOG`: logging configuration to pass into Zenith CLI +`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as +`--pageserver-config-override=${value}` parameter values when neon_local cli is invoked +`RUST_LOG`: logging configuration to pass into Neon CLI Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` @@ -64,32 +92,32 @@ Exit after the first test failure: ### Writing a test -Every test needs a Zenith Environment, or ZenithEnv to operate in.
A Zenith Environment +Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and compute Postgres nodes. The connections between them can be configured to use JWT authentication tokens, and some other configuration options can be tweaked too. -The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env` +The easiest way to get access to a Neon Environment is by using the `neon_simple_env` fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes or make other destructive changes in that environment. Also don't assume that there are no tenants or branches or data in the cluster. For convenience, there is a branch called `empty`, though. The convention is to create a test-specific branch of that and load any test data there, instead of the 'main' branch. -For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env` +For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: ```python -def test_foobar(zenith_env_builder: ZenithEnvBuilder): +def test_foobar(neon_env_builder: NeonEnvBuilder): # Prescribe the environment. # We want to have 3 safekeeper nodes, and use JWT authentication in the # connections to the page server - zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.set_pageserver_auth(True) + neon_env_builder.num_safekeepers = 3 + neon_env_builder.set_pageserver_auth(True) # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init() + env = neon_env_builder.init_start() # Run the test ... diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py deleted file mode 100644 index 7f86986e2e..0000000000 --- a/test_runner/batch_others/test_auth.py +++ /dev/null @@ -1,68 +0,0 @@ -from contextlib import closing -from typing import Iterator -from uuid import UUID, uuid4 -import psycopg2 -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException -import pytest - - -def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() - - ps = env.pageserver - - tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant.hex) - tenant_http_client = env.pageserver.http_client(tenant_token) - invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex) - invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) - - management_token = env.auth_keys.generate_management_token() - management_http_client = env.pageserver.http_client(management_token) - - # this does not invoke auth check and only decodes jwt and checks it for validity - # check both tokens - ps.safe_psql("set FOO", password=tenant_token) - ps.safe_psql("set FOO", password=management_token) - - # tenant can create branches - tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main') - # console can create branches for tenant - management_http_client.branch_create(env.initial_tenant, 'new2', 'main') - - # fail to create branch using token with different tenant_id - with pytest.raises(ZenithPageserverApiException, - match='Forbidden: Tenant id mismatch. 
Permission denied'): - invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main") - - # create tenant using management token - management_http_client.tenant_create(uuid4()) - - # fail to create tenant using tenant token - with pytest.raises( - ZenithPageserverApiException, - match='Forbidden: Attempt to access management api with tenant scope. Permission denied' - ): - tenant_http_client.tenant_create(uuid4()) - - -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): - zenith_env_builder.pageserver_auth_enabled = True - if with_wal_acceptors: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" - env.zenith_cli.create_branch(branch, "main") - - pg = env.postgres.create_start(branch) - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py deleted file mode 100644 index 860db51c8a..0000000000 --- a/test_runner/batch_others/test_branch_behind.py +++ /dev/null @@ -1,136 +0,0 @@ -import subprocess -from contextlib import closing - -import psycopg2.extras -import pytest -from fixtures.log_helper import log -from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnvBuilder - - -# -# Create a couple of branches off the main branch, at a historical point in time. -# -def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): - - # Use safekeeper in this test to avoid a subtle race condition. - # Without safekeeper, walreceiver reconnection can stuck - # because of IO deadlock. - # - # See https://github.com/zenithdb/zenith/issues/1068 - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - - # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind", "main") - - pgmain = env.postgres.create_start('test_branch_behind') - log.info("postgres is running on 'test_branch_behind' branch") - - main_pg_conn = pgmain.connect() - main_cur = main_pg_conn.cursor() - - main_cur.execute("SHOW zenith.zenith_timeline") - timeline = main_cur.fetchone()[0] - - # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') - - # keep some early lsn to test branch creation on out of date lsn - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - gced_lsn = main_cur.fetchone()[0] - - main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = main_cur.fetchone()[0] - log.info(f'LSN after 100 rows: {lsn_a}') - - # Insert some more rows. (This generates enough WAL to fill a few segments.) 
- main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 200000) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_b = main_cur.fetchone()[0] - log.info(f'LSN after 200100 rows: {lsn_b}') - - # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_hundred", "test_branch_behind@" + lsn_a) - - # Insert many more rows. This generates enough WAL to fill a few segments. - main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 200000) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_c = main_cur.fetchone()[0] - log.info(f'LSN after 400100 rows: {lsn_c}') - - # Branch at the point where only 200100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b) - - pg_hundred = env.postgres.create_start("test_branch_behind_hundred") - pg_more = env.postgres.create_start("test_branch_behind_more") - - # On the 'hundred' branch, we should see only 100 rows - hundred_pg_conn = pg_hundred.connect() - hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) - - # On the 'more' branch, we should see 100200 rows - more_pg_conn = pg_more.connect() - more_cur = more_pg_conn.cursor() - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) - - # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) - - # Check bad lsn's for branching - - # branch at segment boundary - env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000") - pg = env.postgres.create_start("test_branch_segment_boundary") - cur = pg.connect().cursor() - cur.execute('SELECT 1') - assert cur.fetchone() == (1, ) - - # branch at pre-initdb lsn - with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42") - - # branch at pre-ancestor lsn - with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42") - - # check that we cannot create branch based on garbage collected data - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - # call gc to advace latest_gc_cutoff_lsn - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - - with pytest.raises(Exception, match="invalid branch start lsn"): - # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}") - - # check that after gc everything is still there - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) - - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) - - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py deleted file mode 100644 index 504f455936..0000000000 --- a/test_runner/batch_others/test_clog_truncate.py +++ /dev/null @@ -1,74 +0,0 @@ 
-import time -import os - -from contextlib import closing - -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test compute node start after clog truncation -# -def test_clog_truncate(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_clog_truncate", "empty") - - # set agressive autovacuum to make sure that truncation will happen - config = [ - 'autovacuum_max_workers=10', - 'autovacuum_vacuum_threshold=0', - 'autovacuum_vacuum_insert_threshold=0', - 'autovacuum_vacuum_cost_delay=0', - 'autovacuum_vacuum_cost_limit=10000', - 'autovacuum_naptime =1s', - 'autovacuum_freeze_max_age=100000' - ] - - pg = env.postgres.create_start('test_clog_truncate', config_lines=config) - log.info('postgres is running on test_clog_truncate branch') - - # Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION zenith_test_utils') - - # Consume many xids to advance clog - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('select test_consume_xids(1000*1000*10);') - log.info('xids consumed') - - # call a checkpoint to trigger TruncateSubtrans - cur.execute('CHECKPOINT;') - - # ensure WAL flush - cur.execute('select txid_current()') - log.info(cur.fetchone()) - - # wait for autovacuum to truncate the pg_xact - # XXX Is it worth to add a timeout here? - pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), '0000') - log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation. " "pg_xact_0000_path = {pg_xact_0000_path}") - time.sleep(5) - - # checkpoint to advance latest lsn - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CHECKPOINT;') - cur.execute('select pg_current_wal_insert_lsn()') - lsn_after_truncation = cur.fetchone()[0] - - # create new branch after clog truncation and start a compute node on it - log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.zenith_cli.create_branch("test_clog_truncate_new", - "test_clog_truncate@" + lsn_after_truncation) - - pg2 = env.postgres.create_start('test_clog_truncate_new') - log.info('postgres is running on test_clog_truncate_new branch') - - # check that new node doesn't contain truncated segment - pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000') - log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") - assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py deleted file mode 100644 index 38243b298b..0000000000 --- a/test_runner/batch_others/test_createdropdb.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import pathlib - -from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content -from fixtures.log_helper import log - - -# -# Test CREATE DATABASE when there have been relmapper changes -# -def test_createdb(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_createdb", "empty") - - pg = env.postgres.create_start('test_createdb') - log.info("postgres is running on 'test_createdb' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # Cause a 'relmapper' change in the original branch - cur.execute('VACUUM FULL pg_class') - - cur.execute('CREATE DATABASE foodb') - - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = 
cur.fetchone()[0] - - # Create a branch - env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn) - - pg2 = env.postgres.create_start('test_createdb2') - - # Test that you can connect to the new database on both branches - for db in (pg, pg2): - db.connect(dbname='foodb').close() - - -# -# Test DROP DATABASE -# -def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch("test_dropdb", "empty") - - pg = env.postgres.create_start('test_dropdb') - log.info("postgres is running on 'test_dropdb' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CREATE DATABASE foodb') - - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_before_drop = cur.fetchone()[0] - - cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';") - dboid = cur.fetchone()[0] - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('DROP DATABASE foodb') - - cur.execute('CHECKPOINT') - - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_after_drop = cur.fetchone()[0] - - # Create two branches before and after database drop. - env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop) - pg_before = env.postgres.create_start('test_before_dropdb') - - env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop) - pg_after = env.postgres.create_start('test_after_dropdb') - - # Test that database exists on the branch before drop - pg_before.connect(dbname='foodb').close() - - # Test that database subdir exists on the branch before drop - assert pg_before.pgdata_dir - dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid) - log.info(dbpath) - - assert os.path.isdir(dbpath) == True - - # Test that database subdir doesn't exist on the branch after drop - assert pg_after.pgdata_dir - dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid) - log.info(dbpath) - - assert os.path.isdir(dbpath) == False - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py deleted file mode 100644 index 1959b47dcc..0000000000 --- a/test_runner/batch_others/test_createuser.py +++ /dev/null @@ -1,33 +0,0 @@ -from contextlib import closing - -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test CREATE USER to check shared catalog restore -# -def test_createuser(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_createuser", "empty") - - pg = env.postgres.create_start('test_createuser') - log.info("postgres is running on 'test_createuser' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # Cause a 'relmapper' change in the original branch - cur.execute('CREATE USER testuser with password %s', ('testpwd', )) - - cur.execute('CHECKPOINT') - - cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn = cur.fetchone()[0] - - # Create a branch - env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn) - - pg2 = env.postgres.create_start('test_createuser2') - - # Test that you can connect to new branch as a new user - assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )] diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py deleted file mode 100644 index 
9de6ba9f59..0000000000 --- a/test_runner/batch_others/test_gc_aggressive.py +++ /dev/null @@ -1,80 +0,0 @@ -from contextlib import closing - -import asyncio -import asyncpg -import random - -from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper -from fixtures.log_helper import log - -# Test configuration -# -# Create a table with {num_rows} rows, and perform {updates_to_perform} random -# UPDATEs on it, using {num_connections} separate connections. -num_connections = 10 -num_rows = 100000 -updates_to_perform = 10000 - -updates_performed = 0 - - -# Run random UPDATEs on test table -async def update_table(pg: Postgres): - global updates_performed - pg_conn = await pg.connect_async() - - while updates_performed < updates_to_perform: - updates_performed += 1 - id = random.randrange(1, num_rows) - row = await pg_conn.fetchrow(f'UPDATE foo SET counter = counter + 1 WHERE id = {id}') - - -# Perform aggressive GC with 0 horizon -async def gc(env: ZenithEnv, timeline: str): - psconn = await env.pageserver.connect_async() - - while updates_performed < updates_to_perform: - await psconn.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - - -# At the same time, run UPDATEs and GC -async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): - workers = [] - for worker_id in range(num_connections): - workers.append(asyncio.create_task(update_table(pg))) - workers.append(asyncio.create_task(gc(env, timeline))) - - # await all workers - await asyncio.gather(*workers) - - -# -# Aggressively force GC, while running queries. -# -# (repro for https://github.com/zenithdb/zenith/issues/1047) -# -def test_gc_aggressive(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_gc_aggressive", "empty") - pg = env.postgres.create_start('test_gc_aggressive') - log.info('postgres is running on test_gc_aggressive branch') - - conn = pg.connect() - cur = conn.cursor() - - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - - # Create table, and insert the first 100 rows - cur.execute('CREATE TABLE foo (id int, counter int, t text)') - cur.execute(f''' - INSERT INTO foo - SELECT g, 0, 'long string to consume some space' || g - FROM generate_series(1, {num_rows}) g - ''') - cur.execute('CREATE INDEX ON foo(id)') - - asyncio.run(update_and_gc(env, pg, timeline)) - - row = cur.execute('SELECT COUNT(*), SUM(counter) FROM foo') - assert cur.fetchone() == (num_rows, updates_to_perform) diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py deleted file mode 100644 index eccffc4d69..0000000000 --- a/test_runner/batch_others/test_pageserver_api.py +++ /dev/null @@ -1,49 +0,0 @@ -import json -from uuid import uuid4, UUID -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient -from typing import cast -import pytest, psycopg2 - - -def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): - client.check_status() - - # check initial tenant is there - assert initial_tenant.hex in {t['id'] for t in client.tenant_list()} - - # create new tenant and check it is also there - tenant_id = uuid4() - client.tenant_create(tenant_id) - assert tenant_id.hex in {t['id'] for t in client.tenant_list()} - - # check its timelines - timelines = client.timeline_list(tenant_id) - assert len(timelines) > 0 - for timeline_id_str in timelines: - timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str)) - assert timeline_details['type'] 
== 'Local' - assert timeline_details['tenant_id'] == tenant_id.hex - assert timeline_details['timeline_id'] == timeline_id_str - - # create branch - branch_name = uuid4().hex - client.branch_create(tenant_id, branch_name, "main") - - # check it is there - assert branch_name in {b['name'] for b in client.branch_list(tenant_id)} - - -def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - client = env.pageserver.http_client() - check_client(client, env.initial_tenant) - - -def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() - - management_token = env.auth_keys.generate_management_token() - - client = env.pageserver.http_client(auth_token=management_token) - check_client(client, env.initial_tenant) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py deleted file mode 100644 index 0cfc50f0ff..0000000000 --- a/test_runner/batch_others/test_pageserver_restart.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest -import random -import time - -from contextlib import closing -from multiprocessing import Process, Value -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log - - -# Test restarting page server, while safekeeper and compute node keep -# running. -def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_pageserver_restart", "main") - pg = env.postgres.create_start('test_pageserver_restart') - - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # Create table, and insert some rows. Make it big enough that it doesn't fit in - # shared_buffers, otherwise the SELECT after restart will just return answer - # from shared_buffers without hitting the page server, which defeats the point - # of this test. - cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100000) g - ''') - - # Verify that the table is larger than shared_buffers - cur.execute(''' - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize - from pg_settings where name = 'shared_buffers' - ''') - row = cur.fetchone() - log.info(f"shared_buffers is {row[0]}, table size {row[1]}") - assert int(row[0]) < int(row[1]) - - # Stop and restart pageserver. This is a more or less graceful shutdown, although - # the page server doesn't currently have a shutdown routine so there's no difference - # between stopping and crashing. - env.pageserver.stop() - env.pageserver.start() - - # Stopping the pageserver breaks the connection from the postgres backend to - # the page server, and causes the next query on the connection to fail. Start a new - # postgres connection too, to avoid that error. (Ideally, the compute node would - # handle that and retry internally, without propagating the error to the user, but - # currently it doesn't...) 
- pg_conn = pg.connect() - cur = pg_conn.cursor() - - cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (100000, ) - - # Stop the page server by force, and restart it - env.pageserver.stop() - env.pageserver.start() diff --git a/test_runner/batch_others/test_pgbench.py b/test_runner/batch_others/test_pgbench.py deleted file mode 100644 index 09713023bc..0000000000 --- a/test_runner/batch_others/test_pgbench.py +++ /dev/null @@ -1,14 +0,0 @@ -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin): - env = zenith_simple_env - env.zenith_cli.create_branch("test_pgbench", "empty") - pg = env.postgres.create_start('test_pgbench') - log.info("postgres is running on 'test_pgbench' branch") - - connstr = pg.connstr() - - pg_bin.run_capture(['pgbench', '-i', connstr]) - pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr]) diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py deleted file mode 100644 index 9510e880b2..0000000000 --- a/test_runner/batch_others/test_proxy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_proxy_select_1(static_proxy): - static_proxy.safe_psql("select 1;") diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py deleted file mode 100644 index ba256e71f7..0000000000 --- a/test_runner/batch_others/test_readonly_node.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv - - -# -# Create read-only compute nodes, anchored at historical points in time. -# -# This is very similar to the 'test_branch_behind' test, but instead of -# creating branches, creates read-only nodes. -# -def test_readonly_node(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_readonly_node", "empty") - - pgmain = env.postgres.create_start('test_readonly_node') - log.info("postgres is running on 'test_readonly_node' branch") - - main_pg_conn = pgmain.connect() - main_cur = main_pg_conn.cursor() - - # Create table, and insert the first 100 rows - main_cur.execute('CREATE TABLE foo (t text)') - - main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_a = main_cur.fetchone()[0] - log.info('LSN after 100 rows: ' + lsn_a) - - # Insert some more rows. (This generates enough WAL to fill a few segments.) - main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 200000) g - ''') - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_b = main_cur.fetchone()[0] - log.info('LSN after 200100 rows: ' + lsn_b) - - # Insert many more rows. This generates enough WAL to fill a few segments. 
- main_cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 200000) g - ''') - - main_cur.execute('SELECT pg_current_wal_insert_lsn()') - lsn_c = main_cur.fetchone()[0] - log.info('LSN after 400100 rows: ' + lsn_c) - - # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start("test_readonly_node_hundred", - branch=f'test_readonly_node@{lsn_a}') - - # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start("test_readonly_node_more", - branch=f'test_readonly_node@{lsn_b}') - - # On the 'hundred' node, we should see only 100 rows - hundred_pg_conn = pg_hundred.connect() - hundred_cur = hundred_pg_conn.cursor() - hundred_cur.execute('SELECT count(*) FROM foo') - assert hundred_cur.fetchone() == (100, ) - - # On the 'more' node, we should see 100200 rows - more_pg_conn = pg_more.connect() - more_cur = more_pg_conn.cursor() - more_cur.execute('SELECT count(*) FROM foo') - assert more_cur.fetchone() == (200100, ) - - # All the rows are visible on the main branch - main_cur.execute('SELECT count(*) FROM foo') - assert main_cur.fetchone() == (400100, ) - - # Check creating a node at segment boundary - pg = env.postgres.create_start("test_branch_segment_boundary", - branch="test_readonly_node@0/3000000") - cur = pg.connect().cursor() - cur.execute('SELECT 1') - assert cur.fetchone() == (1, ) - - # Create node at pre-initdb lsn - with pytest.raises(Exception, match="invalid basebackup lsn"): - # compute node startup with invalid LSN should fail - env.zenith_cli.pg_start("test_readonly_node_preinitdb", - timeline_spec="test_readonly_node@0/42") diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py deleted file mode 100644 index fa6feaf412..0000000000 --- a/test_runner/batch_others/test_remote_storage.py +++ /dev/null @@ -1,101 +0,0 @@ -# It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ...... - -import time, shutil, os -from contextlib import closing -from pathlib import Path -from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log -import pytest - - -# -# Tests that a piece of data is backed up and restored correctly: -# -# 1. Initial pageserver -# * starts a pageserver with remote storage, stores specific data in its tables -# * triggers a checkpoint (which produces a local data scheduled for backup), gets the corresponding timeline id -# * polls the timeline status to ensure it's copied remotely -# * stops the pageserver, clears all local directories -# -# 2. Second pageserver -# * starts another pageserver, connected to the same remote storage -# * same timeline id is queried for status, triggering timeline's download -# * timeline status is polled until it's downloaded -# * queries the specific data, ensuring that it matches the one stored before -# -# The tests are done for all types of remote storage pageserver supports. 
-@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193") -@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): - zenith_env_builder.rust_log_override = 'debug' - zenith_env_builder.num_safekeepers = 1 - if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() - elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') - else: - raise RuntimeError(f'Unknown storage type: {storage_type}') - - data_id = 1 - data_secret = 'very secret secret' - - ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init() - pg = env.postgres.create_start() - - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute(f''' - CREATE TABLE t1(id int primary key, secret text); - INSERT INTO t1 VALUES ({data_id}, '{data_secret}'); - ''') - - # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"do_gc {tenant_id} {timeline_id}") - log.info("waiting for upload") # TODO api to check if upload is done - time.sleep(2) - - ##### Stop the first pageserver instance, erase all its data - env.postgres.stop_all() - env.pageserver.stop() - - dir_to_clear = Path(env.repo_dir) / 'tenants' - shutil.rmtree(dir_to_clear) - os.mkdir(dir_to_clear) - - ##### Second start, restore the data and ensure it's the same - env.pageserver.start() - - client = env.pageserver.http_client() - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) - # FIXME cannot handle duplicate download requests (which might be caused by repeated timeline detail calls) - # subject to fix in https://github.com/zenithdb/zenith/issues/997 - time.sleep(5) - - log.info("waiting for timeline redownload") - attempts = 0 - while True: - timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) - assert timeline_details['timeline_id'] == timeline_id - assert timeline_details['tenant_id'] == tenant_id - if timeline_details['type'] == 'Local': - log.info("timeline downloaded, checking its data") - break - attempts += 1 - if attempts > 10: - raise Exception("timeline redownload failed") - log.debug("still waiting") - time.sleep(1) - - pg = env.postgres.create_start() - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};') - assert cur.fetchone() == (data_secret, ) diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py deleted file mode 100644 index f7810be555..0000000000 --- a/test_runner/batch_others/test_restart_compute.py +++ /dev/null @@ -1,75 +0,0 @@ -import pytest - -from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log - - -# -# Test restarting and recreating a postgres instance -# -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): - zenith_env_builder.pageserver_auth_enabled = True - if with_wal_acceptors: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - 
env.zenith_cli.create_branch("test_restart_compute", "main") - - pg = env.postgres.create_start('test_restart_compute') - log.info("postgres is running on 'test_restart_compute' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the row - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Insert another row - cur.execute("INSERT INTO t VALUES (100001, 'payload2')") - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # Again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - # That select causes lots of FPI's and increases probability of wakeepers - # lagging behind after query completion - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py deleted file mode 100644 index c6d4512bc9..0000000000 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ /dev/null @@ -1,131 +0,0 @@ -from contextlib import closing -import psycopg2.extras -import time -from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test Garbage Collection of old layer files -# -# This test is pretty tightly coupled with the current implementation of layered -# storage, in layered_repository.rs. -# -def test_layerfiles_gc(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_layerfiles_gc", "empty") - pg = env.postgres.create_start('test_layerfiles_gc') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - - # Get the timeline ID of our branch. We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - - # Create a test table - cur.execute("CREATE TABLE foo(x integer)") - cur.execute("INSERT INTO foo VALUES (1)") - - cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass") - row = cur.fetchone() - log.info(f"relfilenode is {row[0]}") - - # Run GC, to clear out any garbage left behind in the catalogs by - # the CREATE TABLE command. We want to have a clean slate with no garbage - # before running the actual tests below, otherwise the counts won't match - # what we expect. 
- # - # Also run vacuum first to make it less likely that autovacuum or pruning - # kicks in and confuses our numbers. - cur.execute("VACUUM") - - # delete the row, to update the Visibility Map. We don't want the VM - # update to confuse our numbers either. - cur.execute("DELETE FROM foo") - - log.info("Running GC before test") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - # remember the number of files - layer_relfiles_remain = (row['layer_relfiles_total'] - - row['layer_relfiles_removed']) - assert layer_relfiles_remain > 0 - - # Insert a row and run GC. Checkpoint should freeze the layer - # so that there is only the most recent image layer left for the rel, - # removing the old image and delta layer. - log.info("Inserting one row and running GC") - cur.execute("INSERT INTO foo VALUES (1)") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Insert two more rows and run GC. - # This should create new image and delta layer file with the new contents, and - # then remove the old one image and the just-created delta layer. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Do it again. Should again create two new layer files and remove old ones. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Run GC again, with no changes in the database. Should not remove anything. - log.info("Run GC again, with nothing to do") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain - assert row['layer_relfiles_removed'] == 0 - assert row['layer_relfiles_dropped'] == 0 - - # - # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage - # - log.info("Drop table and run GC again") - cur.execute("DROP TABLE foo") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - - # We still cannot remove the latest layers - # because they serve as tombstones for earlier layers. - assert row['layer_relfiles_dropped'] == 0 - # Each relation fork is counted separately, hence 3. - assert row['layer_relfiles_needed_as_tombstone'] == 3 - - # The catalog updates also create new layer files of the catalogs, which - # are counted as 'removed' - assert row['layer_relfiles_removed'] > 0 - - # TODO Change the test to check actual CG of dropped layers. - # Each relation fork is counted separately, hence 3. 
- #assert row['layer_relfiles_dropped'] == 3 - - # TODO: perhaps we should count catalog and user relations separately, - # to make this kind of testing more robust diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py deleted file mode 100644 index 5c6d78e730..0000000000 --- a/test_runner/batch_others/test_tenant_relocation.py +++ /dev/null @@ -1,267 +0,0 @@ -from contextlib import closing, contextmanager -import os -import pathlib -import subprocess -import threading -from uuid import UUID -from fixtures.log_helper import log -import time -import signal -import pytest - -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath, pg_distrib_dir - - -def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): - assert abs(a - b) / a < margin_ratio, (a, b, margin_ratio) - - -@contextmanager -def new_pageserver_helper(new_pageserver_dir: pathlib.Path, - pageserver_bin: pathlib.Path, - remote_storage_mock_path: pathlib.Path, - pg_port: int, - http_port: int): - """ - cannot use ZenithPageserver yet because it depends on zenith cli - which currently lacks support for multiple pageservers - """ - cmd = [ - str(pageserver_bin), - '--init', - '--workdir', - str(new_pageserver_dir), - f"-c listen_pg_addr='localhost:{pg_port}'", - f"-c listen_http_addr='localhost:{http_port}'", - f"-c pg_distrib_dir='{pg_distrib_dir}'", - f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", - ] - - subprocess.check_output(cmd, text=True) - - # actually run new pageserver - cmd = [ - str(pageserver_bin), - '--workdir', - str(new_pageserver_dir), - '--daemonize', - ] - log.info("starting new pageserver %s", cmd) - out = subprocess.check_output(cmd, text=True) - log.info("started new pageserver %s", out) - try: - yield - finally: - log.info("stopping new pageserver") - pid = int((new_pageserver_dir / 'pageserver.pid').read_text()) - os.kill(pid, signal.SIGQUIT) - - -def wait_for(number_of_iterations: int, interval: int, func): - last_exception = None - for i in range(number_of_iterations): - try: - res = func() - except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) - last_exception = e - time.sleep(interval) - continue - return res - raise Exception("timed out while waiting for %s" % func) from last_exception - - -@contextmanager -def pg_cur(pg): - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - yield cur - - -def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Event): - log.info("load started") - - inserted_ctr = 0 - failed = False - while not stop_event.is_set(): - try: - with pg_cur(pg) as cur: - cur.execute("INSERT INTO load VALUES ('some payload')") - inserted_ctr += 1 - except: - if not failed: - log.info("load failed") - failed = True - load_ok_event.clear() - else: - if failed: - with pg_cur(pg) as cur: - # if we recovered after failure verify that we have correct number of rows - log.info("recovering at %s", inserted_ctr) - cur.execute("SELECT count(*) FROM load") - # it seems that sometimes transaction gets commited before we can acknowledge - # the result, so sometimes selected value is larger by one than we expect - assert cur.fetchone()[0] - inserted_ctr <= 1 - log.info("successfully recovered %s", inserted_ctr) - failed = False - load_ok_event.set() - log.info('load thread stopped') - - -def assert_local(pageserver_http_client: ZenithPageserverHttpClient, tenant: 
UUID, timeline: str): - timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline)) - assert timeline_detail.get('type') == "Local", timeline_detail - return timeline_detail - - -@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193") -@pytest.mark.parametrize('with_load', ['with_load', 'without_load']) -def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, - port_distributor: PortDistributor, - with_load: str): - zenith_env_builder.num_safekeepers = 1 - zenith_env_builder.enable_local_fs_remote_storage() - - env = zenith_env_builder.init() - - # create folder for remote storage mock - remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - - tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) - log.info("tenant to relocate %s", tenant) - - env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant) - - tenant_pg = env.postgres.create_start( - "test_tenant_relocation", - "main", # branch name, None means same as node name - tenant_id=tenant, - ) - - # insert some data - with closing(tenant_pg.connect()) as conn: - with conn.cursor() as cur: - # save timeline for later gc call - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - log.info("timeline to relocate %s", timeline) - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - - if with_load == 'with_load': - # create load table - with pg_cur(tenant_pg) as cur: - cur.execute("CREATE TABLE load(value text)") - - load_stop_event = threading.Event() - load_ok_event = threading.Event() - load_thread = threading.Thread(target=load, - args=(tenant_pg, load_stop_event, load_ok_event)) - load_thread.start() - - # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"do_gc {tenant.hex} {timeline}") - - # ensure upload is completed - pageserver_http_client = env.pageserver.http_client() - timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline)) - assert timeline_detail['disk_consistent_lsn'] == timeline_detail['timeline_state']['Ready'] - - log.info("inititalizing new pageserver") - # bootstrap second pageserver - new_pageserver_dir = env.repo_dir / 'new_pageserver' - new_pageserver_dir.mkdir() - - new_pageserver_pg_port = port_distributor.get_port() - new_pageserver_http_port = port_distributor.get_port() - log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver' - - new_pageserver_http_client = ZenithPageserverHttpClient(port=new_pageserver_http_port, - auth_token=None) - - with new_pageserver_helper(new_pageserver_dir, - pageserver_bin, - remote_storage_mock_path, - new_pageserver_pg_port, - new_pageserver_http_port): - - # call to attach timeline to new pageserver - new_pageserver_http_client.timeline_attach(tenant, UUID(timeline)) - # FIXME cannot handle duplicate download requests, subject to fix in https://github.com/zenithdb/zenith/issues/997 - time.sleep(5) - # new pageserver should in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint - 
new_timeline_detail = wait_for( - number_of_iterations=5, - interval=1, - func=lambda: assert_local(new_pageserver_http_client, tenant, timeline)) - assert new_timeline_detail['timeline_state'].get('Ready'), new_timeline_detail - # when load is active these checks can break because lsns are not static - # so lets check with some margin - if with_load == 'without_load': - # TODO revisit this once https://github.com/zenithdb/zenith/issues/1049 is fixed - assert_abs_margin_ratio(new_timeline_detail['disk_consistent_lsn'], - timeline_detail['disk_consistent_lsn'], - 0.01) - assert_abs_margin_ratio(new_timeline_detail['timeline_state']['Ready'], - timeline_detail['timeline_state']['Ready'], - 0.01) - - # callmemaybe to start replication from safekeeper to the new pageserver - # when there is no load there is a clean checkpoint and no wal delta - # needs to be streamed to the new pageserver - # TODO (rodionov) use attach to start replication - with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur: - # "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'" - safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'" - cur.execute("callmemaybe {} {} {}".format(tenant, timeline, safekeeper_connstring)) - - tenant_pg.stop() - - # rewrite zenith cli config to use new pageserver for basebackup to start new compute - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() - cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" - cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) - - tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path()) - tenant_pg_config_file_path.open('a').write( - f"\nzenith.page_server_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" - ) - - tenant_pg.start() - - # detach tenant from old pageserver before we check - # that all the data is there to be sure that old pageserver - # is no longer involved, and if it is, we will see the errors - pageserver_http_client.timeline_detach(tenant, UUID(timeline)) - - with pg_cur(tenant_pg) as cur: - # check that data is still there - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - # check that we can write new data - cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (2001000, ) - - if with_load == 'with_load': - assert load_ok_event.wait(1) - log.info('stopping load thread') - load_stop_event.set() - load_thread.join() - log.info('load thread stopped') - - # bring old pageserver back for clean shutdown via zenith cli - # new pageserver will be shut down by the context manager - cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() - cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" - cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" - (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py deleted file mode 100644 index 232c724870..0000000000 --- a/test_runner/batch_others/test_tenants.py +++ /dev/null @@ -1,44 +0,0 @@ -from contextlib import closing - 
-import pytest - -from fixtures.zenith_fixtures import ZenithEnvBuilder - - -@pytest.mark.parametrize('with_wal_acceptors', [False, True]) -def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool): - if with_wal_acceptors: - zenith_env_builder.num_safekeepers = 3 - - env = zenith_env_builder.init() - """Tests tenants with and without wal acceptors""" - tenant_1 = env.create_tenant() - tenant_2 = env.create_tenant() - - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_1) - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_2) - - pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_1, - ) - pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_2, - ) - - for pg in [pg_tenant1, pg_tenant2]: - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (5000050000, ) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py deleted file mode 100644 index b48f830528..0000000000 --- a/test_runner/batch_others/test_timeline_size.py +++ /dev/null @@ -1,132 +0,0 @@ -from contextlib import closing -from uuid import UUID -import psycopg2.extras -import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres -from fixtures.log_helper import log -import time - - -def test_timeline_size(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_timeline_size", "empty") - - client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size") - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - - pgmain = env.postgres.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") - - with closing(pgmain.connect()) as conn: - with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") - - # Create table, and insert the first 100 rows - cur.execute("CREATE TABLE foo (t text)") - cur.execute(""" - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 10) g - """) - - res = client.branch_detail(env.initial_tenant, "test_timeline_size") - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - cur.execute("TRUNCATE foo") - - res = client.branch_detail(env.initial_tenant, "test_timeline_size") - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - - -# wait until received_lsn_lag is 0 -def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60): - started_at = time.time() - - received_lsn_lag = 1 - while received_lsn_lag > 0: - elapsed = time.time() - started_at - if elapsed > timeout: - raise RuntimeError( - f"timed out waiting for pageserver to reach pg_current_wal_flush_lsn()") - - 
with closing(pgmain.connect()) as conn: - with conn.cursor() as cur: - - cur.execute(''' - select pg_size_pretty(pg_cluster_size()), - pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn) as received_lsn_lag - FROM backpressure_lsns(); - ''') - res = cur.fetchone() - log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") - received_lsn_lag = res[1] - - time.sleep(polling_interval) - - -def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_timeline_size_quota", "main") - - client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota") - assert res["current_logical_size"] == res["current_logical_size_non_incremental"] - - pgmain = env.postgres.create_start( - "test_timeline_size_quota", - # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB'], - ) - log.info("postgres is running on 'test_timeline_size_quota' branch") - - with closing(pgmain.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? - - cur.execute("CREATE TABLE foo (t text)") - - wait_for_pageserver_catchup(pgmain) - - # Insert many rows. This query must fail because of space limit - try: - cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100000) g - ''') - - wait_for_pageserver_catchup(pgmain) - - cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 500000) g - ''') - - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") - assert False - - except psycopg2.errors.DiskFull as err: - log.info(f"Query expectedly failed with: {err}") - - # drop table to free space - cur.execute('DROP TABLE foo') - - wait_for_pageserver_catchup(pgmain) - - # create it again and insert some rows. This query must succeed - cur.execute("CREATE TABLE foo (t text)") - cur.execute(''' - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 10000) g - ''') - - wait_for_pageserver_catchup(pgmain) - - cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") - pg_cluster_size = cur.fetchone() - log.info(f"pg_cluster_size = {pg_cluster_size}") diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py deleted file mode 100644 index 49e48dd450..0000000000 --- a/test_runner/batch_others/test_vm_bits.py +++ /dev/null @@ -1,79 +0,0 @@ -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test that the VM bit is cleared correctly at a HEAP_DELETE and -# HEAP_UPDATE record. -# -def test_vm_bit_clear(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - - env.zenith_cli.create_branch("test_vm_bit_clear", "empty") - pg = env.postgres.create_start('test_vm_bit_clear') - - log.info("postgres is running on 'test_vm_bit_clear' branch") - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # Install extension containing function needed for test - cur.execute('CREATE EXTENSION zenith_test_utils') - - # Create a test table and freeze it to set the VM bit. 
- cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_delete VALUES (1)') - cur.execute('VACUUM FREEZE vmtest_delete') - - cur.execute('CREATE TABLE vmtest_update (id integer PRIMARY KEY)') - cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') - cur.execute('VACUUM FREEZE vmtest_update') - - # DELETE and UDPATE the rows. - cur.execute('DELETE FROM vmtest_delete WHERE id = 1') - cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') - - # Branch at this point, to test that later - env.zenith_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") - - # Clear the buffer cache, to force the VM page to be re-fetched from - # the page server - cur.execute('SELECT clear_buffer_cache()') - - # Check that an index-only scan doesn't see the deleted row. If the - # clearing of the VM bit was not replayed correctly, this would incorrectly - # return deleted row. - cur.execute(''' - set enable_seqscan=off; - set enable_indexscan=on; - set enable_bitmapscan=off; - ''') - - cur.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur.fetchall() == []) - cur.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur.fetchall() == []) - - cur.close() - - # Check the same thing on the branch that we created right after the DELETE - # - # As of this writing, the code in smgrwrite() creates a full-page image whenever - # a dirty VM page is evicted. If the VM bit was not correctly cleared by the - # earlier WAL record, the full-page image hides the problem. Starting a new - # server at the right point-in-time avoids that full-page image. - pg_new = env.postgres.create_start('test_vm_bit_clear_new') - - log.info("postgres is running on 'test_vm_bit_clear_new' branch") - pg_new_conn = pg_new.connect() - cur_new = pg_new_conn.cursor() - - cur_new.execute(''' - set enable_seqscan=off; - set enable_indexscan=on; - set enable_bitmapscan=off; - ''') - - cur_new.execute('SELECT * FROM vmtest_delete WHERE id = 1') - assert (cur_new.fetchall() == []) - cur_new.execute('SELECT * FROM vmtest_update WHERE id = 1') - assert (cur_new.fetchall() == []) diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py deleted file mode 100644 index 4d9e18bb58..0000000000 --- a/test_runner/batch_others/test_wal_acceptor.py +++ /dev/null @@ -1,692 +0,0 @@ -import pytest -import random -import time -import os -import signal -import subprocess -import sys -import threading -import uuid - -from contextlib import closing -from dataclasses import dataclass, field -from multiprocessing import Process, Value -from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import lsn_to_hex, mkdir_if_needed -from fixtures.log_helper import log -from typing import List, Optional, Any - - -# basic test, write something in setup with wal acceptors, ensure that commits -# succeed and data is written -def test_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main") - - pg = env.postgres.create_start('test_wal_acceptors_normal_work') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE 
TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) - - -@dataclass -class BranchMetrics: - name: str - latest_valid_lsn: int - # One entry per each Safekeeper, order is the same - flush_lsns: List[int] = field(default_factory=list) - commit_lsns: List[int] = field(default_factory=list) - - -# Run page server and multiple acceptors, and multiple compute nodes running -# against different timelines. -def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - n_timelines = 3 - - branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)] - - # start postgres on each timeline - pgs = [] - for branch in branches: - env.zenith_cli.create_branch(branch, "main") - pgs.append(env.postgres.create_start(branch)) - - tenant_id = env.initial_tenant - - def collect_metrics(message: str) -> List[BranchMetrics]: - with env.pageserver.http_client() as pageserver_http: - branch_details = [ - pageserver_http.branch_detail(tenant_id=tenant_id, name=branch) - for branch in branches - ] - # All changes visible to pageserver (latest_valid_lsn) should be - # confirmed by safekeepers first. As we cannot atomically get - # state of both pageserver and safekeepers, we should start with - # pageserver. Looking at outdated data from pageserver is ok. - # Asking safekeepers first is not ok because new commits may arrive - # to both safekeepers and pageserver after we've already obtained - # safekeepers' state, it will look contradictory. - sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] - - branch_metrics = [] - with env.pageserver.http_client() as pageserver_http: - for branch_detail in branch_details: - timeline_id: str = branch_detail["timeline_id"] - - m = BranchMetrics( - name=branch_detail["name"], - latest_valid_lsn=branch_detail["latest_valid_lsn"], - ) - for sk_m in sk_metrics: - m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) - m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)]) - - for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): - # Invariant. May be < when transaction is in progress. - assert commit_lsn <= flush_lsn - # We only call collect_metrics() after a transaction is confirmed by - # the compute node, which only happens after a consensus of safekeepers - # has confirmed the transaction. We assume majority consensus here. - assert (2 * sum(m.latest_valid_lsn <= lsn - for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) - assert (2 * sum(m.latest_valid_lsn <= lsn - for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) - branch_metrics.append(m) - log.info(f"{message}: {branch_metrics}") - return branch_metrics - - # TODO: https://github.com/zenithdb/zenith/issues/809 - # collect_metrics("before CREATE TABLE") - - # Do everything in different loops to have actions on different timelines - # interleaved. 
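# The `2 * sum(...) > num_safekeepers` assertions in collect_metrics() above are a plain majority
# check. A minimal sketch of the same arithmetic (names here are illustrative only):
def lsn_confirmed_by_majority(target_lsn, safekeeper_lsns, num_safekeepers):
    # True when strictly more than half of the safekeepers have reached target_lsn or beyond.
    # With 3 safekeepers: 2 confirmations pass (2 * 2 > 3), 1 confirmation does not (2 * 1 > 3 is false).
    return 2 * sum(target_lsn <= lsn for lsn in safekeeper_lsns) > num_safekeepers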
- # create schema - for pg in pgs: - pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - init_m = collect_metrics("after CREATE TABLE") - - # Populate data for 2/3 branches - class MetricsChecker(threading.Thread): - def __init__(self) -> None: - super().__init__(daemon=True) - self.should_stop = threading.Event() - self.exception: Optional[BaseException] = None - - def run(self) -> None: - try: - while not self.should_stop.is_set(): - collect_metrics("during INSERT INTO") - time.sleep(1) - except: - log.error("MetricsChecker's thread failed, the test will be failed on .stop() call", - exc_info=True) - # We want to preserve traceback as well as the exception - exc_type, exc_value, exc_tb = sys.exc_info() - assert exc_type - e = exc_type(exc_value) - e.__traceback__ = exc_tb - self.exception = e - - def stop(self) -> None: - self.should_stop.set() - self.join() - if self.exception: - raise self.exception - - metrics_checker = MetricsChecker() - metrics_checker.start() - - for pg in pgs[:-1]: - pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - - metrics_checker.stop() - - collect_metrics("after INSERT INTO") - - # Check data for 2/3 branches - for pg in pgs[:-1]: - res = pg.safe_psql("SELECT sum(key) FROM t") - assert res[0] == (5000050000, ) - - final_m = collect_metrics("after SELECT") - # Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly. - # Also assume that safekeepers will not be significantly out of sync in this test. - middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2 - assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) - assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) - assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) - assert max(init_m[1].commit_lsns) < middle_lsn < min(final_m[1].commit_lsns) - assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn - assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn - - -# Check that dead minority doesn't prevent the commits: execute insert n_inserts -# times, with fault_probability chance of getting a wal acceptor down or up -# along the way. 2 of 3 are always alive, so the work keeps going. 
-def test_restarts(zenith_env_builder: ZenithEnvBuilder): - fault_probability = 0.01 - n_inserts = 1000 - n_acceptors = 3 - - zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main") - pg = env.postgres.create_start('test_wal_acceptors_restarts') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - failed_node = None - cur.execute('CREATE TABLE t(key int primary key, value text)') - for i in range(n_inserts): - cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, )) - - if random.random() <= fault_probability: - if failed_node is None: - failed_node = env.safekeepers[random.randrange(0, n_acceptors)] - failed_node.stop() - else: - failed_node.start() - failed_node = None - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (500500, ) - - -start_delay_sec = 2 - - -def delayed_wal_acceptor_start(wa): - time.sleep(start_delay_sec) - wa.start() - - -# When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main") - pg = env.postgres.create_start('test_wal_acceptors_unavailability') - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - # check basic work with table - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t values (1, 'payload')") - - # shutdown one of two acceptors, that is, majority - env.safekeepers[0].stop() - - proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[0], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (2, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - # for the world's balance, do the same with second acceptor - env.safekeepers[1].stop() - - proc = Process(target=delayed_wal_acceptor_start, args=(env.safekeepers[1], )) - proc.start() - - start = time.time() - cur.execute("INSERT INTO t values (3, 'payload')") - # ensure that the query above was hanging while acceptor was down - assert (time.time() - start) >= start_delay_sec - proc.join() - - cur.execute("INSERT INTO t values (4, 'payload')") - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (10, ) - - -# shut down random subset of acceptors, sleep, wake them up, rinse, repeat -def xmas_garland(acceptors, stop): - while not bool(stop.value): - victims = [] - for wa in acceptors: - if random.random() >= 0.5: - victims.append(wa) - for v in victims: - v.stop() - time.sleep(1) - for v in victims: - v.start() - time.sleep(1) - - -# value which gets unset on exit -@pytest.fixture -def stop_value(): - stop = Value('i', 0) - yield stop - stop.value = 1 - - -# do inserts while concurrently getting up/down subsets of acceptors -def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): - - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main") - pg = env.postgres.create_start('test_wal_acceptors_race_conditions') - - # we rely upon autocommit after each statement - # as waiting for 
acceptors happens there - pg_conn = pg.connect() - cur = pg_conn.cursor() - - cur.execute('CREATE TABLE t(key int primary key, value text)') - - proc = Process(target=xmas_garland, args=(env.safekeepers, stop_value)) - proc.start() - - for i in range(1000): - cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1, )) - - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (500500, ) - - stop_value.value = 1 - proc.join() - - -class ProposerPostgres(PgProtocol): - """Object for running postgres without ZenithEnv""" - def __init__(self, - pgdata_dir: str, - pg_bin, - timeline_id: uuid.UUID, - tenant_id: uuid.UUID, - listen_addr: str, - port: int): - super().__init__(host=listen_addr, port=port, username='zenith_admin') - - self.pgdata_dir: str = pgdata_dir - self.pg_bin: PgBin = pg_bin - self.timeline_id: uuid.UUID = timeline_id - self.tenant_id: uuid.UUID = tenant_id - self.listen_addr: str = listen_addr - self.port: int = port - - def pg_data_dir_path(self) -> str: - """ Path to data directory """ - return self.pgdata_dir - - def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pgdata_dir, 'postgresql.conf') - - def create_dir_config(self, wal_acceptors: str): - """ Create dir and config for running --sync-safekeepers """ - - mkdir_if_needed(self.pg_data_dir_path()) - with open(self.config_file_path(), "w") as f: - cfg = [ - "synchronous_standby_names = 'walproposer'\n", - "shared_preload_libraries = 'zenith'\n", - f"zenith.zenith_timeline = '{self.timeline_id.hex}'\n", - f"zenith.zenith_tenant = '{self.tenant_id.hex}'\n", - f"zenith.page_server_connstring = ''\n", - f"wal_acceptors = '{wal_acceptors}'\n", - f"listen_addresses = '{self.listen_addr}'\n", - f"port = '{self.port}'\n", - ] - - f.writelines(cfg) - - def sync_safekeepers(self) -> str: - """ - Run 'postgres --sync-safekeepers'. - Returns execution result, which is commit_lsn after sync. - """ - - command = ["postgres", "--sync-safekeepers"] - env = { - "PGDATA": self.pg_data_dir_path(), - } - - basepath = self.pg_bin.run_capture(command, env) - stdout_filename = basepath + '.stdout' - - with open(stdout_filename, 'r') as stdout_f: - stdout = stdout_f.read() - return stdout.strip("\n ") - - def initdb(self): - """ Run initdb """ - - args = ["initdb", "-U", "zenith_admin", "-D", self.pg_data_dir_path()] - self.pg_bin.run(args) - - def start(self): - """ Start postgres with pg_ctl """ - - log_path = os.path.join(self.pg_data_dir_path(), "pg.log") - args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-l", log_path, "-w", "start"] - self.pg_bin.run(args) - - def stop(self): - """ Stop postgres with pg_ctl """ - - args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-m", "immediate", "-w", "stop"] - self.pg_bin.run(args) - - -# insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, - pg_bin: PgBin, - port_distributor: PortDistributor): - - # We don't really need the full environment for this test, just the - # safekeepers would be enough. 
- zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - timeline_id = uuid.uuid4() - tenant_id = uuid.uuid4() - - # write config for proposer - pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - pg_bin, - timeline_id, - tenant_id, - '127.0.0.1', - port_distributor.get_port()) - pg.create_dir_config(env.get_safekeeper_connstrs()) - - # valid lsn, which is not in the segment start, nor in zero segment - epoch_start_lsn = 0x16B9188 # 0/16B9188 - begin_lsn = epoch_start_lsn - - # append and commit WAL - lsn_after_append = [] - for i in range(3): - res = env.safekeepers[i].append_logical_message( - tenant_id, - timeline_id, - { - "lm_prefix": "prefix", - "lm_message": "message", - "set_commit_lsn": True, - "send_proposer_elected": True, - "term": 2, - "begin_lsn": begin_lsn, - "epoch_start_lsn": epoch_start_lsn, - "truncate_lsn": epoch_start_lsn, - }, - ) - lsn_hex = lsn_to_hex(res["inserted_wal"]["end_lsn"]) - lsn_after_append.append(lsn_hex) - log.info(f"safekeeper[{i}] lsn after append: {lsn_hex}") - - # run sync safekeepers - lsn_after_sync = pg.sync_safekeepers() - log.info(f"lsn after sync = {lsn_after_sync}") - - assert all(lsn_after_sync == lsn for lsn in lsn_after_append) - - -def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): - - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_timeline_status", "main") - pg = env.postgres.create_start('test_timeline_status') - - wa = env.safekeepers[0] - wa_http_cli = wa.http_client() - wa_http_cli.check_status() - - # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] - - # fetch something sensible from status - epoch = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch - - pg.safe_psql("create table t(i int)") - - # ensure epoch goes up after reboot - pg.stop().start() - pg.safe_psql("insert into t values(10)") - - epoch_after_reboot = wa_http_cli.timeline_status(tenant_id, timeline_id).acceptor_epoch - assert epoch_after_reboot > epoch - - -class SafekeeperEnv: - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - pg_bin: PgBin, - num_safekeepers: int = 1): - self.repo_dir = repo_dir - self.port_distributor = port_distributor - self.pg_bin = pg_bin - self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(zenith_binpath), 'safekeeper') - self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None - self.postgres: Optional[ProposerPostgres] = None - self.tenant_id: Optional[uuid.UUID] = None - self.timeline_id: Optional[uuid.UUID] = None - - def init(self) -> "SafekeeperEnv": - assert self.postgres is None, "postgres is already initialized" - assert self.safekeepers is None, "safekeepers are already initialized" - - self.timeline_id = uuid.uuid4() - self.tenant_id = uuid.uuid4() - mkdir_if_needed(str(self.repo_dir)) - - # Create config and a Safekeeper object for each safekeeper - self.safekeepers = [] - for i in range(1, self.num_safekeepers + 1): - self.safekeepers.append(self.start_safekeeper(i)) - - # Create and start postgres - self.postgres = self.create_postgres() - self.postgres.start() - - return self - - def start_safekeeper(self, i): - port = SafekeeperPort( - pg=self.port_distributor.get_port(), - http=self.port_distributor.get_port(), - ) - - if self.num_safekeepers == 1: - name = "single" - else: - name = 
f"sk{i}" - - safekeeper_dir = os.path.join(self.repo_dir, name) - mkdir_if_needed(safekeeper_dir) - - args = [ - self.bin_safekeeper, - "-l", - f"127.0.0.1:{port.pg}", - "--listen-http", - f"127.0.0.1:{port.http}", - "-D", - safekeeper_dir, - "--daemonize" - ] - - log.info(f'Running command "{" ".join(args)}"') - return subprocess.run(args, check=True) - - def get_safekeeper_connstrs(self): - return ','.join([sk_proc.args[2] for sk_proc in self.safekeepers]) - - def create_postgres(self): - pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") - pg = ProposerPostgres(pgdata_dir, - self.pg_bin, - self.timeline_id, - self.tenant_id, - "127.0.0.1", - self.port_distributor.get_port()) - pg.initdb() - pg.create_dir_config(self.get_safekeeper_connstrs()) - return pg - - def kill_safekeeper(self, sk_dir): - """Read pid file and kill process""" - pid_file = os.path.join(sk_dir, "safekeeper.pid") - with open(pid_file, "r") as f: - pid = int(f.read()) - log.info(f"Killing safekeeper with pid {pid}") - os.kill(pid, signal.SIGKILL) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - log.info('Cleaning up all safekeeper and compute nodes') - - # Stop all the nodes - if self.postgres is not None: - self.postgres.stop() - if self.safekeepers is not None: - for sk_proc in self.safekeepers: - self.kill_safekeeper(sk_proc.args[6]) - - -def test_safekeeper_without_pageserver(test_output_dir: str, - port_distributor: PortDistributor, - pg_bin: PgBin): - # Create the environment in the test-specific output dir - repo_dir = Path(os.path.join(test_output_dir, "repo")) - - env = SafekeeperEnv( - repo_dir, - port_distributor, - pg_bin, - num_safekeepers=1, - ) - - with env: - env.init() - assert env.postgres is not None - - env.postgres.safe_psql("create table t(i int)") - env.postgres.safe_psql("insert into t select generate_series(1, 100)") - res = env.postgres.safe_psql("select sum(i) from t")[0][0] - assert res == 5050 - - -def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[str]) -> str: - return ','.join( - [f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.name in sk_names]) - - def execute_payload(pg: Postgres): - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE IF NOT EXISTS t(key int, value text)') - cur.execute("INSERT INTO t VALUES (0, 'something')") - cur.execute('SELECT SUM(key) FROM t') - sum_before = cur.fetchone()[0] - - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT SUM(key) FROM t') - sum_after = cur.fetchone()[0] - assert sum_after == sum_before + 5000050000 - - def show_statuses(safekeepers: List[Safekeeper], tenant_id: str, timeline_id: str): - for sk in safekeepers: - http_cli = sk.http_client() - try: - status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"Safekeeper {sk.name} status: {status}") - except Exception as e: - log.info(f"Safekeeper {sk.name} status error: {e}") - - zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_replace_safekeeper", "main") - - log.info("Use only first 3 safekeepers") - env.safekeepers[3].stop() - active_safekeepers = ['sk1', 'sk2', 'sk3'] - pg = env.postgres.create('test_replace_safekeeper') - pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) - pg.start() - - 
# learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] - - execute_payload(pg) - show_statuses(env.safekeepers, tenant_id, timeline_id) - - log.info("Restart all safekeepers to flush everything") - env.safekeepers[0].stop(immediate=True) - execute_payload(pg) - env.safekeepers[0].start() - env.safekeepers[1].stop(immediate=True) - execute_payload(pg) - env.safekeepers[1].start() - env.safekeepers[2].stop(immediate=True) - execute_payload(pg) - env.safekeepers[2].start() - - env.safekeepers[0].stop(immediate=True) - env.safekeepers[1].stop(immediate=True) - env.safekeepers[2].stop(immediate=True) - env.safekeepers[0].start() - env.safekeepers[1].start() - env.safekeepers[2].start() - - execute_payload(pg) - show_statuses(env.safekeepers, tenant_id, timeline_id) - - log.info("Stop sk1 (simulate failure) and use only quorum of sk2 and sk3") - env.safekeepers[0].stop(immediate=True) - execute_payload(pg) - show_statuses(env.safekeepers, tenant_id, timeline_id) - - log.info("Recreate postgres to replace failed sk1 with new sk4") - pg.stop_and_destroy().create('test_replace_safekeeper') - active_safekeepers = ['sk2', 'sk3', 'sk4'] - env.safekeepers[3].start() - pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) - pg.start() - - execute_payload(pg) - show_statuses(env.safekeepers, tenant_id, timeline_id) - - log.info("Stop sk2 to require quorum of sk3 and sk4 for normal work") - env.safekeepers[1].stop(immediate=True) - execute_payload(pg) - show_statuses(env.safekeepers, tenant_id, timeline_id) diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py deleted file mode 100644 index 1d2a186eb7..0000000000 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ /dev/null @@ -1,211 +0,0 @@ -import asyncio -import asyncpg -import random -import time - -from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper -from fixtures.log_helper import getLogger -from fixtures.utils import lsn_from_hex, lsn_to_hex -from typing import List - -log = getLogger('root.wal_acceptor_async') - - -class BankClient(object): - def __init__(self, conn: asyncpg.Connection, n_accounts, init_amount): - self.conn: asyncpg.Connection = conn - self.n_accounts = n_accounts - self.init_amount = init_amount - - async def initdb(self): - await self.conn.execute('DROP TABLE IF EXISTS bank_accs') - await self.conn.execute('CREATE TABLE bank_accs(uid int primary key, amount int)') - await self.conn.execute( - ''' - INSERT INTO bank_accs - SELECT *, $1 FROM generate_series(0, $2) - ''', - self.init_amount, - self.n_accounts - 1) - await self.conn.execute('DROP TABLE IF EXISTS bank_log') - await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)') - - # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed - await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)') - await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)') - - async def check_invariant(self): - row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs') - assert row['sum'] == self.n_accounts * self.init_amount - - -async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): - # avoid deadlocks by sorting uids - if from_uid > to_uid: - from_uid, to_uid, amount = to_uid, from_uid, -amount - - async with conn.transaction(): - await 
conn.execute( - 'UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2', - amount, - to_uid, - ) - await conn.execute( - 'UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2', - amount, - from_uid, - ) - await conn.execute( - 'INSERT INTO bank_log VALUES ($1, $2, $3)', - from_uid, - to_uid, - amount, - ) - - -class WorkerStats(object): - def __init__(self, n_workers): - self.counters = [0] * n_workers - self.running = True - - def reset(self): - self.counters = [0] * len(self.counters) - - def inc_progress(self, worker_id): - self.counters[worker_id] += 1 - - def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) - - # every worker should finish at least one tx - assert all(cnt > 0 for cnt in self.counters) - - progress = sum(self.counters) - log.info('All workers made {} transactions'.format(progress)) - - -async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): - pg_conn = await pg.connect_async() - log.debug('Started worker {}'.format(worker_id)) - - while stats.running: - from_uid = random.randint(0, n_accounts - 1) - to_uid = (from_uid + random.randint(1, n_accounts - 1)) % n_accounts - amount = random.randint(1, max_transfer) - - await bank_transfer(pg_conn, from_uid, to_uid, amount) - stats.inc_progress(worker_id) - - log.debug('Executed transfer({}) {} => {}'.format(amount, from_uid, to_uid)) - - log.debug('Finished worker {}'.format(worker_id)) - - await pg_conn.close() - - -async def wait_for_lsn(safekeeper: Safekeeper, - tenant_id: str, - timeline_id: str, - wait_lsn: str, - polling_interval=1, - timeout=60): - """ - Poll flush_lsn from safekeeper until it's greater or equal than - provided wait_lsn. To do that, timeline_status is fetched from - safekeeper every polling_interval seconds. - """ - - started_at = time.time() - client = safekeeper.http_client() - - flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn - log.info( - f'Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}' - ) - - while lsn_from_hex(wait_lsn) > lsn_from_hex(flush_lsn): - elapsed = time.time() - started_at - if elapsed > timeout: - raise RuntimeError( - f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}" - ) - - await asyncio.sleep(polling_interval) - flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn - log.debug(f'safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}') - - -# This test will run several iterations and check progress in each of them. -# On each iteration 1 acceptor is stopped, and 2 others should allow -# background workers execute transactions. In the end, state should remain -# consistent. -async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10): - n_accounts = 100 - init_amount = 100000 - max_transfer = 100 - period_time = 10 - iterations = 6 - - # Set timeout for this test at 5 minutes. It should be enough for test to complete - # and less than CircleCI's no_output_timeout, taking into account that this timeout - # is checked only at the beginning of every iteration. 
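# wait_for_lsn() above compares LSNs by converting their textual form to integers. The helpers live
# in fixtures.utils and are not shown in this diff; a minimal sketch of what they presumably do,
# based on PostgreSQL's standard 'X/Y' notation (high and low 32 bits in hex). The real
# implementations may differ:
def lsn_from_hex(lsn_hex):
    hi, lo = lsn_hex.split('/')
    return (int(hi, 16) << 32) + int(lo, 16)

def lsn_to_hex(lsn):
    # e.g. 0x16B9188 -> '0/16B9188'
    return f"{lsn >> 32:X}/{lsn & 0xFFFFFFFF:X}"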
- test_timeout_at = time.monotonic() + 5 * 60 - - pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") - - bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) - # create tables and initial balances - await bank.initdb() - - stats = WorkerStats(n_workers) - workers = [] - for worker_id in range(n_workers): - worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer) - workers.append(asyncio.create_task(worker)) - - for it in range(iterations): - assert time.monotonic() < test_timeout_at, 'test timed out' - - victim_idx = it % len(acceptors) - victim = acceptors[victim_idx] - victim.stop() - - flush_lsn = await pg_conn.fetchval('SELECT pg_current_wal_flush_lsn()') - flush_lsn = lsn_to_hex(flush_lsn) - log.info(f'Postgres flush_lsn {flush_lsn}') - - # Wait until alive safekeepers catch up with postgres - for idx, safekeeper in enumerate(acceptors): - if idx != victim_idx: - await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn) - - stats.reset() - await asyncio.sleep(period_time) - # assert that at least one transaction has completed in every worker - stats.check_progress() - - victim.start() - - log.info('Iterations are finished, exiting coroutines...') - stats.running = False - # await all workers - await asyncio.gather(*workers) - # assert balances sum hasn't changed - await bank.check_invariant() - await pg_conn.close() - - -# restart acceptors one by one, while executing and validating bank transactions -def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main") - pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') - - asyncio.run(run_restarts_under_load(pg, env.safekeepers)) - - # TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed - pg.stop() diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py deleted file mode 100644 index ce051dfd6e..0000000000 --- a/test_runner/batch_others/test_zenith_cli.py +++ /dev/null @@ -1,129 +0,0 @@ -import json -import uuid -import requests - -from psycopg2.extensions import cursor as PgCursor -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient -from typing import cast - - -def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, - initial_tenant: uuid.UUID): - """ - Compare branches list returned by CLI and directly via API. - Filters out branches created by other tests. 
- """ - branches = pageserver_http_client.branch_list(initial_tenant) - branches_api = sorted(map(lambda b: cast(str, b['name']), branches)) - branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')] - - res = env.zenith_cli.list_branches() - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')] - - res = env.zenith_cli.list_branches(tenant_id=initial_tenant) - branches_cli_with_tenant_arg = sorted( - map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli_with_tenant_arg = [ - b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main') - ] - - assert branches_api == branches_cli == branches_cli_with_tenant_arg - - -def test_cli_branch_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - pageserver_http_client = env.pageserver.http_client() - - # Initial sanity check - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - env.zenith_cli.create_branch("test_cli_branch_list_main", "empty") - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - - # Create a nested branch - res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main") - assert res.stderr == '' - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - - # Check that all new branches are visible via CLI - res = env.zenith_cli.list_branches() - assert res.stderr == '' - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - - assert 'test_cli_branch_list_main' in branches_cli - assert 'test_cli_branch_list_nested' in branches_cli - - -def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): - tenants = pageserver_http_client.tenant_list() - tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) - - res = env.zenith_cli.list_tenants() - assert res.stderr == '' - tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) - - assert tenants_api == tenants_cli - - -def test_cli_tenant_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - pageserver_http_client = env.pageserver.http_client() - # Initial sanity check - helper_compare_tenant_list(pageserver_http_client, env) - - # Create new tenant - tenant1 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant1) - - # check tenant1 appeared - helper_compare_tenant_list(pageserver_http_client, env) - - # Create new tenant - tenant2 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant2) - - # check tenant2 appeared - helper_compare_tenant_list(pageserver_http_client, env) - - res = env.zenith_cli.list_tenants() - tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) - - assert env.initial_tenant.hex in tenants - assert tenant1.hex in tenants - assert tenant2.hex in tenants - - -def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - - # Connect to sk port on v4 loopback - res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') - assert res.ok - - # FIXME Test setup is using localhost:xx in ps config. - # Perhaps consider switching test suite to v4 loopback. 
- - # Connect to ps port on v4 loopback - # res = requests.get(f'http://127.0.0.1:{env.pageserver.service_port.http}/v1/status') - # assert res.ok - - -def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): - # Start with single sk - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - - # Stop default ps/sk - env.zenith_cli.pageserver_stop() - env.zenith_cli.safekeeper_stop() - - # Default start - res = env.zenith_cli.raw_cli(["start"]) - res.check_returncode() - - # Default stop - res = env.zenith_cli.raw_cli(["stop"]) - res.check_returncode() diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py deleted file mode 100644 index ddafc3815b..0000000000 --- a/test_runner/batch_pg_regress/test_isolation.py +++ /dev/null @@ -1,47 +0,0 @@ -import os - -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir - - -def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env - - env.zenith_cli.create_branch("test_isolation", "empty") - # Connect to postgres and create a database called "regression". - # isolation tests use prepared transactions, so enable them - pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) - pg.safe_psql('CREATE DATABASE isolation_regression') - - # Create some local directories for pg_isolation_regress to run in. - runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) - - # Compute all the file locations that pg_isolation_regress will need. - build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/isolation') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'isolation_schedule') - pg_isolation_regress = os.path.join(build_path, 'pg_isolation_regress') - - pg_isolation_regress_command = [ - pg_isolation_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--inputdir={}'.format(src_path), - '--schedule={}'.format(schedule), - ] - - env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. - with capsys.disabled(): - pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py deleted file mode 100644 index 5199f65216..0000000000 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ /dev/null @@ -1,54 +0,0 @@ -import os - -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir - - -def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): - env = zenith_simple_env - - env.zenith_cli.create_branch("test_pg_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_pg_regress') - pg.safe_psql('CREATE DATABASE regression') - - # Create some local directories for pg_regress to run in. 
- runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) - - # Compute all the file locations that pg_regress will need. - build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'vendor/postgres/src/test/regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') - - pg_regress_command = [ - pg_regress, - '--bindir=""', - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), - ] - - env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. - with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_zenith_regress.py deleted file mode 100644 index 31d5b07093..0000000000 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import (ZenithEnv, - check_restored_datadir_content, - base_dir, - pg_distrib_dir) -from fixtures.log_helper import log - - -def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env - - env.zenith_cli.create_branch("test_zenith_regress", "empty") - # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_zenith_regress') - pg.safe_psql('CREATE DATABASE regression') - - # Create some local directories for pg_regress to run in. - runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) - - # Compute all the file locations that pg_regress will need. - # This test runs zenith specific tests - build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'test_runner/zenith_regress') - bindir = os.path.join(pg_distrib_dir, 'bin') - schedule = os.path.join(src_path, 'parallel_schedule') - pg_regress = os.path.join(build_path, 'pg_regress') - - pg_regress_command = [ - pg_regress, - '--use-existing', - '--bindir={}'.format(bindir), - '--dlpath={}'.format(build_path), - '--schedule={}'.format(schedule), - '--inputdir={}'.format(src_path), - ] - - log.info(pg_regress_command) - env_vars = { - 'PGPORT': str(pg.port), - 'PGUSER': pg.username, - 'PGHOST': pg.host, - } - - # Run the command. - # We don't capture the output. It's not too chatty, and it always - # logs the exact same data to `regression.out` anyway. 
- with capsys.disabled(): - pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - - # checkpoint one more time to ensure that the lsn we get is the latest one - pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] - - # Check that we restore the content of the datadir correctly - check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 59e415e3a8..8b7f6a2eea 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,7 @@ pytest_plugins = ( - "fixtures.zenith_fixtures", + "fixtures.neon_fixtures", "fixtures.benchmark_fixture", + "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", ) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 8fb7edda9c..fe936414c7 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,49 +1,47 @@ +import calendar import dataclasses +import enum import json import os -from pathlib import Path import re -import subprocess import timeit -import calendar -import enum -from datetime import datetime -import uuid -import pytest -from _pytest.config import Config -from _pytest.terminal import TerminalReporter import warnings - from contextlib import contextmanager +from datetime import datetime +from pathlib import Path # Type-related stuff -from typing import Iterator +from typing import Callable, ClassVar, Iterator, Optional + +import pytest +from _pytest.config import Config +from _pytest.config.argparsing import Parser +from _pytest.terminal import TerminalReporter +from fixtures.neon_fixtures import NeonPageserver +from fixtures.types import TenantId, TimelineId + """ This file contains fixtures for micro-benchmarks. -To use, declare the 'zenbenchmark' fixture in the test function. Run the -bencmark, and then record the result by calling zenbenchmark.record. For example: +To use, declare the `zenbenchmark` fixture in the test function. Run the +bencmark, and then record the result by calling `zenbenchmark.record`. For example: -import timeit -from fixtures.zenith_fixtures import ZenithEnv - -def test_mybench(zenith_simple_env: env, zenbenchmark): - - # Initialize the test - ... - - # Run the test, timing how long it takes - with zenbenchmark.record_duration('test_query'): - cur.execute('SELECT test_query(...)') - - # Record another measurement - zenbenchmark.record('speed_of_light', 300000, 'km/s') +>>> import timeit +>>> from fixtures.neon_fixtures import NeonEnv +>>> def test_mybench(neon_simple_env: NeonEnv, zenbenchmark): +... # Initialize the test +... ... +... # Run the test, timing how long it takes +... with zenbenchmark.record_duration('test_query'): +... cur.execute('SELECT test_query(...)') +... # Record another measurement +... zenbenchmark.record('speed_of_light', 300000, 'km/s') There's no need to import this file to use it. It should be declared as a plugin -inside conftest.py, and that makes it available to all tests. +inside `conftest.py`, and that makes it available to all tests. You can measure multiple things in one test, and record each one with a separate -call to zenbenchmark. For example, you could time the bulk loading that happens +call to `zenbenchmark`. For example, you could time the bulk loading that happens in the test initialization, or measure disk usage after the test query. """ @@ -51,77 +49,143 @@ in the test initialization, or measure disk usage after the test query. 
@dataclasses.dataclass class PgBenchRunResult: - scale: int number_of_clients: int number_of_threads: int number_of_transactions_actually_processed: int latency_average: float - latency_stddev: float - tps_including_connection_time: float - tps_excluding_connection_time: float - init_duration: float - init_start_timestamp: int - init_end_timestamp: int + latency_stddev: Optional[float] + tps: float run_duration: float run_start_timestamp: int run_end_timestamp: int + scale: int # TODO progress @classmethod - def parse_from_output( + def parse_from_stdout( cls, - out: 'subprocess.CompletedProcess[str]', - init_duration: float, - init_start_timestamp: int, - init_end_timestamp: int, + stdout: str, run_duration: float, run_start_timestamp: int, run_end_timestamp: int, ): - stdout_lines = out.stdout.splitlines() + stdout_lines = stdout.splitlines() + + latency_stddev = None + # we know significant parts of these values from test input # but to be precise take them from output - # scaling factor: 5 - assert "scaling factor" in stdout_lines[1] - scale = int(stdout_lines[1].split()[-1]) - # number of clients: 1 - assert "number of clients" in stdout_lines[3] - number_of_clients = int(stdout_lines[3].split()[-1]) - # number of threads: 1 - assert "number of threads" in stdout_lines[4] - number_of_threads = int(stdout_lines[4].split()[-1]) - # number of transactions actually processed: 1000/1000 - assert "number of transactions actually processed" in stdout_lines[6] - number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1]) - # latency average = 19.894 ms - assert "latency average" in stdout_lines[7] - latency_average = stdout_lines[7].split()[-2] - # latency stddev = 3.387 ms - assert "latency stddev" in stdout_lines[8] - latency_stddev = stdout_lines[8].split()[-2] - # tps = 50.219689 (including connections establishing) - assert "(including connections establishing)" in stdout_lines[9] - tps_including_connection_time = stdout_lines[9].split()[2] - # tps = 50.264435 (excluding connections establishing) - assert "(excluding connections establishing)" in stdout_lines[10] - tps_excluding_connection_time = stdout_lines[10].split()[2] + for line in stdout_lines: + # scaling factor: 5 + if line.startswith("scaling factor:"): + scale = int(line.split()[-1]) + # number of clients: 1 + if line.startswith("number of clients: "): + number_of_clients = int(line.split()[-1]) + # number of threads: 1 + if line.startswith("number of threads: "): + number_of_threads = int(line.split()[-1]) + # number of transactions actually processed: 1000/1000 + # OR + # number of transactions actually processed: 1000 + if line.startswith("number of transactions actually processed"): + if "/" in line: + number_of_transactions_actually_processed = int(line.split("/")[1]) + else: + number_of_transactions_actually_processed = int(line.split()[-1]) + # latency average = 19.894 ms + if line.startswith("latency average"): + latency_average = float(line.split()[-2]) + # latency stddev = 3.387 ms + # (only printed with some options) + if line.startswith("latency stddev"): + latency_stddev = float(line.split()[-2]) + + # Get the TPS without initial connection time. 
The format + # of the tps lines changed in pgbench v14, but we accept + # either format: + # + # pgbench v13 and below: + # tps = 50.219689 (including connections establishing) + # tps = 50.264435 (excluding connections establishing) + # + # pgbench v14: + # initial connection time = 3.858 ms + # tps = 309.281539 (without initial connection time) + if line.startswith("tps = ") and ( + "(excluding connections establishing)" in line + or "(without initial connection time)" in line + ): + tps = float(line.split()[2]) return cls( - scale=scale, number_of_clients=number_of_clients, number_of_threads=number_of_threads, number_of_transactions_actually_processed=number_of_transactions_actually_processed, - latency_average=float(latency_average), - latency_stddev=float(latency_stddev), - tps_including_connection_time=float(tps_including_connection_time), - tps_excluding_connection_time=float(tps_excluding_connection_time), - init_duration=init_duration, - init_start_timestamp=init_start_timestamp, - init_end_timestamp=init_end_timestamp, + latency_average=latency_average, + latency_stddev=latency_stddev, + tps=tps, run_duration=run_duration, run_start_timestamp=run_start_timestamp, run_end_timestamp=run_end_timestamp, + scale=scale, + ) + + +@dataclasses.dataclass +class PgBenchInitResult: + REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg] + r"done in (\d+\.\d+) s " + r"\(" + r"(?:drop tables (\d+\.\d+) s)?(?:, )?" + r"(?:create tables (\d+\.\d+) s)?(?:, )?" + r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" + r"(?:vacuum (\d+\.\d+) s)?(?:, )?" + r"(?:primary keys (\d+\.\d+) s)?(?:, )?" + r"\)\." + ) + + total: float + drop_tables: Optional[float] + create_tables: Optional[float] + client_side_generate: Optional[float] + vacuum: Optional[float] + primary_keys: Optional[float] + duration: float + start_timestamp: int + end_timestamp: int + + @classmethod + def parse_from_stderr( + cls, + stderr: str, + duration: float, + start_timestamp: int, + end_timestamp: int, + ): + # Parses pgbench initialize output for default initialization steps (dtgvp) + # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s). + + last_line = stderr.splitlines()[-1] + + if (m := cls.REGEX.match(last_line)) is not None: + total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ + float(v) for v in m.groups() if v is not None + ] + else: + raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") + + return cls( + total=total, + drop_tables=drop_tables, + create_tables=create_tables, + client_side_generate=client_side_generate, + vacuum=vacuum, + primary_keys=primary_keys, + duration=duration, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, ) @@ -129,19 +193,20 @@ class PgBenchRunResult: class MetricReport(str, enum.Enum): # str is a hack to make it json serializable # this means that this is a constant test parameter # like number of transactions, or number of clients - TEST_PARAM = 'test_param' + TEST_PARAM = "test_param" # reporter can use it to mark test runs with higher values as improvements - HIGHER_IS_BETTER = 'higher_is_better' + HIGHER_IS_BETTER = "higher_is_better" # the same but for lower values - LOWER_IS_BETTER = 'lower_is_better' + LOWER_IS_BETTER = "lower_is_better" -class ZenithBenchmarker: +class NeonBenchmarker: """ An object for recording benchmark results. 
This is created for each test function by the zenbenchmark fixture """ - def __init__(self, property_recorder): + + def __init__(self, property_recorder: Callable[[str, object], None]): # property recorder here is a pytest fixture provided by junitxml module # https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property self.property_recorder = property_recorder @@ -157,7 +222,7 @@ class ZenithBenchmarker: Record a benchmark result. """ # just to namespace the value - name = f"zenith_benchmarker_{metric_name}" + name = f"neon_benchmarker_{metric_name}" self.property_recorder( name, { @@ -169,7 +234,7 @@ class ZenithBenchmarker: ) @contextmanager - def record_duration(self, metric_name: str): + def record_duration(self, metric_name: str) -> Iterator[None]: """ Record a duration. Usage: @@ -187,72 +252,105 @@ class ZenithBenchmarker: report=MetricReport.LOWER_IS_BETTER, ) - def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult): - self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM) - self.record("number_of_clients", - pg_bench_result.number_of_clients, - '', - MetricReport.TEST_PARAM) - self.record("number_of_threads", - pg_bench_result.number_of_threads, - '', - MetricReport.TEST_PARAM) + def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult): self.record( - "number_of_transactions_actually_processed", + f"{prefix}.number_of_clients", + pg_bench_result.number_of_clients, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.number_of_threads", + pg_bench_result.number_of_threads, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, - '', - # thats because this is predefined by test matrix and doesnt change across runs + "", + # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) - self.record("latency_average", - pg_bench_result.latency_average, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) - self.record("latency_stddev", - pg_bench_result.latency_stddev, - unit="ms", - report=MetricReport.LOWER_IS_BETTER) - self.record("tps_including_connection_time", - pg_bench_result.tps_including_connection_time, - '', - report=MetricReport.HIGHER_IS_BETTER) - self.record("tps_excluding_connection_time", - pg_bench_result.tps_excluding_connection_time, - '', - report=MetricReport.HIGHER_IS_BETTER) - self.record("init_duration", - pg_bench_result.init_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - self.record("init_start_timestamp", - pg_bench_result.init_start_timestamp, - '', - MetricReport.TEST_PARAM) - self.record("init_end_timestamp", - pg_bench_result.init_end_timestamp, - '', - MetricReport.TEST_PARAM) - self.record("run_duration", - pg_bench_result.run_duration, - unit="s", - report=MetricReport.LOWER_IS_BETTER) - self.record("run_start_timestamp", - pg_bench_result.run_start_timestamp, - '', - MetricReport.TEST_PARAM) - self.record("run_end_timestamp", - pg_bench_result.run_end_timestamp, - '', - MetricReport.TEST_PARAM) + self.record( + f"{prefix}.latency_average", + pg_bench_result.latency_average, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + if pg_bench_result.latency_stddev is not None: + self.record( + f"{prefix}.latency_stddev", + pg_bench_result.latency_stddev, + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record(f"{prefix}.tps", pg_bench_result.tps, "", 
report=MetricReport.HIGHER_IS_BETTER) + self.record( + f"{prefix}.run_duration", + pg_bench_result.run_duration, + unit="s", + report=MetricReport.LOWER_IS_BETTER, + ) + self.record( + f"{prefix}.run_start_timestamp", + pg_bench_result.run_start_timestamp, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.run_end_timestamp", + pg_bench_result.run_end_timestamp, + "", + MetricReport.TEST_PARAM, + ) + self.record( + f"{prefix}.scale", + pg_bench_result.scale, + "", + MetricReport.TEST_PARAM, + ) - def get_io_writes(self, pageserver) -> int: + def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult): + test_params = [ + "start_timestamp", + "end_timestamp", + ] + for test_param in test_params: + self.record( + f"{prefix}.{test_param}", getattr(result, test_param), "", MetricReport.TEST_PARAM + ) + + metrics = [ + "duration", + "drop_tables", + "create_tables", + "client_side_generate", + "vacuum", + "primary_keys", + ] + for metric in metrics: + if (value := getattr(result, metric)) is not None: + self.record( + f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER + ) + + def get_io_writes(self, pageserver: NeonPageserver) -> int: """ Fetch the "cumulative # of bytes written" metric from the pageserver """ - # Fetch all the exposed prometheus metrics from page server - all_metrics = pageserver.http_client().get_metrics() - # Use a regular expression to extract the one we're interested in - # + metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}' + return self.get_int_counter_value(pageserver, metric_name) + + def get_peak_mem(self, pageserver: NeonPageserver) -> int: + """ + Fetch the "maxrss" metric from the pageserver + """ + metric_name = r"libmetrics_maxrss_kb" + return self.get_int_counter_value(pageserver, metric_name) + + def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int: + """Fetch the value of given int counter from pageserver metrics.""" # TODO: If we start to collect more of the prometheus metrics in the # performance test suite like this, we should refactor this to load and # parse all the metrics into a more convenient structure in one go. @@ -260,28 +358,18 @@ class ZenithBenchmarker: # The metric should be an integer, as it's a number of bytes. But in general # all prometheus metrics are floats. So to be pedantic, read it as a float # and round to integer. 
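# Minimal sketch of what get_int_counter_value() does, on a made-up metrics payload in the
# Prometheus text exposition format (the metric names are the ones used in this file; the values
# are invented for illustration):
import re

sample_metrics = (
    'libmetrics_maxrss_kb 123456\n'
    'libmetrics_disk_io_bytes_total{io_operation="write"} 7340032.0\n'
)
match = re.search(r"^libmetrics_maxrss_kb (\S+)$", sample_metrics, re.MULTILINE)
assert match is not None
peak_rss_kb = int(round(float(match.group(1))))  # 123456: read as a float, then rounded to int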
- matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$', - all_metrics, - re.MULTILINE) - assert matches - return int(round(float(matches.group(1)))) - - def get_peak_mem(self, pageserver) -> int: - """ - Fetch the "maxrss" metric from the pageserver - """ - # Fetch all the exposed prometheus metrics from page server all_metrics = pageserver.http_client().get_metrics() - # See comment in get_io_writes() - matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE) - assert matches + matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE) + assert matches, f"metric {metric_name} not found" return int(round(float(matches.group(1)))) - def get_timeline_size(self, repo_dir: Path, tenantid: uuid.UUID, timelineid: str): + def get_timeline_size( + self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId + ) -> int: """ Calculate the on-disk size of a timeline """ - path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid.hex, timelineid) + path = f"{repo_dir}/tenants/{tenant_id}/timelines/{timeline_id}" totalbytes = 0 for root, dirs, files in os.walk(path): @@ -291,7 +379,9 @@ class ZenithBenchmarker: return totalbytes @contextmanager - def record_pageserver_writes(self, pageserver, metric_name): + def record_pageserver_writes( + self, pageserver: NeonPageserver, metric_name: str + ) -> Iterator[None]: """ Record bytes written by the pageserver during a test. """ @@ -299,27 +389,29 @@ class ZenithBenchmarker: yield after = self.get_io_writes(pageserver) - self.record(metric_name, - round((after - before) / (1024 * 1024)), - "MB", - report=MetricReport.LOWER_IS_BETTER) + self.record( + metric_name, + round((after - before) / (1024 * 1024)), + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) @pytest.fixture(scope="function") -def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]: +def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. 
""" - benchmarker = ZenithBenchmarker(record_property) + benchmarker = NeonBenchmarker(record_property) yield benchmarker -def pytest_addoption(parser): +def pytest_addoption(parser: Parser): parser.addoption( "--out-dir", dest="out_dir", - help="Directory to ouput performance tests results to.", + help="Directory to output performance tests results to.", ) @@ -339,7 +431,9 @@ def get_out_path(target_dir: Path, revision: str) -> Path: # Hook to print the results at the end @pytest.hookimpl(hookwrapper=True) -def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config): +def pytest_terminal_summary( + terminalreporter: TerminalReporter, exitstatus: int, config: Config +) -> Iterator[None]: yield revision = os.getenv("GITHUB_SHA", "local") platform = os.getenv("PLATFORM", "local") @@ -354,18 +448,18 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, results = [] for name, report in reports.items(): - terminalreporter.write(f"{name}", green=True) - terminalreporter.line("") - if "[zenith" in name: - vanilla_report = reports.get(name.replace("[zenith", "[vanilla")) + # terminalreporter.write(f"{name}", green=True) + # terminalreporter.line("") + if "[neon" in name: + vanilla_report = reports.get(name.replace("[neon", "[vanilla")) if vanilla_report: for key, prop in report.user_properties: if prop["unit"] == "s": - zenith_value = prop["value"] + neon_value = prop["value"] vanilla_value = dict(vanilla_report.user_properties)[key]["value"] - ratio = float(zenith_value) / vanilla_value + ratio = float(neon_value) / vanilla_value - results.append((ratio, name.replace("[zenith", "[zenith/vanilla"), prop["name"])) + results.append((ratio, name.replace("[neon", "[neon/vanilla"), prop["name"])) results.sort(reverse=True) for ratio, test, prop in results: @@ -416,6 +510,5 @@ def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, return get_out_path(Path(out_dir), revision=revision).write_text( - json.dumps({ - "revision": revision, "platform": platform, "result": result - }, indent=4)) + json.dumps({"revision": revision, "platform": platform, "result": result}, indent=4) + ) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 570c787184..291f924379 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,20 +1,23 @@ -import pytest -from contextlib import contextmanager from abc import ABC, abstractmethod - -from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, ZenithEnv -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff -from typing import Iterator +from typing import Dict, Iterator, List + +import pytest +from _pytest.fixtures import FixtureRequest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres +from fixtures.pg_stats import PgStatTable class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. - This class is a helper class for the zenith_with_baseline fixture. See its documentation + This class is a helper class for the neon_with_baseline fixture. See its documentation for more details. 
""" + @property @abstractmethod def pg(self) -> PgProtocol: @@ -26,19 +29,20 @@ class PgCompare(ABC): pass @property - def zenbenchmark(self) -> ZenithBenchmarker: + @abstractmethod + def zenbenchmark(self) -> NeonBenchmarker: pass @abstractmethod - def flush(self) -> None: + def flush(self): pass @abstractmethod - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): pass @abstractmethod - def report_size(self) -> None: + def report_size(self): pass @contextmanager @@ -51,71 +55,120 @@ class PgCompare(ABC): def record_duration(self, out_name): pass + @contextmanager + def record_pg_stats(self, pg_stats: List[PgStatTable]) -> Iterator[None]: + init_data = self._retrieve_pg_stats(pg_stats) -class ZenithCompare(PgCompare): - """PgCompare interface for the zenith stack.""" - def __init__(self, - zenbenchmark: ZenithBenchmarker, - zenith_simple_env: ZenithEnv, - pg_bin: PgBin, - branch_name): - self.env = zenith_simple_env + yield + + data = self._retrieve_pg_stats(pg_stats) + + for k in set(init_data) & set(data): + self.zenbenchmark.record(k, data[k] - init_data[k], "", MetricReport.HIGHER_IS_BETTER) + + def _retrieve_pg_stats(self, pg_stats: List[PgStatTable]) -> Dict[str, int]: + results: Dict[str, int] = {} + + with self.pg.connect().cursor() as cur: + for pg_stat in pg_stats: + cur.execute(pg_stat.query) + row = cur.fetchone() + assert row is not None + assert len(row) == len(pg_stat.columns) + + for col, val in zip(pg_stat.columns, row): + results[f"{pg_stat.table}.{col}"] = int(val) + + return results + + +class NeonCompare(PgCompare): + """PgCompare interface for the neon stack.""" + + def __init__( + self, + zenbenchmark: NeonBenchmarker, + neon_simple_env: NeonEnv, + pg_bin: PgBin, + branch_name: str, + ): + self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin + self.pageserver_http_client = self.env.pageserver.http_client() # We only use one branch and one timeline - self.branch = branch_name - self.env.zenith_cli.create_branch(self.branch, "empty") - self._pg = self.env.postgres.create_start(self.branch) - self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] - - # Long-lived cursor, useful for flushing - self.psconn = self.env.pageserver.connect() - self.pscur = self.psconn.cursor() + self.env.neon_cli.create_branch(branch_name, "empty") + self._pg = self.env.postgres.create_start(branch_name) + self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] @property - def pg(self): + def pg(self) -> PgProtocol: return self._pg @property - def zenbenchmark(self): + def zenbenchmark(self) -> NeonBenchmarker: return self._zenbenchmark @property - def pg_bin(self): + def pg_bin(self) -> PgBin: return self._pg_bin def flush(self): - self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0") + self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) - def report_peak_memory_use(self) -> None: - self.zenbenchmark.record("peak_mem", - self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, - 'MB', - report=MetricReport.LOWER_IS_BETTER) + def compact(self): + self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline) - def report_size(self) -> None: - timeline_size = self.zenbenchmark.get_timeline_size(self.env.repo_dir, - self.env.initial_tenant, - self.timeline) - self.zenbenchmark.record('size', - timeline_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + def report_peak_memory_use(self): + self.zenbenchmark.record( + 
"peak_mem", + self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024, + "MB", + report=MetricReport.LOWER_IS_BETTER, + ) - def record_pageserver_writes(self, out_name): + def report_size(self): + timeline_size = self.zenbenchmark.get_timeline_size( + self.env.repo_dir, self.env.initial_tenant, self.timeline + ) + self.zenbenchmark.record( + "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + + params = f'{{tenant_id="{self.env.initial_tenant}",timeline_id="{self.timeline}"}}' + total_files = self.zenbenchmark.get_int_counter_value( + self.env.pageserver, "pageserver_created_persistent_files_total" + params + ) + total_bytes = self.zenbenchmark.get_int_counter_value( + self.env.pageserver, "pageserver_written_persistent_bytes_total" + params + ) + self.zenbenchmark.record( + "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + self.zenbenchmark.record( + "num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER + ) + + def record_pageserver_writes(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name) - def record_duration(self, out_name): + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_duration(out_name) class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" - def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres): + + def __init__(self, zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres): self._pg = vanilla_pg self._zenbenchmark = zenbenchmark - vanilla_pg.configure(['shared_buffers=1MB']) + vanilla_pg.configure( + [ + "shared_buffers=1MB", + "synchronous_commit=off", + ] + ) vanilla_pg.start() # Long-lived cursor, useful for flushing @@ -123,61 +176,112 @@ class VanillaCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self): + def pg(self) -> PgProtocol: return self._pg @property - def zenbenchmark(self): + def zenbenchmark(self) -> NeonBenchmarker: return self._zenbenchmark @property - def pg_bin(self): + def pg_bin(self) -> PgBin: return self._pg.pg_bin def flush(self): self.cur.execute("checkpoint") - def report_peak_memory_use(self) -> None: + def report_peak_memory_use(self): pass # TODO find something - def report_size(self) -> None: - data_size = self.pg.get_subdir_size('base') - self.zenbenchmark.record('data_size', - data_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) - wal_size = self.pg.get_subdir_size('pg_wal') - self.zenbenchmark.record('wal_size', - wal_size / (1024 * 1024), - 'MB', - report=MetricReport.LOWER_IS_BETTER) + def report_size(self): + data_size = self.pg.get_subdir_size("base") + self.zenbenchmark.record( + "data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) + wal_size = self.pg.get_subdir_size("pg_wal") + self.zenbenchmark.record( + "wal_size", wal_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER + ) @contextmanager - def record_pageserver_writes(self, out_name): + def record_pageserver_writes(self, out_name: str) -> Iterator[None]: yield # Do nothing - def record_duration(self, out_name): + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: return self.zenbenchmark.record_duration(out_name) -@pytest.fixture(scope='function') -def zenith_compare(request, zenbenchmark, pg_bin, zenith_simple_env) -> ZenithCompare: +class RemoteCompare(PgCompare): + """PgCompare interface 
for a remote postgres instance.""" + + def __init__(self, zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres): + self._pg = remote_pg + self._zenbenchmark = zenbenchmark + + # Long-lived cursor, useful for flushing + self.conn = self.pg.connect() + self.cur = self.conn.cursor() + + @property + def pg(self) -> PgProtocol: + return self._pg + + @property + def zenbenchmark(self) -> NeonBenchmarker: + return self._zenbenchmark + + @property + def pg_bin(self) -> PgBin: + return self._pg.pg_bin + + def flush(self): + # TODO: flush the remote pageserver + pass + + def report_peak_memory_use(self): + # TODO: get memory usage from remote pageserver + pass + + def report_size(self): + # TODO: get storage size from remote pageserver + pass + + @contextmanager + def record_pageserver_writes(self, out_name: str) -> Iterator[None]: + yield # Do nothing + + def record_duration(self, out_name: str) -> _GeneratorContextManager[None]: + return self.zenbenchmark.record_duration(out_name) + + +@pytest.fixture(scope="function") +def neon_compare( + request: FixtureRequest, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + neon_simple_env: NeonEnv, +) -> NeonCompare: branch_name = request.node.name - return ZenithCompare(zenbenchmark, zenith_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) -@pytest.fixture(scope='function') -def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare: +@pytest.fixture(scope="function") +def vanilla_compare(zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres) -> VanillaCompare: return VanillaCompare(zenbenchmark, vanilla_pg) -@pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"]) -def zenith_with_baseline(request) -> PgCompare: - """Parameterized fixture that helps compare zenith against vanilla postgres. +@pytest.fixture(scope="function") +def remote_compare(zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres) -> RemoteCompare: + return RemoteCompare(zenbenchmark, remote_pg) + + +@pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"]) +def neon_with_baseline(request: FixtureRequest) -> PgCompare: + """Parameterized fixture that helps compare neon against vanilla postgres. A test that uses this fixture turns into a parameterized test that runs against: 1. A vanilla postgres instance - 2. A simple zenith env (see zenith_simple_env) + 2. A simple neon env (see neon_simple_env) 3. Possibly other postgres protocol implementations. The main goal of this fixture is to make it easier for people to read and write @@ -189,12 +293,10 @@ def zenith_with_baseline(request) -> PgCompare: of that. If a test requires some one-off special implementation-specific logic, use of - isinstance(zenith_with_baseline, ZenithCompare) is encouraged. Though if that + isinstance(neon_with_baseline, NeonCompare) is encouraged. Though if that implementation-specific logic is widely useful across multiple tests, it might make sense to add methods to the PgCompare class. 
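    For example, an implementation-specific step can be guarded like this
    (hypothetical test body; `compact()` is only available on NeonCompare):

        from fixtures.compare_fixtures import NeonCompare, PgCompare

        def test_insert_then_compact(neon_with_baseline: PgCompare):
            env = neon_with_baseline
            env.pg.safe_psql("CREATE TABLE t AS SELECT generate_series(1, 100000) AS x")
            if isinstance(env, NeonCompare):
                env.compact()  # pageserver-side compaction, neon only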
""" - fixture = request.getfixturevalue(request.param) - if isinstance(fixture, PgCompare): - return fixture - else: - raise AssertionError(f"test error: fixture {request.param} is not PgCompare") + fixture = request.getfixturevalue(request.param) # type: ignore + assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" + return fixture diff --git a/test_runner/fixtures/log_helper.py b/test_runner/fixtures/log_helper.py index 9aa5f40bf3..17f2402391 100644 --- a/test_runner/fixtures/log_helper.py +++ b/test_runner/fixtures/log_helper.py @@ -1,5 +1,6 @@ import logging import logging.config + """ This file configures logging to use in python tests. Logs are automatically captured and shown in their @@ -22,20 +23,16 @@ https://docs.pytest.org/en/6.2.x/logging.html LOGGING = { "version": 1, "loggers": { - "root": { - "level": "INFO" - }, - "root.wal_acceptor_async": { - "level": "INFO" # a lot of logs on DEBUG level - } - } + "root": {"level": "INFO"}, + "root.safekeeper_async": {"level": "INFO"}, # a lot of logs on DEBUG level + }, } -def getLogger(name='root') -> logging.Logger: +def getLogger(name="root") -> logging.Logger: """Method to get logger for tests. - Should be used to get correctly initialized logger. """ + Should be used to get correctly initialized logger.""" return logging.getLogger(name) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py new file mode 100644 index 0000000000..86ab4425ed --- /dev/null +++ b/test_runner/fixtures/metrics.py @@ -0,0 +1,65 @@ +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +from prometheus_client.parser import text_string_to_metric_families +from prometheus_client.samples import Sample + + +class Metrics: + metrics: Dict[str, List[Sample]] + name: str + + def __init__(self, name: str = ""): + self.metrics = defaultdict(list) + self.name = name + + def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]: + res = [] + for sample in self.metrics[name]: + try: + if all(sample.labels[k] == v for k, v in filter.items()): + res.append(sample) + except KeyError: + pass + return res + + def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample: + res = self.query_all(name, filter or {}) + assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}" + return res[0] + + +def parse_metrics(text: str, name: str = "") -> Metrics: + metrics = Metrics(name) + gen = text_string_to_metric_families(text) + for family in gen: + for sample in family.samples: + metrics.metrics[sample.name].append(sample) + + return metrics + + +PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( + "pageserver_current_logical_size", + "pageserver_current_physical_size", + "pageserver_getpage_reconstruct_seconds_bucket", + "pageserver_getpage_reconstruct_seconds_count", + "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_io_operations_bytes_total", + "pageserver_io_operations_seconds_bucket", + "pageserver_io_operations_seconds_count", + "pageserver_io_operations_seconds_sum", + "pageserver_last_record_lsn", + "pageserver_materialized_cache_hits_total", + "pageserver_smgr_query_seconds_bucket", + "pageserver_smgr_query_seconds_count", + "pageserver_smgr_query_seconds_sum", + "pageserver_storage_operations_seconds_bucket", + "pageserver_storage_operations_seconds_count", + "pageserver_storage_operations_seconds_sum", + "pageserver_wait_lsn_seconds_bucket", + "pageserver_wait_lsn_seconds_count", + "pageserver_wait_lsn_seconds_sum", + "pageserver_created_persistent_files_total", + "pageserver_written_persistent_bytes_total", +) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py new file mode 100644 index 0000000000..f68c6a25db --- /dev/null +++ b/test_runner/fixtures/neon_fixtures.py @@ -0,0 +1,2905 @@ +from __future__ import annotations + +import abc +import asyncio +import enum +import filecmp +import json +import os +import re +import shutil +import socket +import subprocess +import tempfile +import textwrap +import time +import uuid +from contextlib import closing, contextmanager +from dataclasses import dataclass, field +from enum import Flag, auto +from functools import cached_property +from pathlib import Path +from types import TracebackType +from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast + +import asyncpg +import backoff # type: ignore +import boto3 +import jwt +import psycopg2 +import pytest +import requests +from _pytest.config import Config +from _pytest.fixtures import FixtureRequest +from fixtures.log_helper import log +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture + +# Type-related stuff +from psycopg2.extensions import connection as PgConnection +from psycopg2.extensions import cursor as PgCursor +from psycopg2.extensions import make_dsn, parse_dsn +from typing_extensions import Literal + +""" +This file contains pytest fixtures. A fixture is a test resource that can be +summoned by placing its name in the test's arguments. + +A fixture is created with the decorator @pytest.fixture decorator. +See docs: https://docs.pytest.org/en/6.2.x/fixture.html + +There are several environment variables that can control the running of tests: +NEON_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. + +There's no need to import this file to use it. It should be declared as a plugin +inside conftest.py, and that makes it available to all tests. + +Don't import functions from this file, or pytest will emit warnings. Instead +put directly-importable functions into utils.py or another separate file. +""" + +Env = Dict[str, str] + +DEFAULT_OUTPUT_DIR: str = "test_output" +DEFAULT_BRANCH_NAME: str = "main" +DEFAULT_PG_VERSION_DEFAULT: str = "14" + +BASE_PORT: int = 15000 +WORKER_PORT_NUM: int = 1000 + + +def pytest_configure(config: Config): + """ + Check that we do not overflow available ports range. 
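    With the defaults above (BASE_PORT = 15000, WORKER_PORT_NUM = 1000) this
    check allows at most 17 xdist workers: 15000 + 17 * 1000 = 32000 stays
    below the 32768 cutoff, while an 18th worker would need ports up to 33000
    and spill into the ephemeral range.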
+ """ + + numprocesses = config.getoption("numprocesses") + if ( + numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768 + ): # do not use ephemeral ports + raise Exception("Too many workers configured. Cannot distribute ports for services.") + + +@pytest.fixture(scope="session") +def base_dir() -> Iterator[Path]: + # find the base directory (currently this is the git root) + base_dir = get_self_dir().parent.parent + log.info(f"base_dir is {base_dir}") + + yield base_dir + + +@pytest.fixture(scope="session") +def neon_binpath(base_dir: Path) -> Iterator[Path]: + if os.getenv("REMOTE_ENV"): + # we are in remote env and do not have neon binaries locally + # this is the case for benchmarks run on self-hosted runner + return + + # Find the neon binaries. + if env_neon_bin := os.environ.get("NEON_BIN"): + binpath = Path(env_neon_bin) + else: + build_type = os.environ.get("BUILD_TYPE", "debug") + binpath = base_dir / "target" / build_type + log.info(f"neon_binpath is {binpath}") + + if not (binpath / "pageserver").exists(): + raise Exception(f"neon binaries not found at '{binpath}'") + + yield binpath + + +@pytest.fixture(scope="session") +def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: + if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): + distrib_dir = Path(env_postgres_bin).resolve() + else: + distrib_dir = base_dir / "pg_install" + + log.info(f"pg_distrib_dir is {distrib_dir}") + yield distrib_dir + + +@pytest.fixture(scope="session") +def top_output_dir(base_dir: Path) -> Iterator[Path]: + # Compute the top-level directory for all tests. + if env_test_output := os.environ.get("TEST_OUTPUT"): + output_dir = Path(env_test_output).resolve() + else: + output_dir = base_dir / DEFAULT_OUTPUT_DIR + output_dir.mkdir(exist_ok=True) + + log.info(f"top_output_dir is {output_dir}") + yield output_dir + + +@pytest.fixture(scope="session") +def pg_version() -> Iterator[str]: + if env_default_pg_version := os.environ.get("DEFAULT_PG_VERSION"): + version = env_default_pg_version + else: + version = DEFAULT_PG_VERSION_DEFAULT + + log.info(f"pg_version is {version}") + yield version + + +@pytest.fixture(scope="session") +def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: str) -> Iterator[Path]: + versioned_dir = pg_distrib_dir / f"v{pg_version}" + + psql_bin_path = versioned_dir / "bin/psql" + postgres_bin_path = versioned_dir / "bin/postgres" + + if os.getenv("REMOTE_ENV"): + # When testing against a remote server, we only need the client binary. + if not psql_bin_path.exists(): + raise Exception(f"psql not found at '{psql_bin_path}'") + else: + if not postgres_bin_path.exists(): + raise Exception(f"postgres not found at '{postgres_bin_path}'") + + log.info(f"versioned_pg_distrib_dir is {versioned_dir}") + yield versioned_dir + + +def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: + """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. + + This function can be used as a scope like this: + @pytest.fixture(scope=shareable_scope) + def myfixture(...) + ... 
+ """ + return "function" if os.environ.get("TEST_SHARED_FIXTURES") is None else "session" + + +@pytest.fixture(scope="session") +def worker_seq_no(worker_id: str) -> int: + # worker_id is a pytest-xdist fixture + # it can be master or gw + # parse it to always get a number + if worker_id == "master": + return 0 + assert worker_id.startswith("gw") + return int(worker_id[2:]) + + +@pytest.fixture(scope="session") +def worker_base_port(worker_seq_no: int) -> int: + # so we divide ports in ranges of 100 ports + # so workers have disjoint set of ports for services + return BASE_PORT + worker_seq_no * WORKER_PORT_NUM + + +def get_dir_size(path: str) -> int: + """Return size in bytes.""" + totalbytes = 0 + for root, dirs, files in os.walk(path): + for name in files: + totalbytes += os.path.getsize(os.path.join(root, name)) + + return totalbytes + + +def can_bind(host: str, port: int) -> bool: + """ + Check whether a host:port is available to bind for listening + + Inspired by the can_bind() perl function used in Postgres tests, in + vendor/postgres-v14/src/test/perl/PostgresNode.pm + """ + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the + # moment. If that changes, we should use start using SO_REUSEADDR here + # too, to allow reusing ports more quickly. + # See https://github.com/neondatabase/neon/issues/801 + # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + + try: + sock.bind((host, port)) + sock.listen() + return True + except socket.error: + log.info(f"Port {port} is in use, skipping") + return False + finally: + sock.close() + + +class PortDistributor: + def __init__(self, base_port: int, port_number: int): + self.iterator = iter(range(base_port, base_port + port_number)) + self.port_map: Dict[int, int] = {} + + def get_port(self) -> int: + for port in self.iterator: + if can_bind("localhost", port): + return port + raise RuntimeError( + "port range configured for test is exhausted, consider enlarging the range" + ) + + def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + """ + Returns a new port for a port number in a string (like "localhost:1234") or int. + Replacements are memorised, so a substitution for the same port is always the same. 
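    For example (the substituted ports depend on what happens to be free on
    the host, so the values shown are only indicative):

        pd = PortDistributor(base_port=15000, port_number=1000)
        pd.replace_with_new_port(5432)               # e.g. 15000
        pd.replace_with_new_port("localhost:5432")   # same substitution again, e.g. "localhost:15000"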
+ """ + + # TODO: replace with structural pattern matching for Python >= 3.10 + if isinstance(value, int): + return self._replace_port_int(value) + + if isinstance(value, str): + return self._replace_port_str(value) + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + def _replace_port_int(self, value: int) -> int: + known_port = self.port_map.get(value) + if known_port is None: + known_port = self.port_map[value] = self.get_port() + + return known_port + + def _replace_port_str(self, value: str) -> str: + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}") + + +@pytest.fixture(scope="session") +def port_distributor(worker_base_port: int) -> PortDistributor: + return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) + + +@pytest.fixture(scope="session") +def default_broker( + request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path +) -> Iterator[Etcd]: + client_port = port_distributor.get_port() + # multiple pytest sessions could get launched in parallel, get them different datadirs + etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" + etcd_datadir.mkdir(exist_ok=True, parents=True) + + broker = Etcd( + datadir=str(etcd_datadir), port=client_port, peer_port=port_distributor.get_port() + ) + yield broker + broker.stop() + allure_attach_from_dir(etcd_datadir) + + +@pytest.fixture(scope="session") +def run_id() -> Iterator[uuid.UUID]: + yield uuid.uuid4() + + +@pytest.fixture(scope="session") +def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]: + mock_s3_server = MockS3Server(port_distributor.get_port()) + yield mock_s3_server + mock_s3_server.kill() + + +class PgProtocol: + """Reusable connection logic""" + + def __init__(self, **kwargs: Any): + self.default_options = kwargs + + def connstr(self, **kwargs: Any) -> str: + """ + Build a libpq connection string for the Postgres instance. + """ + return str(make_dsn(**self.conn_options(**kwargs))) + + def conn_options(self, **kwargs: Any) -> Dict[str, Any]: + """ + Construct a dictionary of connection options from default values and extra parameters. + An option can be dropped from the returning dictionary by None-valued extra parameter. + """ + result = self.default_options.copy() + if "dsn" in kwargs: + result.update(parse_dsn(kwargs["dsn"])) + result.update(kwargs) + result = {k: v for k, v in result.items() if v is not None} + + # Individual statement timeout in seconds. 2 minutes should be + # enough for our tests, but if you need a longer, you can + # change it by calling "SET statement_timeout" after + # connecting. + options = result.get("options", "") + if "statement_timeout" not in options: + options = f"-cstatement_timeout=120s {options}" + result["options"] = options + return result + + # autocommit=True here by default because that's what we need most of the time + def connect(self, autocommit: bool = True, **kwargs: Any) -> PgConnection: + """ + Connect to the node. + Returns psycopg2's connection object. + This method passes all extra params to connstr. 
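        For one-off queries the `cursor` and `safe_psql` helpers below are
        usually more convenient; direct use looks roughly like this, given
        some started node `pg` (illustrative):

            conn = pg.connect()  # autocommit is enabled by default
            try:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1")
                    assert cur.fetchone() == (1,)
            finally:
                conn.close()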
+ """ + conn = psycopg2.connect(**self.conn_options(**kwargs)) + + # WARNING: this setting affects *all* tests! + conn.autocommit = autocommit + return conn + + @contextmanager + def cursor(self, autocommit: bool = True, **kwargs: Any) -> Iterator[PgCursor]: + """ + Shorthand for pg.connect().cursor(). + The cursor and connection are closed when the context is exited. + """ + with closing(self.connect(autocommit=autocommit, **kwargs)) as conn: + yield conn.cursor() + + async def connect_async(self, **kwargs: Any) -> asyncpg.Connection: + """ + Connect to the node from async python. + Returns asyncpg's connection object. + """ + + # asyncpg takes slightly different options than psycopg2. Try + # to convert the defaults from the psycopg2 format. + + # The psycopg2 option 'dbname' is called 'database' is asyncpg + conn_options = self.conn_options(**kwargs) + if "dbname" in conn_options: + conn_options["database"] = conn_options.pop("dbname") + + # Convert options='-c=' to server_settings + if "options" in conn_options: + options = conn_options.pop("options") + for match in re.finditer(r"-c(\w*)=(\w*)", options): + key = match.group(1) + val = match.group(2) + if "server_options" in conn_options: + conn_options["server_settings"].update({key: val}) + else: + conn_options["server_settings"] = {key: val} + return await asyncpg.connect(**conn_options) + + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: + """ + Execute query against the node and return all rows. + This method passes all extra params to connstr. + """ + return self.safe_psql_many([query], **kwargs)[0] + + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. + """ + result: List[List[Any]] = [] + with closing(self.connect(**kwargs)) as conn: + with conn.cursor() as cur: + for query in queries: + log.info(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cur.fetchall()) + return result + + +@dataclass +class AuthKeys: + pub: str + priv: str + + def generate_management_token(self) -> str: + token = jwt.encode({"scope": "pageserverapi"}, self.priv, algorithm="RS256") + + # jwt.encode can return 'bytes' or 'str', depending on Python version or type + # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' + # explicitly. + if isinstance(token, bytes): + token = token.decode() + + return token + + def generate_tenant_token(self, tenant_id: TenantId) -> str: + token = jwt.encode( + {"scope": "tenant", "tenant_id": str(tenant_id)}, + self.priv, + algorithm="RS256", + ) + + if isinstance(token, bytes): + token = token.decode() + + return token + + +class MockS3Server: + """ + Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. + Relies that `poetry` and `moto` server are installed, since it's the way the tests are run. + + Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. + """ + + def __init__( + self, + port: int, + ): + self.port = port + + # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. + # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux + # if a process is started from the shell process. 
+ self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + error = None + try: + return_code = self.subprocess.poll() + if return_code is not None: + error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" + except Exception as e: + error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" + if error is not None: + log.error(error) + self.kill() + raise RuntimeError("failed to start s3 mock server") + + def endpoint(self) -> str: + return f"http://127.0.0.1:{self.port}" + + def region(self) -> str: + return "us-east-1" + + def access_key(self) -> str: + return "test" + + def secret_key(self) -> str: + return "test" + + def kill(self): + self.subprocess.kill() + + +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" + # Pass to tests that are generic to remote storage + # to ensure the test pass with or without the remote storage + NOOP = "noop" + + +def available_remote_storages() -> List[RemoteStorageKind]: + remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: + remote_storages.append(RemoteStorageKind.REAL_S3) + log.info("Enabling real s3 storage for tests") + else: + log.info("Using mock implementations to test remote storage") + return remote_storages + + +@dataclass +class LocalFsStorage: + root: Path + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + access_key: str + secret_key: str + endpoint: Optional[str] = None + prefix_in_bucket: Optional[str] = None + + def access_env_vars(self) -> Dict[str, str]: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: + if isinstance(remote_storage, LocalFsStorage): + remote_storage_config = f"local_path='{remote_storage.root}'" + elif isinstance(remote_storage, S3Storage): + remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\ + bucket_region='{remote_storage.bucket_region}'" + + if remote_storage.prefix_in_bucket is not None: + remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'" + + if remote_storage.endpoint is not None: + remote_storage_config += f",endpoint='{remote_storage.endpoint}'" + else: + raise Exception("invalid remote storage type") + + return f"{{{remote_storage_config}}}" + + +class RemoteStorageUsers(Flag): + PAGESERVER = auto() + SAFEKEEPER = auto() + + +class NeonEnvBuilder: + """ + Builder object to create a Neon runtime environment + + You should use the `neon_env_builder` or `neon_simple_env` pytest + fixture to create the NeonEnv object. That way, the repository is + created in the right directory, based on the test name, and it's properly + cleaned up after the test has finished. 
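    A typical test-side use goes through the `neon_env_builder` fixture
    (hypothetical test name; three safekeepers chosen arbitrarily):

        def test_with_three_safekeepers(neon_env_builder: NeonEnvBuilder):
            neon_env_builder.num_safekeepers = 3
            env = neon_env_builder.init_start()
            assert len(env.safekeepers) == 3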
+ """ + + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + broker: Etcd, + run_id: uuid.UUID, + mock_s3_server: MockS3Server, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, + remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, + pageserver_config_override: Optional[str] = None, + num_safekeepers: int = 1, + # Use non-standard SK ids to check for various parsing bugs + safekeepers_id_start: int = 0, + # fsync is disabled by default to make the tests go faster + safekeepers_enable_fsync: bool = False, + auth_enabled: bool = False, + rust_log_override: Optional[str] = None, + default_branch_name: str = DEFAULT_BRANCH_NAME, + ): + self.repo_dir = repo_dir + self.rust_log_override = rust_log_override + self.port_distributor = port_distributor + self.remote_storage = remote_storage + self.remote_storage_users = remote_storage_users + self.broker = broker + self.run_id = run_id + self.mock_s3_server = mock_s3_server + self.pageserver_config_override = pageserver_config_override + self.num_safekeepers = num_safekeepers + self.safekeepers_id_start = safekeepers_id_start + self.safekeepers_enable_fsync = safekeepers_enable_fsync + self.auth_enabled = auth_enabled + self.default_branch_name = default_branch_name + self.env: Optional[NeonEnv] = None + self.remote_storage_prefix: Optional[str] = None + self.keep_remote_storage_contents: bool = True + self.neon_binpath = neon_binpath + self.pg_distrib_dir = pg_distrib_dir + self.pg_version = pg_version + + def init(self) -> NeonEnv: + # Cannot create more than one environment from one builder + assert self.env is None, "environment already initialized" + self.env = NeonEnv(self) + return self.env + + def start(self): + self.env.start() + + def init_start(self) -> NeonEnv: + env = self.init() + self.start() + return env + + def enable_remote_storage( + self, + remote_storage_kind: RemoteStorageKind, + test_name: str, + force_enable: bool = True, + ): + if remote_storage_kind == RemoteStorageKind.NOOP: + return + elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: + self.enable_local_fs_remote_storage(force_enable=force_enable) + elif remote_storage_kind == RemoteStorageKind.MOCK_S3: + self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) + elif remote_storage_kind == RemoteStorageKind.REAL_S3: + self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable) + else: + raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") + + def enable_local_fs_remote_storage(self, force_enable: bool = True): + """ + Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. + Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. + """ + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) + + def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True): + """ + Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. + Starts up the mock server, if that does not run yet. + Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. 
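        This is usually reached through the kind-dispatching helper rather
        than called directly, e.g. (the test name here is arbitrary):

            neon_env_builder.enable_remote_storage(
                remote_storage_kind=RemoteStorageKind.MOCK_S3,
                test_name="test_remote_storage_example",
            )
            env = neon_env_builder.init_start()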
+ """ + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + mock_endpoint = self.mock_s3_server.endpoint() + mock_region = self.mock_s3_server.region() + + self.remote_storage_client = boto3.client( + "s3", + endpoint_url=mock_endpoint, + region_name=mock_region, + aws_access_key_id=self.mock_s3_server.access_key(), + aws_secret_access_key=self.mock_s3_server.secret_key(), + ) + self.remote_storage_client.create_bucket(Bucket=bucket_name) + + self.remote_storage = S3Storage( + bucket_name=bucket_name, + endpoint=mock_endpoint, + bucket_region=mock_region, + access_key=self.mock_s3_server.access_key(), + secret_key=self.mock_s3_server.secret_key(), + ) + + def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True): + """ + Sets up configuration to use real s3 endpoint without mock server + """ + assert force_enable or self.remote_storage is None, "remote storage is enabled already" + + access_key = os.getenv("AWS_ACCESS_KEY_ID") + assert access_key, "no aws access key provided" + secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + assert secret_key, "no aws access key provided" + + # session token is needed for local runs with sso auth + session_token = os.getenv("AWS_SESSION_TOKEN") + + bucket_name = os.getenv("REMOTE_STORAGE_S3_BUCKET") + assert bucket_name, "no remote storage bucket name provided" + region = os.getenv("REMOTE_STORAGE_S3_REGION") + assert region, "no remote storage region provided" + + # do not leave data in real s3 + self.keep_remote_storage_contents = False + + # construct a prefix inside bucket for the particular test case and test run + self.remote_storage_prefix = f"{self.run_id}/{test_name}" + + self.remote_storage_client = boto3.client( + "s3", + region_name=region, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + aws_session_token=session_token, + ) + self.remote_storage = S3Storage( + bucket_name=bucket_name, + bucket_region=region, + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket=self.remote_storage_prefix, + ) + + def cleanup_remote_storage(self): + # here wee check for true remote storage, no the local one + # local cleanup is not needed after test because in ci all env will be destroyed anyway + if self.remote_storage_prefix is None: + log.info("no remote storage was set up, skipping cleanup") + return + + if self.keep_remote_storage_contents: + log.info("keep_remote_storage_contents skipping remote storage cleanup") + return + + log.info( + "removing data from test s3 bucket %s by prefix %s", + self.remote_storage.bucket_name, + self.remote_storage_prefix, + ) + paginator = self.remote_storage_client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=self.remote_storage.bucket_name, + Prefix=self.remote_storage_prefix, + ) + + objects_to_delete = {"Objects": []} + cnt = 0 + for item in pages.search("Contents"): + # weirdly when nothing is found it returns [None] + if item is None: + break + + objects_to_delete["Objects"].append({"Key": item["Key"]}) + + # flush once aws limit reached + if len(objects_to_delete["Objects"]) >= 1000: + self.remote_storage_client.delete_objects( + Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete, + ) + objects_to_delete = dict(Objects=[]) + cnt += 1 + + # flush rest + if len(objects_to_delete["Objects"]): + self.remote_storage_client.delete_objects( + Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + ) + + log.info("deleted %s objects from remote storage", cnt) + + def 
__enter__(self) -> "NeonEnvBuilder": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ): + # Stop all the nodes. + if self.env: + log.info("Cleaning up all storage and compute nodes") + self.env.postgres.stop_all() + for sk in self.env.safekeepers: + sk.stop(immediate=True) + self.env.pageserver.stop(immediate=True) + + self.cleanup_remote_storage() + + +class NeonEnv: + """ + An object representing the Neon runtime environment. It consists of + the page server, 0-N safekeepers, and the compute nodes. + + NeonEnv contains functions for stopping/starting nodes in the + environment, checking their status, creating tenants, connecting to the + nodes, creating and destroying compute nodes, etc. The page server and + the safekeepers are considered fixed in the environment, you cannot + create or destroy them after the environment is initialized. (That will + likely change in the future, as we start supporting multiple page + servers and adding/removing safekeepers on the fly). + + Some notable functions and fields in NeonEnv: + + postgres - A factory object for creating postgres compute nodes. + + pageserver - An object that contains functions for manipulating and + connecting to the pageserver + + safekeepers - An array containing objects representing the safekeepers + + pg_bin - pg_bin.run() can be used to execute Postgres client binaries, + like psql or pg_dump + + initial_tenant - tenant ID of the initial tenant created in the repository + + neon_cli - can be used to run the 'neon' CLI tool + + create_tenant() - initializes a new tenant in the page server, returns + the tenant id + """ + + def __init__(self, config: NeonEnvBuilder): + self.repo_dir = config.repo_dir + self.rust_log_override = config.rust_log_override + self.port_distributor = config.port_distributor + self.s3_mock_server = config.mock_s3_server + self.neon_cli = NeonCli(env=self) + self.postgres = PostgresFactory(self) + self.safekeepers: List[Safekeeper] = [] + self.broker = config.broker + self.remote_storage = config.remote_storage + self.remote_storage_users = config.remote_storage_users + self.pg_version = config.pg_version + self.neon_binpath = config.neon_binpath + self.pg_distrib_dir = config.pg_distrib_dir + + # generate initial tenant ID here instead of letting 'neon init' generate it, + # so that we don't need to dig it out of the config file afterwards. 
+ self.initial_tenant = TenantId.generate() + + # Create a config file corresponding to the options + toml = textwrap.dedent( + f""" + default_tenant_id = '{self.initial_tenant}' + """ + ) + + toml += textwrap.dedent( + f""" + [etcd_broker] + broker_endpoints = ['{self.broker.client_url()}'] + etcd_binary_path = '{self.broker.binary_path}' + """ + ) + + # Create config for pageserver + pageserver_port = PageserverPort( + pg=self.port_distributor.get_port(), + http=self.port_distributor.get_port(), + ) + pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust" + + toml += textwrap.dedent( + f""" + [pageserver] + id=1 + listen_pg_addr = 'localhost:{pageserver_port.pg}' + listen_http_addr = 'localhost:{pageserver_port.http}' + auth_type = '{pageserver_auth_type}' + """ + ) + + # Create a corresponding NeonPageserver object + self.pageserver = NeonPageserver( + self, port=pageserver_port, config_override=config.pageserver_config_override + ) + + # Create config and a Safekeeper object for each safekeeper + for i in range(1, config.num_safekeepers + 1): + port = SafekeeperPort( + pg=self.port_distributor.get_port(), + http=self.port_distributor.get_port(), + ) + id = config.safekeepers_id_start + i # assign ids sequentially + toml += textwrap.dedent( + f""" + [[safekeepers]] + id = {id} + pg_port = {port.pg} + http_port = {port.http} + sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" + ) + if config.auth_enabled: + toml += textwrap.dedent( + """ + auth_enabled = true + """ + ) + if ( + bool(self.remote_storage_users & RemoteStorageUsers.SAFEKEEPER) + and self.remote_storage is not None + ): + toml += textwrap.dedent( + f""" + remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + """ + ) + safekeeper = Safekeeper(env=self, id=id, port=port) + self.safekeepers.append(safekeeper) + + log.info(f"Config: {toml}") + self.neon_cli.init(toml) + + def start(self): + # Start up broker, pageserver and all safekeepers + self.broker.try_start() + self.pageserver.start() + + for safekeeper in self.safekeepers: + safekeeper.start() + + def get_safekeeper_connstrs(self) -> str: + """Get list of safekeeper endpoints suitable for safekeepers GUC""" + return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) + + def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: + """Get a timeline directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + + def get_pageserver_version(self) -> str: + bin_pageserver = str(self.neon_binpath / "pageserver") + res = subprocess.run( + [bin_pageserver, "--version"], + check=True, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return res.stdout + + @cached_property + def auth_keys(self) -> AuthKeys: + pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text() + priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text() + return AuthKeys(pub=pub, priv=priv) + + +@pytest.fixture(scope=shareable_scope) +def _shared_simple_env( + request: FixtureRequest, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd, + run_id: uuid.UUID, + top_output_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, +) -> Iterator[NeonEnv]: + """ + # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES + is set, this is shared by all tests using `neon_simple_env`. 
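    Tests normally consume this through the public `neon_simple_env` fixture,
    e.g. (hypothetical test; the branch and table names are made up):

        def test_simple(neon_simple_env: NeonEnv):
            env = neon_simple_env
            env.neon_cli.create_branch("test_simple", "empty")
            pg = env.postgres.create_start("test_simple")
            pg.safe_psql("CREATE TABLE t (x int)")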
+ """ + + if os.environ.get("TEST_SHARED_FIXTURES") is None: + # Create the environment in the per-test output directory + repo_dir = get_test_output_dir(request, top_output_dir) / "repo" + else: + # We're running shared fixtures. Share a single directory. + repo_dir = top_output_dir / "shared_repo" + shutil.rmtree(repo_dir, ignore_errors=True) + + with NeonEnvBuilder( + repo_dir=repo_dir, + port_distributor=port_distributor, + broker=default_broker, + mock_s3_server=mock_s3_server, + neon_binpath=neon_binpath, + pg_distrib_dir=pg_distrib_dir, + pg_version=pg_version, + run_id=run_id, + ) as builder: + env = builder.init_start() + + # For convenience in tests, create a branch from the freshly-initialized cluster. + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) + + yield env + + +@pytest.fixture(scope="function") +def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: + """ + Simple Neon environment, with no authentication and no safekeepers. + + If TEST_SHARED_FIXTURES environment variable is set, we reuse the same + environment for all tests that use 'neon_simple_env', keeping the + page server and safekeepers running. Any compute nodes are stopped after + each the test, however. + """ + yield _shared_simple_env + + _shared_simple_env.postgres.stop_all() + + +@pytest.fixture(scope="function") +def neon_env_builder( + test_output_dir: str, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, + default_broker: Etcd, + run_id: uuid.UUID, +) -> Iterator[NeonEnvBuilder]: + """ + Fixture to create a Neon environment for test. + + To use, define 'neon_env_builder' fixture in your test to get access to the + builder object. Set properties on it to describe the environment. + Finally, initialize and start up the environment by calling + neon_env_builder.init_start(). + + After the initialization, you can launch compute nodes by calling + the functions in the 'env.postgres' factory object, stop/start the + nodes, etc. 
+ """ + + # Create the environment in the test-specific output dir + repo_dir = os.path.join(test_output_dir, "repo") + + # Return the builder to the caller + with NeonEnvBuilder( + repo_dir=Path(repo_dir), + port_distributor=port_distributor, + mock_s3_server=mock_s3_server, + neon_binpath=neon_binpath, + pg_distrib_dir=pg_distrib_dir, + pg_version=pg_version, + broker=default_broker, + run_id=run_id, + ) as builder: + yield builder + + +class PageserverApiException(Exception): + pass + + +class PageserverHttpClient(requests.Session): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled_or_skip = is_testing_enabled_or_skip + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def verbose_error(self, res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise PageserverApiException(msg) from e + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def tenant_list(self) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: + res = self.post( + f"http://localhost:{self.port}/v1/tenant", + json={ + "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return TenantId(new_tenant_id) + + def tenant_attach(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") + self.verbose_error(res) + + def tenant_detach(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") + self.verbose_error(res) + + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_size(self, tenant_id: TenantId) -> int: + """ + Returns the tenant size, together with the model inputs as the second tuple item. 
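        (As written, only the size integer is returned; the raw model inputs
        are present in the HTTP response but are not exposed by this helper.)

        A minimal call site, assuming a test environment `env` with a running
        pageserver (illustrative):

            client = env.pageserver.http_client()
            size = client.tenant_size(env.initial_tenant)
            assert isinstance(size, int)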
+ """ + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size") + self.verbose_error(res) + res = res.json() + assert isinstance(res, dict) + assert TenantId(res["id"]) == tenant_id + size = res["size"] + assert type(size) == int + # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model + # there are no tests for those right now. + return size + + def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> Dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", + json={ + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") + + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_detail( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + include_non_incremental_logical_size: bool = False, + include_non_incremental_physical_size: bool = False, + ) -> Dict[Any, Any]: + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_non_incremental_physical_size: + params["include-non-incremental-physical-size"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params=params, + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + self.is_testing_enabled_or_skip() + + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_get_lsn_by_timestamp( + self, tenant_id: TenantId, timeline_id: TimelineId, timestamp + ): + 
log.info( + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + ) + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def get_metrics(self) -> str: + res = self.get(f"http://localhost:{self.port}/metrics") + self.verbose_error(res) + return res.text + + +@dataclass +class PageserverPort: + pg: int + http: int + + +CREATE_TIMELINE_ID_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"^Created timeline '(?P[^']+)'", re.MULTILINE +) +TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE +) + + +class AbstractNeonCli(abc.ABC): + """ + A typed wrapper around an arbitrary Neon CLI tool. + Supports a way to run arbitrary command directly via CLI. + Do not use directly, use specific subclasses instead. + """ + + def __init__(self, env: NeonEnv): + self.env = env + + COMMAND: str = cast(str, None) # To be overwritten by the derived class. + + def raw_cli( + self, + arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, + check_return_code=True, + timeout=None, + ) -> "subprocess.CompletedProcess[str]": + """ + Run the command with the specified arguments. + + Arguments must be in list form, e.g. ['pg', 'create'] + + Return both stdout and stderr, which can be accessed as + + >>> result = env.neon_cli.raw_cli(...) + >>> assert result.stderr == "" + >>> log.info(result.stdout) + + If `check_return_code`, on non-zero exit code logs failure and raises. 
+ """ + + assert type(arguments) == list + assert type(self.COMMAND) == str + + bin_neon = str(self.env.neon_binpath / self.COMMAND) + + args = [bin_neon] + arguments + log.info('Running command "{}"'.format(" ".join(args))) + log.info(f'Running in "{self.env.repo_dir}"') + + env_vars = os.environ.copy() + env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) + env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir) + if self.env.rust_log_override is not None: + env_vars["RUST_LOG"] = self.env.rust_log_override + for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value + + # Pass coverage settings + var = "LLVM_PROFILE_FILE" + val = os.environ.get(var) + if val: + env_vars[var] = val + + # Intercept CalledProcessError and print more info + res = subprocess.run( + args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + ) + if not res.returncode: + log.info(f"Run success: {res.stdout}") + elif check_return_code: + # this way command output will be in recorded and shown in CI in failure message + msg = f"""\ + Run {res.args} failed: + stdout: {res.stdout} + stderr: {res.stderr} + """ + log.info(msg) + raise Exception(msg) from subprocess.CalledProcessError( + res.returncode, res.args, res.stdout, res.stderr + ) + return res + + +class NeonCli(AbstractNeonCli): + """ + A typed wrapper around the `neon` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + """ + + COMMAND = "neon_local" + + def create_tenant( + self, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + conf: Optional[Dict[str, str]] = None, + ) -> Tuple[TenantId, TimelineId]: + """ + Creates a new tenant, returns its id and its initial timeline's id. + """ + if tenant_id is None: + tenant_id = TenantId.generate() + if timeline_id is None: + timeline_id = TimelineId.generate() + if conf is None: + res = self.raw_cli( + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + self.env.pg_version, + ] + ) + else: + res = self.raw_cli( + [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + self.env.pg_version, + ] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) + res.check_returncode() + return tenant_id, timeline_id + + def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): + """ + Update tenant config. 
+ """ + if conf is None: + res = self.raw_cli(["tenant", "config", "--tenant-id", str(tenant_id)]) + else: + res = self.raw_cli( + ["tenant", "config", "--tenant-id", str(tenant_id)] + + sum(list(map(lambda kv: (["-c", kv[0] + ":" + kv[1]]), conf.items())), []) + ) + res.check_returncode() + + def list_tenants(self) -> "subprocess.CompletedProcess[str]": + res = self.raw_cli(["tenant", "list"]) + res.check_returncode() + return res + + def create_timeline( + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, + ) -> TimelineId: + cmd = [ + "timeline", + "create", + "--branch-name", + new_branch_name, + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + "--pg-version", + self.env.pg_version, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group("timeline_id") + + return TimelineId(str(created_timeline_id)) + + def create_root_branch( + self, + branch_name: str, + tenant_id: Optional[TenantId] = None, + ): + cmd = [ + "timeline", + "create", + "--branch-name", + branch_name, + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + "--pg-version", + self.env.pg_version, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group("timeline_id") + + if created_timeline_id is None: + raise Exception("could not find timeline id after `neon timeline create` invocation") + else: + return TimelineId(created_timeline_id) + + def create_branch( + self, + new_branch_name: str = DEFAULT_BRANCH_NAME, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[TenantId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> TimelineId: + cmd = [ + "timeline", + "branch", + "--branch-name", + new_branch_name, + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + ] + if ancestor_branch_name is not None: + cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) + if ancestor_start_lsn is not None: + cmd.extend(["--ancestor-start-lsn", str(ancestor_start_lsn)]) + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group("timeline_id") + + if created_timeline_id is None: + raise Exception("could not find timeline id after `neon timeline create` invocation") + else: + return TimelineId(str(created_timeline_id)) + + def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: + """ + Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
+ """ + + # main [b49f7954224a0ad25cc0013ea107b54b] + # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + res = self.raw_cli( + ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)] + ) + timelines_cli = sorted( + map( + lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout), + ) + ) + return timelines_cli + + def init( + self, + config_toml: str, + initial_timeline_id: Optional[TimelineId] = None, + ) -> "subprocess.CompletedProcess[str]": + with tempfile.NamedTemporaryFile(mode="w+") as tmp: + tmp.write(config_toml) + tmp.flush() + + cmd = ["init", f"--config={tmp.name}"] + if initial_timeline_id: + cmd.extend(["--timeline-id", str(initial_timeline_id)]) + + cmd.extend(["--pg-version", self.env.pg_version]) + + append_pageserver_param_overrides( + params_to_update=cmd, + remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, + pageserver_config_override=self.env.pageserver.config_override, + ) + + res = self.raw_cli(cmd) + res.check_returncode() + return res + + def pageserver_start( + self, + overrides: Tuple[str, ...] = (), + ) -> "subprocess.CompletedProcess[str]": + start_args = ["pageserver", "start", *overrides] + append_pageserver_param_overrides( + params_to_update=start_args, + remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, + pageserver_config_override=self.env.pageserver.config_override, + ) + + s3_env_vars = None + if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): + s3_env_vars = self.env.remote_storage.access_env_vars() + + return self.raw_cli(start_args, extra_env_vars=s3_env_vars) + + def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]": + cmd = ["pageserver", "stop"] + if immediate: + cmd.extend(["-m", "immediate"]) + + log.info(f"Stopping pageserver with {cmd}") + return self.raw_cli(cmd) + + def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]": + s3_env_vars = None + if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): + s3_env_vars = self.env.remote_storage.access_env_vars() + + return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars) + + def safekeeper_stop( + self, id: Optional[int] = None, immediate=False + ) -> "subprocess.CompletedProcess[str]": + args = ["safekeeper", "stop"] + if id is not None: + args.append(str(id)) + if immediate: + args.extend(["-m", "immediate"]) + return self.raw_cli(args) + + def pg_create( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[TenantId] = None, + lsn: Optional[Lsn] = None, + port: Optional[int] = None, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "pg", + "create", + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + "--branch-name", + branch_name, + "--pg-version", + self.env.pg_version, + ] + if lsn is not None: + args.extend(["--lsn", str(lsn)]) + if port is not None: + args.extend(["--port", str(port)]) + if node_name is not None: + args.append(node_name) + + res = self.raw_cli(args) + res.check_returncode() + return res + + def pg_start( + self, + node_name: str, + tenant_id: Optional[TenantId] = None, + lsn: Optional[Lsn] = None, + port: Optional[int] = None, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "pg", + "start", + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + "--pg-version", + 
self.env.pg_version, + ] + if lsn is not None: + args.append(f"--lsn={lsn}") + if port is not None: + args.append(f"--port={port}") + if node_name is not None: + args.append(node_name) + + res = self.raw_cli(args) + res.check_returncode() + return res + + def pg_stop( + self, + node_name: str, + tenant_id: Optional[TenantId] = None, + destroy=False, + check_return_code=True, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "pg", + "stop", + "--tenant-id", + str(tenant_id or self.env.initial_tenant), + ] + if destroy: + args.append("--destroy") + if node_name is not None: + args.append(node_name) + + return self.raw_cli(args, check_return_code=check_return_code) + + +class WalCraft(AbstractNeonCli): + """ + A typed wrapper around the `wal_craft` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + """ + + COMMAND = "wal_craft" + + def postgres_config(self) -> List[str]: + res = self.raw_cli(["print-postgres-config"]) + res.check_returncode() + return res.stdout.split("\n") + + def in_existing(self, type: str, connection: str) -> None: + res = self.raw_cli(["in-existing", type, connection]) + res.check_returncode() + + +class ComputeCtl(AbstractNeonCli): + """ + A typed wrapper around the `compute_ctl` CLI tool. + """ + + COMMAND = "compute_ctl" + + +class NeonPageserver(PgProtocol): + """ + An object representing a running pageserver. + """ + + TEMP_FILE_SUFFIX = "___temp" + + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): + super().__init__(host="localhost", port=port.pg, user="cloud_admin") + self.env = env + self.running = False + self.service_port = port + self.config_override = config_override + self.version = env.get_pageserver_version() + + def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver": + """ + Start the page server. + `overrides` allows to add some config to this pageserver start. + Returns self. + """ + assert self.running is False + + self.env.neon_cli.pageserver_start(overrides=overrides) + self.running = True + return self + + def stop(self, immediate: bool = False) -> "NeonPageserver": + """ + Stop the page server. + Returns self. 
+ """ + if self.running: + self.env.neon_cli.pageserver_stop(immediate) + self.running = False + return self + + def __enter__(self) -> "NeonPageserver": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop(immediate=True) + + def is_testing_enabled_or_skip(self): + if '"testing"' not in self.version: + pytest.skip("pageserver was built without 'testing' feature") + + def is_profiling_enabled_or_skip(self): + if '"profiling"' not in self.version: + pytest.skip("pageserver was built without 'profiling' feature") + + def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient: + return PageserverHttpClient( + port=self.service_port.http, + auth_token=auth_token, + is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, + ) + + +def append_pageserver_param_overrides( + params_to_update: List[str], + remote_storage: Optional[RemoteStorage], + remote_storage_users: RemoteStorageUsers, + pageserver_config_override: Optional[str] = None, +): + if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) + + params_to_update.append( + f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + ) + + env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") + if env_overrides is not None: + params_to_update += [ + f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") + ] + + if pageserver_config_override is not None: + params_to_update += [ + f"--pageserver-config-override={o.strip()}" + for o in pageserver_config_override.split(";") + ] + + +class PgBin: + """A helper class for executing postgres binaries""" + + def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: str): + self.log_dir = log_dir + self.pg_version = pg_version + self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin" + self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib" + self.env = os.environ.copy() + self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir) + + def _fixpath(self, command: List[str]): + if "/" not in str(command[0]): + command[0] = str(self.pg_bin_path / command[0]) + + def _build_env(self, env_add: Optional[Env]) -> Env: + if env_add is None: + return self.env + env = self.env.copy() + env.update(env_add) + return env + + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + """ + Run one of the postgres binaries. + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + + self._fixpath(command) + log.info(f"Running command '{' '.join(command)}'") + env = self._build_env(env) + subprocess.run(command, env=env, cwd=cwd, check=True) + + def run_capture( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[str] = None, + **kwargs: Any, + ) -> str: + """ + Run one of the postgres binaries, with stderr and stdout redirected to a file. + + This is just like `run`, but for chatty programs. Returns basepath for files + with captured output. 
+ """ + + self._fixpath(command) + log.info(f"Running command '{' '.join(command)}'") + env = self._build_env(env) + return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) + + +@pytest.fixture(scope="function") +def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: str) -> PgBin: + return PgBin(test_output_dir, pg_distrib_dir, pg_version) + + +class VanillaPostgres(PgProtocol): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): + super().__init__(host="localhost", port=port, dbname="postgres") + self.pgdatadir = pgdatadir + self.pg_bin = pg_bin + self.running = False + if init: + self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) + self.configure([f"port = {port}\n"]) + + def configure(self, options: List[str]): + """Append lines into postgresql.conf file.""" + assert not self.running + with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: + conf_file.write("\n".join(options)) + + def start(self, log_path: Optional[str] = None): + assert not self.running + self.running = True + + if log_path is None: + log_path = os.path.join(self.pgdatadir, "pg.log") + + self.pg_bin.run_capture( + ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] + ) + + def stop(self): + assert self.running + self.running = False + self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) + + def get_subdir_size(self, subdir) -> int: + """Return size of pgdatadir subdirectory in bytes.""" + return get_dir_size(os.path.join(self.pgdatadir, subdir)) + + def __enter__(self) -> "VanillaPostgres": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + if self.running: + self.stop() + + +@pytest.fixture(scope="function") +def vanilla_pg( + test_output_dir: Path, + port_distributor: PortDistributor, + pg_distrib_dir: Path, + pg_version: str, +) -> Iterator[VanillaPostgres]: + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + yield vanilla_pg + + +class RemotePostgres(PgProtocol): + def __init__(self, pg_bin: PgBin, remote_connstr: str): + super().__init__(**parse_dsn(remote_connstr)) + self.pg_bin = pg_bin + # The remote server is assumed to be running already + self.running = True + + def configure(self, options: List[str]): + raise Exception("cannot change configuration of remote Posgres instance") + + def start(self): + raise Exception("cannot start a remote Postgres instance") + + def stop(self): + raise Exception("cannot stop a remote Postgres instance") + + def get_subdir_size(self, subdir) -> int: + # TODO: Could use the server's Generic File Access functions if superuser. 
+ # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE + raise Exception("cannot get size of a Postgres instance") + + def __enter__(self) -> "RemotePostgres": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + # do nothing + pass + + +@pytest.fixture(scope="function") +def remote_pg( + test_output_dir: Path, pg_distrib_dir: Path, pg_version: str +) -> Iterator[RemotePostgres]: + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + + connstr = os.getenv("BENCHMARK_CONNSTR") + if connstr is None: + raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") + + with RemotePostgres(pg_bin, connstr) as remote_pg: + yield remote_pg + + +class PSQL: + """ + Helper class to make it easier to run psql in the proxy tests. + Copied and modified from PSQL from cloud/tests_e2e/common/psql.py + """ + + path: str + database_url: str + + def __init__( + self, + path: str = "psql", + host: str = "127.0.0.1", + port: int = 5432, + ): + assert shutil.which(path) + + self.path = path + self.database_url = f"postgres://{host}:{port}/main?options=project%3Dgeneric-project-name" + + async def run(self, query: Optional[str] = None) -> asyncio.subprocess.Process: + run_args = [self.path, "--no-psqlrc", "--quiet", "--tuples-only", self.database_url] + if query is not None: + run_args += ["--command", query] + + log.info(f"Run psql: {subprocess.list2cmdline(run_args)}") + return await asyncio.create_subprocess_exec( + *run_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"LC_ALL": "C", **os.environ}, # one locale to rule them all + ) + + +class NeonProxy(PgProtocol): + def __init__( + self, + proxy_port: int, + http_port: int, + neon_binpath: Path, + auth_endpoint=None, + mgmt_port=None, + ): + super().__init__(dsn=auth_endpoint, port=proxy_port) + self.host = "127.0.0.1" + self.http_port = http_port + self.neon_binpath = neon_binpath + self.proxy_port = proxy_port + self.mgmt_port = mgmt_port + self.auth_endpoint = auth_endpoint + self._popen: Optional[subprocess.Popen[bytes]] = None + self.link_auth_uri_prefix = "http://dummy-uri" + + def start(self): + """ + Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though '--auth-endpoint '." + """ + assert self._popen is None + assert self.auth_endpoint is not None + + # Start proxy + args = [ + str(self.neon_binpath / "proxy"), + *["--http", f"{self.host}:{self.http_port}"], + *["--proxy", f"{self.host}:{self.proxy_port}"], + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.auth_endpoint], + ] + self._popen = subprocess.Popen(args) + self._wait_until_ready() + + def start_with_link_auth(self): + """ + Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." 
+ """ + assert self._popen is None + + # Start proxy + bin_proxy = str(self.neon_binpath / "proxy") + args = [bin_proxy] + args.extend(["--http", f"{self.host}:{self.http_port}"]) + args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) + args.extend(["--mgmt", f"{self.host}:{self.mgmt_port}"]) + args.extend(["--auth-backend", "link"]) + args.extend(["--uri", self.link_auth_uri_prefix]) + arg_str = " ".join(args) + log.info(f"starting proxy with command line ::: {arg_str}") + self._popen = subprocess.Popen(args, stdout=subprocess.PIPE) + self._wait_until_ready() + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + requests.get(f"http://{self.host}:{self.http_port}/v1/status") + + def get_metrics(self) -> str: + request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") + request_result.raise_for_status() + return request_result.text + + def __enter__(self) -> "NeonProxy": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + if self._popen is not None: + # NOTE the process will die when we're done with tests anyway, because + # it's a child process. This is mostly to clean up in between different tests. + self._popen.kill() + + +@pytest.fixture(scope="function") +def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]: + """Neon proxy that routes through link auth.""" + http_port = port_distributor.get_port() + proxy_port = port_distributor.get_port() + mgmt_port = port_distributor.get_port() + with NeonProxy(proxy_port, http_port, neon_binpath=neon_binpath, mgmt_port=mgmt_port) as proxy: + proxy.start_with_link_auth() + yield proxy + + +@pytest.fixture(scope="function") +def static_proxy( + vanilla_pg: VanillaPostgres, port_distributor: PortDistributor, neon_binpath: Path +) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" + + # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql` + vanilla_pg.start() + vanilla_pg.safe_psql("create user proxy with login superuser password 'password'") + + port = vanilla_pg.default_options["port"] + host = vanilla_pg.default_options["host"] + dbname = vanilla_pg.default_options["dbname"] + auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}" + + proxy_port = port_distributor.get_port() + http_port = port_distributor.get_port() + + with NeonProxy( + proxy_port=proxy_port, + http_port=http_port, + neon_binpath=neon_binpath, + auth_endpoint=auth_endpoint, + ) as proxy: + proxy.start() + yield proxy + + +class Postgres(PgProtocol): + """An object representing a running postgres daemon.""" + + def __init__( + self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True + ): + super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") + self.env = env + self.running = False + self.node_name: Optional[str] = None # dubious, see asserts below + self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA + self.tenant_id = tenant_id + self.port = port + self.check_stop_result = check_stop_result + # path to conf is /pgdatadirs/tenants///postgresql.conf + + def create( + self, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[Lsn] = None, + config_lines: Optional[List[str]] = None, + ) -> "Postgres": + """ + Create the pg data directory. + Returns self. 
+ """ + + if not config_lines: + config_lines = [] + + self.node_name = node_name or f"{branch_name}_pg_node" + self.env.neon_cli.pg_create( + branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port + ) + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name + self.pgdata_dir = os.path.join(self.env.repo_dir, path) + + if config_lines is None: + config_lines = [] + + # set small 'max_replication_write_lag' to enable backpressure + # and make tests more stable. + config_lines = ["max_replication_write_lag=15MB"] + config_lines + self.config(config_lines) + + return self + + def start(self) -> "Postgres": + """ + Start the Postgres instance. + Returns self. + """ + + assert self.node_name is not None + + log.info(f"Starting postgres node {self.node_name}") + + self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port) + self.running = True + + return self + + def pg_data_dir_path(self) -> str: + """Path to data directory""" + assert self.node_name + path = Path("pgdatadirs") / "tenants" / str(self.tenant_id) / self.node_name + return os.path.join(self.env.repo_dir, path) + + def pg_xact_dir_path(self) -> str: + """Path to pg_xact dir""" + return os.path.join(self.pg_data_dir_path(), "pg_xact") + + def pg_twophase_dir_path(self) -> str: + """Path to pg_twophase dir""" + return os.path.join(self.pg_data_dir_path(), "pg_twophase") + + def config_file_path(self) -> str: + """Path to postgresql.conf""" + return os.path.join(self.pg_data_dir_path(), "postgresql.conf") + + def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres": + """ + Adjust instance config for working with wal acceptors instead of + pageserver (pre-configured by CLI) directly. + """ + + # TODO: reuse config() + with open(self.config_file_path(), "r") as f: + cfg_lines = f.readlines() + with open(self.config_file_path(), "w") as f: + for cfg_line in cfg_lines: + # walproposer uses different application_name + if ( + "synchronous_standby_names" in cfg_line + or + # don't repeat safekeepers/wal_acceptors multiple times + "neon.safekeepers" in cfg_line + ): + continue + f.write(cfg_line) + f.write("synchronous_standby_names = 'walproposer'\n") + f.write("neon.safekeepers = '{}'\n".format(safekeepers)) + return self + + def config(self, lines: List[str]) -> "Postgres": + """ + Add lines to postgresql.conf. + Lines should be an array of valid postgresql.conf rows. + Returns self. + """ + + with open(self.config_file_path(), "a") as conf: + for line in lines: + conf.write(line) + conf.write("\n") + + return self + + def stop(self) -> "Postgres": + """ + Stop the Postgres instance if it's running. + Returns self. + """ + + if self.running: + assert self.node_name is not None + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, check_return_code=self.check_stop_result + ) + self.running = False + + return self + + def stop_and_destroy(self) -> "Postgres": + """ + Stop the Postgres instance, then destroy it. + Returns self. + """ + + assert self.node_name is not None + self.env.neon_cli.pg_stop( + self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result + ) + self.node_name = None + self.running = False + + return self + + def create_start( + self, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[Lsn] = None, + config_lines: Optional[List[str]] = None, + ) -> "Postgres": + """ + Create a Postgres instance, apply config + and then start it. + Returns self. 
+ """ + + started_at = time.time() + + self.create( + branch_name=branch_name, + node_name=node_name, + config_lines=config_lines, + lsn=lsn, + ).start() + + log.info(f"Postgres startup took {time.time() - started_at} seconds") + + return self + + def __enter__(self) -> "Postgres": + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop() + + +class PostgresFactory: + """An object representing multiple running postgres daemons.""" + + def __init__(self, env: NeonEnv): + self.env = env + self.num_instances: int = 0 + self.instances: List[Postgres] = [] + + def create_start( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[TenantId] = None, + lsn: Optional[Lsn] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: + + pg = Postgres( + self.env, + tenant_id=tenant_id or self.env.initial_tenant, + port=self.env.port_distributor.get_port(), + ) + self.num_instances += 1 + self.instances.append(pg) + + return pg.create_start( + branch_name=branch_name, + node_name=node_name, + config_lines=config_lines, + lsn=lsn, + ) + + def create( + self, + branch_name: str, + node_name: Optional[str] = None, + tenant_id: Optional[TenantId] = None, + lsn: Optional[Lsn] = None, + config_lines: Optional[List[str]] = None, + ) -> Postgres: + + pg = Postgres( + self.env, + tenant_id=tenant_id or self.env.initial_tenant, + port=self.env.port_distributor.get_port(), + ) + + self.num_instances += 1 + self.instances.append(pg) + + return pg.create( + branch_name=branch_name, + node_name=node_name, + lsn=lsn, + config_lines=config_lines, + ) + + def stop_all(self) -> "PostgresFactory": + for pg in self.instances: + pg.stop() + + return self + + +@dataclass +class SafekeeperPort: + pg: int + http: int + + +@dataclass +class Safekeeper: + """An object representing a running safekeeper daemon.""" + + env: NeonEnv + port: SafekeeperPort + id: int + running: bool = False + + def start(self) -> "Safekeeper": + assert self.running is False + self.env.neon_cli.safekeeper_start(self.id) + self.running = True + # wait for wal acceptor start by checking its status + started_at = time.time() + while True: + try: + with self.http_client() as http_cli: + http_cli.check_status() + except Exception as e: + elapsed = time.time() - started_at + if elapsed > 3: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}" + ) + time.sleep(0.5) + else: + break # success + return self + + def stop(self, immediate: bool = False) -> "Safekeeper": + log.info("Stopping safekeeper {}".format(self.id)) + self.env.neon_cli.safekeeper_stop(self.id, immediate) + self.running = False + return self + + def append_logical_message( + self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] + ) -> Dict[str, Any]: + """ + Send JSON_CTRL query to append LogicalMessage to WAL and modify + safekeeper state. It will construct LogicalMessage from provided + prefix and message, and then will write it to WAL. 
+ """ + + # "replication=0" hacks psycopg not to send additional queries + # on startup, see https://github.com/psycopg/psycopg2/pull/482 + connstr = f"host=localhost port={self.port.pg} replication=0 options='-c timeline_id={timeline_id} tenant_id={tenant_id}'" + + with closing(psycopg2.connect(connstr)) as conn: + # server doesn't support transactions + conn.autocommit = True + with conn.cursor() as cur: + request_json = json.dumps(request) + log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") + cur.execute("JSON_CTRL " + request_json) + all = cur.fetchall() + log.info(f"JSON_CTRL response: {all[0][0]}") + res = json.loads(all[0][0]) + assert isinstance(res, dict) + return res + + def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: + return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) + + def data_dir(self) -> str: + return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") + + +@dataclass +class SafekeeperTimelineStatus: + acceptor_epoch: int + pg_version: int + flush_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + remote_consistent_lsn: Lsn + + +@dataclass +class SafekeeperMetrics: + # These are metrics from Prometheus which uses float64 internally. + # As a consequence, values may differ from real original int64s. + flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + +class SafekeeperHttpClient(requests.Session): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None): + super().__init__() + self.port = port + self.auth_token = auth_token + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def timeline_create( + self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + + def timeline_status( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> SafekeeperTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + resj = res.json() + return SafekeeperTimelineStatus( + acceptor_epoch=resj["acceptor_state"]["epoch"], + pg_version=resj["pg_info"]["pg_version"], + flush_lsn=Lsn(resj["flush_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + ) + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() + + def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + 
res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def get_metrics_str(self) -> str: + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + all_metrics_text = self.get_metrics_str() + + metrics = SafekeeperMetrics() + for match in re.finditer( + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) + for match in re.finditer( + r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.commit_lsn_inexact[ + (TenantId(match.group(1)), TimelineId(match.group(2))) + ] = int(match.group(3)) + return metrics + + +@dataclass +class Etcd: + """An object managing etcd instance""" + + datadir: str + port: int + peer_port: int + binary_path: Path = field(init=False) + handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + + def __post_init__(self): + self.binary_path = etcd_path() + + def client_url(self): + return f"http://127.0.0.1:{self.port}" + + def check_status(self): + with requests.Session() as s: + s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry + s.get(f"{self.client_url()}/health").raise_for_status() + + def try_start(self): + if self.handle is not None: + log.debug(f"etcd is already running on port {self.port}") + return + + Path(self.datadir).mkdir(exist_ok=True) + + if not self.binary_path.is_file(): + raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") + + client_url = self.client_url() + log.info(f'Starting etcd to listen incoming connections at "{client_url}"') + with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: + args = [ + self.binary_path, + f"--data-dir={self.datadir}", + f"--listen-client-urls={client_url}", + f"--advertise-client-urls={client_url}", + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", + # Set --quota-backend-bytes to keep the etcd virtual memory + # size smaller. Our test etcd clusters are very small. + # See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000", + ] + self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) + + # wait for start + started_at = time.time() + while True: + try: + self.check_status() + except Exception as e: + elapsed = time.time() - started_at + if elapsed > 5: + raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}") + time.sleep(0.5) + else: + break # success + + def stop(self): + if self.handle is not None: + self.handle.terminate() + self.handle.wait() + + +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """Compute the working directory for an individual test.""" + test_name = request.node.name + test_dir = top_output_dir / test_name.replace("/", "-") + log.info(f"get_test_output_dir is {test_dir}") + # make mypy happy + assert isinstance(test_dir, Path) + return test_dir + + +# This is autouse, so the test output directory always gets created, even +# if a test doesn't put anything there. It also solves a problem with the +# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it +# creates the repo in the test output directory. 
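# Sketch: standing up the embedded etcd broker helper above on spare ports
# (datadir and ports are supplied by the caller; values are illustrative):
def example_etcd_roundtrip(datadir: Path, port: int, peer_port: int):
    broker = Etcd(datadir=str(datadir / "etcd"), port=port, peer_port=peer_port)
    broker.try_start()
    try:
        broker.check_status()
    finally:
        broker.stop()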
But it cannot depend on +# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, +# it has 'session' scope and cannot access fixtures with 'function' +# scope. So it uses the get_test_output_dir() function to get the path, and +# this fixture ensures that the directory exists. That works because +# 'autouse' fixtures are run before other fixtures. +@pytest.fixture(scope="function", autouse=True) +def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: + """Create the working directory for an individual test.""" + + # one directory per test + test_dir = get_test_output_dir(request, top_output_dir) + log.info(f"test_output_dir is {test_dir}") + shutil.rmtree(test_dir, ignore_errors=True) + test_dir.mkdir() + + yield test_dir + + allure_attach_from_dir(test_dir) + + +SKIP_DIRS = frozenset( + ( + "pg_wal", + "pg_stat", + "pg_stat_tmp", + "pg_subtrans", + "pg_logical", + "pg_replslot/wal_proposer_slot", + ) +) + +SKIP_FILES = frozenset( + ( + "pg_internal.init", + "pg.log", + "zenith.signal", + "postgresql.conf", + "postmaster.opts", + "postmaster.pid", + "pg_control", + ) +) + + +def should_skip_dir(dirname: str) -> bool: + return dirname in SKIP_DIRS + + +def should_skip_file(filename: str) -> bool: + if filename in SKIP_FILES: + return True + # check for temp table files according to https://www.postgresql.org/docs/current/storage-file-layout.html + # i e "tBBB_FFF" + if not filename.startswith("t"): + return False + + tmp_name = filename[1:].split("_") + if len(tmp_name) != 2: + return False + + try: + list(map(int, tmp_name)) + except: # noqa: E722 + return False + return True + + +# +# Test helpers +# +def list_files_to_compare(pgdata_dir: Path) -> List[str]: + pgdata_files = [] + for root, _file, filenames in os.walk(pgdata_dir): + for filename in filenames: + rel_dir = os.path.relpath(root, pgdata_dir) + # Skip some dirs and files we don't want to compare + if should_skip_dir(rel_dir) or should_skip_file(filename): + continue + rel_file = os.path.join(rel_dir, filename) + pgdata_files.append(rel_file) + + pgdata_files.sort() + log.info(pgdata_files) + return pgdata_files + + +# pg is the existing and running compute node, that we want to compare with a basebackup +def check_restored_datadir_content( + test_output_dir: Path, + env: NeonEnv, + pg: Postgres, +): + # Get the timeline ID. We need it for the 'basebackup' command + timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0]) + + # stop postgres to ensure that files won't change + pg.stop() + + # Take a basebackup from pageserver + restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" + restored_dir_path.mkdir(exist_ok=True) + + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + psql_path = os.path.join(pg_bin.pg_bin_path, "psql") + + cmd = rf""" + {psql_path} \ + --no-psqlrc \ + postgres://localhost:{env.pageserver.service_port.pg} \ + -c 'basebackup {pg.tenant_id} {timeline}' \ + | tar -x -C {restored_dir_path} + """ + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. + psql_env = {"LD_LIBRARY_PATH": pg_bin.pg_lib_dir} + result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) + + # Print captured stdout/stderr if basebackup cmd failed. 
+ if result.returncode != 0: + log.error("Basebackup shell command failed with:") + log.error(result.stdout) + log.error(result.stderr) + assert result.returncode == 0 + + # list files we're going to compare + assert pg.pgdata_dir + pgdata_files = list_files_to_compare(Path(pg.pgdata_dir)) + restored_files = list_files_to_compare(restored_dir_path) + + # check that file sets are equal + assert pgdata_files == restored_files + + # compare content of the files + # filecmp returns (match, mismatch, error) lists + # We've already filtered all mismatching files in list_files_to_compare(), + # so here expect that the content is identical + (match, mismatch, error) = filecmp.cmpfiles( + pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False + ) + log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}") + + for f in mismatch: + + f1 = os.path.join(pg.pgdata_dir, f) + f2 = os.path.join(restored_dir_path, f) + stdout_filename = "{}.filediff".format(f2) + + with open(stdout_filename, "w") as stdout_f: + subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) + subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + + cmd = "diff {}.hex {}.hex".format(f1, f2) + subprocess.run([cmd], stdout=stdout_f, shell=True) + + assert (mismatch, error) == ([], []) + + +def assert_no_in_progress_downloads_for_tenant( + pageserver_http_client: PageserverHttpClient, + tenant: TenantId, +): + tenant_status = pageserver_http_client.tenant_status(tenant) + assert tenant_status["has_in_progress_downloads"] is False, tenant_status + + +def remote_consistent_lsn( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + lsn_str = detail["remote_consistent_lsn"] + if lsn_str is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it has been uploaded to remote + # storage yet. 
+ return Lsn(0) + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_upload( + pageserver_http_client: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +): + """waits for local timeline upload up to specified lsn""" + for i in range(20): + current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return + log.info( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn, current_lsn + ) + ) + + +def last_record_lsn( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + lsn_str = detail["last_record_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_last_record_lsn( + pageserver_http_client: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +) -> Lsn: + """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + for i in range(10): + current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) + if current_lsn >= lsn: + return current_lsn + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + ) + + +def wait_for_last_flush_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + +def fork_at_current_lsn( + env: NeonEnv, + pg: Postgres, + new_branch_name: str, + ancestor_branch_name: str, + tenant_id: Optional[TenantId] = None, +) -> TimelineId: + """ + Create new branch at the last LSN of an existing branch. + The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the + the WAL up to that LSN to arrive in the pageserver before creating the branch. 
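# Sketch: the usual "write, wait for the pageserver, wait for the upload"
# pattern built from the helpers above; assumes remote storage is configured
# and the table already exists (the SQL is illustrative):
def example_wait_for_durability(
    env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId
):
    pg.safe_psql("INSERT INTO t SELECT generate_series(1, 1000)")
    flush_lsn = wait_for_last_flush_lsn(env, pg, tenant, timeline)
    wait_for_upload(env.pageserver.http_client(), tenant, timeline, flush_lsn)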
+ """ + current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] + return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py new file mode 100644 index 0000000000..adb3a7730e --- /dev/null +++ b/test_runner/fixtures/pg_stats.py @@ -0,0 +1,60 @@ +from functools import cached_property +from typing import List + +import pytest + + +class PgStatTable: + table: str + columns: List[str] + additional_query: str + + def __init__(self, table: str, columns: List[str], filter_query: str = ""): + self.table = table + self.columns = columns + self.additional_query = filter_query + + @cached_property + def query(self) -> str: + return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" + + +@pytest.fixture(scope="function") +def pg_stats_rw() -> List[PgStatTable]: + return [ + PgStatTable( + "pg_stat_database", + ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'", + ), + ] + + +@pytest.fixture(scope="function") +def pg_stats_ro() -> List[PgStatTable]: + return [ + PgStatTable( + "pg_stat_database", ["tup_returned", "tup_fetched"], "WHERE datname='postgres'" + ), + ] + + +@pytest.fixture(scope="function") +def pg_stats_wo() -> List[PgStatTable]: + return [ + PgStatTable( + "pg_stat_database", + ["tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'", + ), + ] + + +@pytest.fixture(scope="function") +def pg_stats_wal() -> List[PgStatTable]: + return [ + PgStatTable( + "pg_stat_wal", + ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], + ) + ] diff --git a/test_runner/fixtures/slow.py b/test_runner/fixtures/slow.py index c20b766a93..ae0e87b553 100644 --- a/test_runner/fixtures/slow.py +++ b/test_runner/fixtures/slow.py @@ -1,4 +1,9 @@ +from typing import Any, List + import pytest +from _pytest.config import Config +from _pytest.config.argparsing import Parser + """ This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow tests are excluded. They need to be specifically requested with the --runslow flag in @@ -8,15 +13,15 @@ Copied from here: https://docs.pytest.org/en/latest/example/simple.html """ -def pytest_addoption(parser): +def pytest_addoption(parser: Parser): parser.addoption("--runslow", action="store_true", default=False, help="run slow tests") -def pytest_configure(config): +def pytest_configure(config: Config): config.addinivalue_line("markers", "slow: mark test as slow to run") -def pytest_collection_modifyitems(config, items): +def pytest_collection_modifyitems(config: Config, items: List[Any]): if config.getoption("--runslow"): # --runslow given in cli: do not skip slow tests return diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py new file mode 100644 index 0000000000..2bb962d44a --- /dev/null +++ b/test_runner/fixtures/types.py @@ -0,0 +1,95 @@ +import random +from functools import total_ordering +from typing import Any, Type, TypeVar, Union + +T = TypeVar("T", bound="Id") + + +@total_ordering +class Lsn: + """ + Datatype for an LSN. Internally it is a 64-bit integer, but the string + representation is like "1/123abcd". 
See also pg_lsn datatype in Postgres + """ + + def __init__(self, x: Union[int, str]): + if isinstance(x, int): + self.lsn_int = x + else: + """Convert lsn from hex notation to int.""" + l, r = x.split("/") + self.lsn_int = (int(l, 16) << 32) + int(r, 16) + assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF + + def __str__(self) -> str: + """Convert lsn from int to standard hex notation.""" + return f"{(self.lsn_int >> 32):X}/{(self.lsn_int & 0xFFFFFFFF):X}" + + def __repr__(self) -> str: + return f'Lsn("{str(self)}")' + + def __int__(self) -> int: + return self.lsn_int + + def __lt__(self, other: Any) -> bool: + if not isinstance(other, Lsn): + return NotImplemented + return self.lsn_int < other.lsn_int + + def __eq__(self, other: Any) -> bool: + if not isinstance(other, Lsn): + return NotImplemented + return self.lsn_int == other.lsn_int + + # Returns the difference between two Lsns, in bytes + def __sub__(self, other: Any) -> int: + if not isinstance(other, Lsn): + return NotImplemented + return self.lsn_int - other.lsn_int + + def __hash__(self) -> int: + return hash(self.lsn_int) + + +@total_ordering +class Id: + """ + Datatype for a Neon tenant and timeline IDs. Internally it's a 16-byte array, and + the string representation is in hex. This corresponds to the Id / TenantId / + TimelineIds in the Rust code. + """ + + def __init__(self, x: str): + self.id = bytearray.fromhex(x) + assert len(self.id) == 16 + + def __str__(self) -> str: + return self.id.hex() + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id < other.id + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self.id == other.id + + def __hash__(self) -> int: + return hash(str(self.id)) + + @classmethod + def generate(cls: Type[T]) -> T: + """Generate a random ID""" + return cls(random.randbytes(16).hex()) + + +class TenantId(Id): + def __repr__(self) -> str: + return f'`TenantId("{self.id.hex()}")' + + +class TimelineId(Id): + def __repr__(self) -> str: + return f'TimelineId("{self.id.hex()}")' diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 236c225bfb..506fe6f9da 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,29 +1,27 @@ +import contextlib import os +import re +import shutil import subprocess +import tarfile +import time +from pathlib import Path +from typing import Any, Callable, Dict, List, Tuple, TypeVar -from typing import Any, List +import allure # type: ignore from fixtures.log_helper import log +from psycopg2.extensions import cursor + +Fn = TypeVar("Fn", bound=Callable[..., Any]) -def get_self_dir() -> str: - """ Get the path to the directory where this script lives. """ - return os.path.dirname(os.path.abspath(__file__)) +def get_self_dir() -> Path: + """Get the path to the directory where this script lives.""" + return Path(__file__).resolve().parent -def mkdir_if_needed(path: str) -> None: - """ Create a directory if it doesn't already exist - - Note this won't try to create intermediate directories. 
- """ - try: - os.mkdir(path) - except FileExistsError: - pass - assert os.path.isdir(path) - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """ Run a process and capture its output +def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str: + """Run a process and capture its output Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" where "cmd" is the name of the program and NNN is an incrementing @@ -32,16 +30,22 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. """ - assert type(cmd) is list - base = os.path.basename(cmd[0]) + '_{}'.format(global_counter()) + assert isinstance(cmd, list) + base = f"{os.path.basename(cmd[0])}_{global_counter()}" basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + '.stdout' - stderr_filename = basepath + '.stderr' + stdout_filename = f"{basepath}.stdout" + stderr_filename = f"{basepath}.stderr" - with open(stdout_filename, 'w') as stdout_f: - with open(stderr_filename, 'w') as stderr_f: - log.info('(capturing output to "{}.stdout")'.format(base)) - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + try: + with open(stdout_filename, "w") as stdout_f: + with open(stderr_filename, "w") as stderr_f: + log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"') + subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) + finally: + # Remove empty files if there is no output + for filename in (stdout_filename, stderr_filename): + if os.stat(filename).st_size == 0: + os.remove(filename) return basepath @@ -50,7 +54,7 @@ _global_counter = 0 def global_counter() -> int: - """ A really dumb global counter. + """A really dumb global counter. This is useful for giving output files a unique number, so if we run the same command multiple times we can keep their output separate. @@ -60,22 +64,182 @@ def global_counter() -> int: return _global_counter -def lsn_to_hex(num: int) -> str: - """ Convert lsn from int to standard hex notation. """ - return "{:X}/{:X}".format(num >> 32, num & 0xffffffff) - - -def lsn_from_hex(lsn_hex: str) -> int: - """ Convert lsn from hex notation to int. 
""" - l, r = lsn_hex.split('/') - return (int(l, 16) << 32) + int(r, 16) - - -def print_gc_result(row): +def print_gc_result(row: Dict[str, Any]): log.info("GC duration {elapsed} ms".format_map(row)) log.info( - " REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}" - .format_map(row)) - log.info( - " NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}" - .format_map(row)) + " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}" + " needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}".format_map( + row + ) + ) + + +def etcd_path() -> Path: + path_output = shutil.which("etcd") + if path_output is None: + raise RuntimeError("etcd not found in PATH") + return Path(path_output) + + +def query_scalar(cur: cursor, query: str) -> Any: + """ + It is a convenience wrapper to avoid repetitions + of cur.execute(); cur.fetchone()[0] + + And this is mypy friendly, because without None + check mypy says that Optional is not indexable. + """ + cur.execute(query) + var = cur.fetchone() + assert var is not None + return var[0] + + +# Traverse directory to get total size. +def get_dir_size(path: str) -> int: + """Return size in bytes.""" + totalbytes = 0 + for root, dirs, files in os.walk(path): + for name in files: + try: + totalbytes += os.path.getsize(os.path.join(root, name)) + except FileNotFoundError: + pass # file could be concurrently removed + + return totalbytes + + +def get_timeline_dir_size(path: Path) -> int: + """Get the timeline directory's total size, which only counts the layer files' size.""" + sz = 0 + for dir_entry in path.iterdir(): + with contextlib.suppress(Exception): + # file is an image layer + _ = parse_image_layer(dir_entry.name) + sz += dir_entry.stat().st_size + continue + + with contextlib.suppress(Exception): + # file is a delta layer + _ = parse_delta_layer(dir_entry.name) + sz += dir_entry.stat().st_size + return sz + + +def parse_image_layer(f_name: str) -> Tuple[int, int, int]: + """Parse an image layer file name. Return key start, key end, and snapshot lsn""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) + + +def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: + """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" + parts = f_name.split("__") + key_parts = parts[0].split("-") + lsn_parts = parts[1].split("-") + return ( + int(key_parts[0], 16), + int(key_parts[1], 16), + int(lsn_parts[0], 16), + int(lsn_parts[1], 16), + ) + + +def get_scale_for_db(size_mb: int) -> int: + """Returns pgbench scale factor for given target db size in MB. 
+ + Ref https://www.cybertec-postgresql.com/en/a-formula-to-calculate-pgbench-scaling-factor-for-target-db-size/ + """ + + return round(0.06689 * size_mb - 0.5) + + +ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] + r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)" +) + + +def allure_attach_from_dir(dir: Path): + """Attach all non-empty files from `dir` that matches `ATTACHMENT_NAME_REGEX` to Allure report""" + + for attachment in Path(dir).glob("**/*"): + if ATTACHMENT_NAME_REGEX.fullmatch(attachment.name) and attachment.stat().st_size > 0: + source = str(attachment) + name = str(attachment.relative_to(dir)) + + # compress files larger than 1Mb, they're hardly readable in a browser + if attachment.stat().st_size > 1024 * 1024: + source = f"{attachment}.tar.gz" + with tarfile.open(source, "w:gz") as tar: + tar.add(attachment, arcname=attachment.name) + name = f"{name}.tar.gz" + + if source.endswith(".tar.gz"): + attachment_type = "application/gzip" + extension = "tar.gz" + elif source.endswith(".svg"): + attachment_type = "image/svg+xml" + extension = "svg" + elif source.endswith(".html"): + attachment_type = "text/html" + extension = "html" + else: + attachment_type = "text/plain" + extension = attachment.suffix.removeprefix(".") + + allure.attach.file(source, name, attachment_type, extension) + + +def start_in_background( + command: list[str], cwd: Path, log_file_name: str, is_started: Fn +) -> subprocess.Popen[bytes]: + """Starts a process, creates the logfile and redirects stderr and stdout there. Runs the start checks before the process is started, or errors.""" + + log.info(f'Running command "{" ".join(command)}"') + + with open(cwd / log_file_name, "wb") as log_file: + spawned_process = subprocess.Popen(command, stdout=log_file, stderr=log_file, cwd=cwd) + error = None + try: + return_code = spawned_process.poll() + if return_code is not None: + error = f"expected subprocess to run but it exited with code {return_code}" + else: + attempts = 10 + try: + wait_until( + number_of_iterations=attempts, + interval=1, + func=is_started, + ) + except Exception: + error = f"Failed to get correct status from subprocess in {attempts} attempts" + except Exception as e: + error = f"expected subprocess to start but it failed with exception: {e}" + + if error is not None: + log.error(error) + spawned_process.kill() + raise Exception(f"Failed to run subprocess as {command}, reason: {error}") + + log.info("subprocess spawned") + return spawned_process + + +def wait_until(number_of_iterations: int, interval: float, func: Fn): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. 
+ """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py deleted file mode 100644 index b4b3de1db3..0000000000 --- a/test_runner/fixtures/zenith_fixtures.py +++ /dev/null @@ -1,1727 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -import textwrap -from cached_property import cached_property -import asyncpg -import os -import boto3 -import pathlib -import uuid -import warnings -import jwt -import json -import psycopg2 -import pytest -import re -import shutil -import socket -import subprocess -import time -import filecmp -import tempfile - -from contextlib import closing -from pathlib import Path -from dataclasses import dataclass - -# Type-related stuff -from psycopg2.extensions import connection as PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple -from typing_extensions import Literal -import pytest - -import requests -import backoff # type: ignore - -from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture) -from fixtures.log_helper import log -""" -This file contains pytest fixtures. A fixture is a test resource that can be -summoned by placing its name in the test's arguments. - -A fixture is created with the decorator @zenfixture, which is a wrapper around -the standard pytest.fixture with some extra behavior. - -There are several environment variables that can control the running of tests: -ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. - -There's no need to import this file to use it. It should be declared as a plugin -inside conftest.py, and that makes it available to all tests. - -Don't import functions from this file, or pytest will emit warnings. Instead -put directly-importable functions into utils.py or another separate file. -""" - -Env = Dict[str, str] -Fn = TypeVar('Fn', bound=Callable[..., Any]) - -DEFAULT_OUTPUT_DIR = 'test_output' -DEFAULT_POSTGRES_DIR = 'tmp_install' - -BASE_PORT = 15000 -WORKER_PORT_NUM = 100 - - -def pytest_addoption(parser): - parser.addoption( - "--skip-interfering-proc-check", - dest="skip_interfering_proc_check", - action="store_true", - help="skip check for interferring processes", - ) - - -# These are set in pytest_configure() -base_dir = "" -zenith_binpath = "" -pg_distrib_dir = "" -top_output_dir = "" - - -def check_interferring_processes(config): - if config.getoption("skip_interfering_proc_check"): - warnings.warn("interferring process check is skipped") - return - - # does not use -c as it is not supported on macOS - cmd = ['pgrep', 'pageserver|postgres|safekeeper'] - result = subprocess.run(cmd, stdout=subprocess.DEVNULL) - if result.returncode == 0: - # returncode of 0 means it found something. - # This is bad; we don't want any of those processes polluting the - # result of the test. - # NOTE this shows as an internal pytest error, there might be a better way - raise Exception( - 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' - ) - - -def pytest_configure(config): - """ - Ensure that no unwanted daemons are running before we start testing. 
- Check that we do not owerflow available ports range. - """ - check_interferring_processes(config) - - numprocesses = config.getoption('numprocesses') - if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports - raise Exception('Too many workers configured. Cannot distribute ports for services.') - - # find the base directory (currently this is the git root) - global base_dir - base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..')) - log.info(f'base_dir is {base_dir}') - - # Compute the top-level directory for all tests. - global top_output_dir - env_test_output = os.environ.get('TEST_OUTPUT') - if env_test_output is not None: - top_output_dir = env_test_output - else: - top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - mkdir_if_needed(top_output_dir) - - if os.getenv("REMOTE_ENV"): - # we are in remote env and do not have zenith binaries locally - # this is the case for benchmarks run on self-hosted runner - return - # Find the zenith binaries. - global zenith_binpath - env_zenith_bin = os.environ.get('ZENITH_BIN') - if env_zenith_bin: - zenith_binpath = env_zenith_bin - else: - zenith_binpath = os.path.join(base_dir, 'target/debug') - log.info(f'zenith_binpath is {zenith_binpath}') - if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): - raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) - - # Find the postgres installation. - global pg_distrib_dir - env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR') - if env_postgres_bin: - pg_distrib_dir = env_postgres_bin - else: - pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR)) - log.info(f'pg_distrib_dir is {pg_distrib_dir}') - if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')): - raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) - - -def zenfixture(func: Fn) -> Fn: - """ - This is a python decorator for fixtures with a flexible scope. - - By default every test function will set up and tear down a new - database. In pytest, this is called fixtures "function" scope. - - If the environment variable TEST_SHARED_FIXTURES is set, then all - tests will share the same database. State, logs, etc. will be - stored in a directory called "shared". 
- """ - - scope: Literal['session', 'function'] = \ - 'function' if os.environ.get('TEST_SHARED_FIXTURES') is None else 'session' - - return pytest.fixture(func, scope=scope) - - -@zenfixture -def worker_seq_no(worker_id: str): - # worker_id is a pytest-xdist fixture - # it can be master or gw - # parse it to always get a number - if worker_id == 'master': - return 0 - assert worker_id.startswith('gw') - return int(worker_id[2:]) - - -@zenfixture -def worker_base_port(worker_seq_no: int): - # so we divide ports in ranges of 100 ports - # so workers have disjoint set of ports for services - return BASE_PORT + worker_seq_no * WORKER_PORT_NUM - - -def get_dir_size(path: str) -> int: - """Return size in bytes.""" - totalbytes = 0 - for root, dirs, files in os.walk(path): - for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) - - return totalbytes - - -def can_bind(host: str, port: int) -> bool: - """ - Check whether a host:port is available to bind for listening - - Inspired by the can_bind() perl function used in Postgres tests, in - vendor/postgres/src/test/perl/PostgresNode.pm - """ - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the - # moment. If that changes, we should use start using SO_REUSEADDR here - # too, to allow reusing ports more quickly. - # See https://github.com/zenithdb/zenith/issues/801 - #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - - try: - sock.bind((host, port)) - sock.listen() - return True - except socket.error: - log.info(f"Port {port} is in use, skipping") - return False - - -class PortDistributor: - def __init__(self, base_port: int, port_number: int) -> None: - self.iterator = iter(range(base_port, base_port + port_number)) - - def get_port(self) -> int: - for port in self.iterator: - if can_bind("localhost", port): - return port - else: - raise RuntimeError( - 'port range configured for test is exhausted, consider enlarging the range') - - -@zenfixture -def port_distributor(worker_base_port): - return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) - - -class PgProtocol: - """ Reusable connection logic """ - def __init__(self, - host: str, - port: int, - username: Optional[str] = None, - password: Optional[str] = None): - self.host = host - self.port = port - self.username = username - self.password = password - - def connstr(self, - *, - dbname: str = 'postgres', - username: Optional[str] = None, - password: Optional[str] = None) -> str: - """ - Build a libpq connection string for the Postgres instance. - """ - - username = username or self.username - password = password or self.password - res = f'host={self.host} port={self.port} dbname={dbname}' - - if username: - res = f'{res} user={username}' - - if password: - res = f'{res} password={password}' - - return res - - # autocommit=True here by default because that's what we need most of the time - def connect(self, - *, - autocommit=True, - dbname: str = 'postgres', - username: Optional[str] = None, - password: Optional[str] = None) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - - conn = psycopg2.connect(self.connstr( - dbname=dbname, - username=username, - password=password, - )) - # WARNING: this setting affects *all* tests! 
- conn.autocommit = autocommit - return conn - - async def connect_async(self, - *, - dbname: str = 'postgres', - username: Optional[str] = None, - password: Optional[str] = None) -> asyncpg.Connection: - """ - Connect to the node from async python. - Returns asyncpg's connection object. - """ - - conn = await asyncpg.connect( - host=self.host, - port=self.port, - database=dbname, - user=username or self.username, - password=password, - ) - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - cur.execute(query) - if cur.description is None: - return [] # query didn't return data - return cast(List[Any], cur.fetchall()) - - -@dataclass -class AuthKeys: - pub: bytes - priv: bytes - - def generate_management_token(self): - token = jwt.encode({"scope": "pageserverapi"}, self.priv, algorithm="RS256") - - # jwt.encode can return 'bytes' or 'str', depending on Python version or type - # hinting or something (not sure what). If it returned 'bytes', convert it to 'str' - # explicitly. - if isinstance(token, bytes): - token = token.decode() - - return token - - def generate_tenant_token(self, tenant_id): - token = jwt.encode({ - "scope": "tenant", "tenant_id": tenant_id - }, - self.priv, - algorithm="RS256") - - if isinstance(token, bytes): - token = token.decode() - - return token - - -class MockS3Server: - """ - Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. - Relies that `poetry` and `moto` server are installed, since it's the way the tests are run. - - Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. - """ - def __init__( - self, - port: int, - ): - self.port = port - - self.subprocess = subprocess.Popen([f'poetry run moto_server s3 -p{port}'], shell=True) - error = None - try: - return_code = self.subprocess.poll() - if return_code is not None: - error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - except Exception as e: - error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - if error is not None: - log.error(error) - self.subprocess.kill() - raise RuntimeError("failed to start s3 mock server") - - def endpoint(self) -> str: - return f"http://127.0.0.1:{self.port}" - - def region(self) -> str: - return 'us-east-1' - - def access_key(self) -> str: - return 'test' - - def secret_key(self) -> str: - return 'test' - - def kill(self): - self.subprocess.kill() - - -class ZenithEnvBuilder: - """ - Builder object to create a Zenith runtime environment - - You should use the `zenith_env_builder` or `zenith_simple_env` pytest - fixture to create the ZenithEnv object. That way, the repository is - created in the right directory, based on the test name, and it's properly - cleaned up after the test has finished. 
- """ - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - pageserver_remote_storage: Optional[RemoteStorage] = None, - pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 0, - pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None): - self.repo_dir = repo_dir - self.rust_log_override = rust_log_override - self.port_distributor = port_distributor - self.pageserver_remote_storage = pageserver_remote_storage - self.pageserver_config_override = pageserver_config_override - self.num_safekeepers = num_safekeepers - self.pageserver_auth_enabled = pageserver_auth_enabled - self.env: Optional[ZenithEnv] = None - - self.s3_mock_server: Optional[MockS3Server] = None - - if os.getenv('FORCE_MOCK_S3') is not None: - bucket_name = f'{repo_dir.name}_bucket' - log.warning(f'Unconditionally initializing mock S3 server for bucket {bucket_name}') - self.enable_s3_mock_remote_storage(bucket_name) - - def init(self) -> ZenithEnv: - # Cannot create more than one environment from one builder - assert self.env is None, "environment already initialized" - self.env = ZenithEnv(self) - return self.env - - """ - Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. - Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. - """ - - def enable_local_fs_remote_storage(self, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - self.pageserver_remote_storage = LocalFsStorage( - Path(self.repo_dir / 'local_fs_remote_storage')) - - """ - Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. - Starts up the mock server, if that does not run yet. - Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. - """ - - def enable_s3_mock_remote_storage(self, bucket_name: str, force_enable=True): - assert force_enable or self.pageserver_remote_storage is None, "remote storage is enabled already" - if not self.s3_mock_server: - self.s3_mock_server = MockS3Server(self.port_distributor.get_port()) - - mock_endpoint = self.s3_mock_server.endpoint() - mock_region = self.s3_mock_server.region() - mock_access_key = self.s3_mock_server.access_key() - mock_secret_key = self.s3_mock_server.secret_key() - boto3.client( - 's3', - endpoint_url=mock_endpoint, - region_name=mock_region, - aws_access_key_id=mock_access_key, - aws_secret_access_key=mock_secret_key, - ).create_bucket(Bucket=bucket_name) - self.pageserver_remote_storage = S3Storage(bucket=bucket_name, - endpoint=mock_endpoint, - region=mock_region, - access_key=mock_access_key, - secret_key=mock_secret_key) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - - # Stop all the nodes. - if self.env: - log.info('Cleaning up all storage and compute nodes') - self.env.postgres.stop_all() - for sk in self.env.safekeepers: - sk.stop(immediate=True) - self.env.pageserver.stop(immediate=True) - if self.s3_mock_server: - self.s3_mock_server.kill() - - -class ZenithEnv: - """ - An object representing the Zenith runtime environment. It consists of - the page server, 0-N safekeepers, and the compute nodes. - - ZenithEnv contains functions for stopping/starting nodes in the - environment, checking their status, creating tenants, connecting to the - nodes, creating and destroying compute nodes, etc. 
The page server and - the safekeepers are considered fixed in the environment, you cannot - create or destroy them after the environment is initialized. (That will - likely change in the future, as we start supporting multiple page - servers and adding/removing safekeepers on the fly). - - Some notable functions and fields in ZenithEnv: - - postgres - A factory object for creating postgres compute nodes. - - pageserver - An object that contains functions for manipulating and - connecting to the pageserver - - safekeepers - An array containing objects representing the safekeepers - - pg_bin - pg_bin.run() can be used to execute Postgres client binaries, - like psql or pg_dump - - initial_tenant - tenant ID of the initial tenant created in the repository - - zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool - - create_tenant() - initializes a new tenant in the page server, returns - the tenant id - """ - def __init__(self, config: ZenithEnvBuilder): - self.repo_dir = config.repo_dir - self.rust_log_override = config.rust_log_override - self.port_distributor = config.port_distributor - self.s3_mock_server = config.s3_mock_server - self.zenith_cli = ZenithCli(env=self) - - self.postgres = PostgresFactory(self) - - self.safekeepers: List[Safekeeper] = [] - - # generate initial tenant ID here instead of letting 'zenith init' generate it, - # so that we don't need to dig it out of the config file afterwards. - self.initial_tenant = uuid.uuid4() - - # Create a config file corresponding to the options - toml = textwrap.dedent(f""" - default_tenantid = '{self.initial_tenant.hex}' - """) - - # Create config for pageserver - pageserver_port = PageserverPort( - pg=self.port_distributor.get_port(), - http=self.port_distributor.get_port(), - ) - pageserver_auth_type = "ZenithJWT" if config.pageserver_auth_enabled else "Trust" - - toml += textwrap.dedent(f""" - [pageserver] - listen_pg_addr = 'localhost:{pageserver_port.pg}' - listen_http_addr = 'localhost:{pageserver_port.http}' - auth_type = '{pageserver_auth_type}' - """) - - # Create a corresponding ZenithPageserver object - self.pageserver = ZenithPageserver(self, - port=pageserver_port, - remote_storage=config.pageserver_remote_storage, - config_override=config.pageserver_config_override) - - # Create config and a Safekeeper object for each safekeeper - for i in range(1, config.num_safekeepers + 1): - port = SafekeeperPort( - pg=self.port_distributor.get_port(), - http=self.port_distributor.get_port(), - ) - - if config.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - toml += f""" -[[safekeepers]] -name = '{name}' -pg_port = {port.pg} -http_port = {port.http} -sync = false # Disable fsyncs to make the tests go faster - """ - safekeeper = Safekeeper(env=self, name=name, port=port) - self.safekeepers.append(safekeeper) - - log.info(f"Config: {toml}") - - self.zenith_cli.init(toml) - - # Start up the page server and all the safekeepers - self.pageserver.start() - - for safekeeper in self.safekeepers: - safekeeper.start() - - def get_safekeeper_connstrs(self) -> str: - """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """ - return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers]) - - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - if tenant_id is None: - tenant_id = uuid.uuid4() - self.zenith_cli.create_tenant(tenant_id) - return tenant_id - - @cached_property - def auth_keys(self) -> AuthKeys: - pub = (Path(self.repo_dir) / 
'auth_public_key.pem').read_bytes() - priv = (Path(self.repo_dir) / 'auth_private_key.pem').read_bytes() - return AuthKeys(pub=pub, priv=priv) - - -@zenfixture -def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]: - """ - Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `zenith_simple_env`. - """ - - if os.environ.get('TEST_SHARED_FIXTURES') is None: - # Create the environment in the per-test output directory - repo_dir = os.path.join(get_test_output_dir(request), "repo") - else: - # We're running shared fixtures. Share a single directory. - repo_dir = os.path.join(str(top_output_dir), "shared_repo") - shutil.rmtree(repo_dir, ignore_errors=True) - - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - - env = builder.init() - - # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch("empty", "main") - - # Return the builder to the caller - yield env - - -@pytest.fixture(scope='function') -def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: - """ - Simple Zenith environment, with no authentication and no safekeepers. - - If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'zenith_simple_env', keeping the - page server and safekeepers running. Any compute nodes are stopped after - each the test, however. - """ - yield _shared_simple_env - - _shared_simple_env.postgres.stop_all() - if _shared_simple_env.s3_mock_server: - _shared_simple_env.s3_mock_server.kill() - - -@pytest.fixture(scope='function') -def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvBuilder]: - """ - Fixture to create a Zenith environment for test. - - To use, define 'zenith_env_builder' fixture in your test to get access to the - builder object. Set properties on it to describe the environment. - Finally, initialize and start up the environment by calling - zenith_env_builder.init(). - - After the initialization, you can launch compute nodes by calling - the functions in the 'env.postgres' factory object, stop/start the - nodes, etc. 
- """ - - # Create the environment in the test-specific output dir - repo_dir = os.path.join(test_output_dir, "repo") - - # Return the builder to the caller - with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder: - yield builder - - -class ZenithPageserverApiException(Exception): - pass - - -class ZenithPageserverHttpClient(requests.Session): - def __init__(self, port: int, auth_token: Optional[str] = None) -> None: - super().__init__() - self.port = port - self.auth_token = auth_token - - if auth_token is not None: - self.headers['Authorization'] = f'Bearer {auth_token}' - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()['msg'] - except: - msg = '' - raise ZenithPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", ) - self.verbose_error(res) - - def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", ) - self.verbose_error(res) - - def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/branch", - json={ - 'tenant_id': tenant_id.hex, - 'name': name, - 'start_point': start_point, - }) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1", - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_list(self) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, tenant_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json={ - 'tenant_id': tenant_id.hex, - }, - ) - self.verbose_error(res) - return res.json() - - def timeline_list(self, tenant_id: uuid.UUID) -> List[str]: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def get_metrics(self) -> str: - res = self.get(f"http://localhost:{self.port}/metrics") - self.verbose_error(res) - return res.text - - -@dataclass -class PageserverPort: - pg: int - http: int - - -@dataclass -class LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket: str - region: str - 
access_key: Optional[str] - secret_key: Optional[str] - endpoint: Optional[str] - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - - -class ZenithCli: - """ - A typed wrapper around the `zenith` CLI tool. - Supports main commands via typed methods and a way to run arbitrary command directly via CLI. - """ - def __init__(self, env: ZenithEnv) -> None: - self.env = env - pass - - def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - if tenant_id is None: - tenant_id = uuid.uuid4() - self.raw_cli(['tenant', 'create', tenant_id.hex]) - return tenant_id - - def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['tenant', 'list']) - - def create_branch(self, - branch_name: str, - starting_point: str, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - args.extend([branch_name, starting_point]) - - return self.raw_cli(args) - - def list_branches(self, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - return self.raw_cli(args) - - def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': - with tempfile.NamedTemporaryFile(mode='w+') as tmp: - tmp.write(config_toml) - tmp.flush() - - cmd = ['init', f'--config={tmp.name}'] - append_pageserver_param_overrides(cmd, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) - - return self.raw_cli(cmd) - - def pageserver_start(self) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start'] - append_pageserver_param_overrides(start_args, - self.env.pageserver.remote_storage, - self.env.pageserver.config_override) - return self.raw_cli(start_args) - - def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': - cmd = ['pageserver', 'stop'] - if immediate: - cmd.extend(['-m', 'immediate']) - - log.info(f"Stopping pageserver with {cmd}") - return self.raw_cli(cmd) - - def safekeeper_start(self, name: str) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', name]) - - def safekeeper_stop(self, - name: Optional[str] = None, - immediate=False) -> 'subprocess.CompletedProcess[str]': - args = ['safekeeper', 'stop'] - if immediate: - args.extend(['-m', 'immediate']) - if name is not None: - args.append(name) - return self.raw_cli(args) - - def pg_create( - self, - node_name: str, - tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, - port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'create'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - if port is not None: - args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - return self.raw_cli(args) - - def pg_start( - self, - node_name: str, - tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, - port: Optional[int] = None, - ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'start'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - if port is not None: - args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - - return self.raw_cli(args) - - def pg_stop( - self, - node_name: str, - tenant_id: Optional[uuid.UUID] = None, - destroy=False, - ) 
-> 'subprocess.CompletedProcess[str]': - args = ['pg', 'stop'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - if destroy: - args.append('--destroy') - args.append(node_name) - - return self.raw_cli(args) - - def raw_cli(self, - arguments: List[str], - check_return_code=True) -> 'subprocess.CompletedProcess[str]': - """ - Run "zenith" with the specified arguments. - - Arguments must be in list form, e.g. ['pg', 'create'] - - Return both stdout and stderr, which can be accessed as - - >>> result = env.zenith_cli.raw_cli(...) - >>> assert result.stderr == "" - >>> log.info(result.stdout) - """ - - assert type(arguments) == list - - bin_zenith = os.path.join(str(zenith_binpath), 'zenith') - - args = [bin_zenith] + arguments - log.info('Running command "{}"'.format(' '.join(args))) - log.info(f'Running in "{self.env.repo_dir}"') - - env_vars = os.environ.copy() - env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir) - env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) - - if self.env.rust_log_override is not None: - env_vars['RUST_LOG'] = self.env.rust_log_override - - # Pass coverage settings - var = 'LLVM_PROFILE_FILE' - val = os.environ.get(var) - if val: - env_vars[var] = val - - # Intercept CalledProcessError and print more info - try: - res = subprocess.run(args, - env=env_vars, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - log.info(f"Run success: {res.stdout}") - except subprocess.CalledProcessError as exc: - # this way command output will be in recorded and shown in CI in failure message - msg = f"""\ - Run failed: {exc} - stdout: {exc.stdout} - stderr: {exc.stderr} - """ - log.info(msg) - - raise Exception(msg) from exc - - if check_return_code: - res.check_returncode() - return res - - -class ZenithPageserver(PgProtocol): - """ - An object representing a running pageserver. - - Initializes the repository via `zenith init`. - """ - def __init__(self, - env: ZenithEnv, - port: PageserverPort, - remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None, - enable_auth=False): - super().__init__(host='localhost', port=port.pg, username='zenith_admin') - self.env = env - self.running = False - self.service_port = port # do not shadow PgProtocol.port which is just int - self.remote_storage = remote_storage - self.config_override = config_override - - def start(self) -> 'ZenithPageserver': - """ - Start the page server. - Returns self. - """ - assert self.running == False - - self.env.zenith_cli.pageserver_start() - self.running = True - return self - - def stop(self, immediate=False) -> 'ZenithPageserver': - """ - Stop the page server. - Returns self. 
- """ - if self.running: - self.env.zenith_cli.pageserver_stop(immediate) - self.running = False - - return self - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - self.stop(True) - - def http_client(self, auth_token: Optional[str] = None) -> ZenithPageserverHttpClient: - return ZenithPageserverHttpClient( - port=self.service_port.http, - auth_token=auth_token, - ) - - -def append_pageserver_param_overrides( - params_to_update: List[str], - pageserver_remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if pageserver_remote_storage is not None: - if isinstance(pageserver_remote_storage, LocalFsStorage): - pageserver_storage_override = f"local_path='{pageserver_remote_storage.root}'" - elif isinstance(pageserver_remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{pageserver_remote_storage.bucket}',\ - bucket_region='{pageserver_remote_storage.region}'" - - if pageserver_remote_storage.access_key is not None: - pageserver_storage_override += f",access_key_id='{pageserver_remote_storage.access_key}'" - if pageserver_remote_storage.secret_key is not None: - pageserver_storage_override += f",secret_access_key='{pageserver_remote_storage.secret_key}'" - if pageserver_remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{pageserver_remote_storage.endpoint}'" - - else: - raise Exception(f'Unknown storage configuration {pageserver_remote_storage}') - params_to_update.append( - f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') - - env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') - if env_overrides is not None: - params_to_update += [ - f'--pageserver-config-override={o.strip()}' for o in env_overrides.split(';') - ] - - if pageserver_config_override is not None: - params_to_update += [ - f'--pageserver-config-override={o.strip()}' - for o in pageserver_config_override.split(';') - ] - - -class PgBin: - """ A helper class for executing postgres binaries """ - def __init__(self, log_dir: str): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') - self.env = os.environ.copy() - self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') - - def _fixpath(self, command: List[str]) -> None: - if '/' not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None: - """ - Run one of the postgres binaries. - - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - - All the necessary environment variables will be set. - - If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - - If you want stdout/stderr captured to files, use `run_capture` instead. - """ - - self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture(self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - - This is just like `run`, but for chatty programs. 
Returns basepath for files - with captured output. - """ - - self._fixpath(command) - log.info('Running command "{}"'.format(' '.join(command))) - env = self._build_env(env) - return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) - - -@pytest.fixture(scope='function') -def pg_bin(test_output_dir: str) -> PgBin: - return PgBin(test_output_dir) - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int): - super().__init__(host='localhost', port=port) - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) - - def configure(self, options: List[str]) -> None: - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: - conf_file.writelines(options) - - def start(self) -> None: - assert not self.running - self.running = True - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start']) - - def stop(self) -> None: - assert self.running - self.running = False - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) - - def get_subdir_size(self, subdir) -> int: - """Return size of pgdatadir subdirectory in bytes.""" - return get_dir_size(os.path.join(self.pgdatadir, subdir)) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -@pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]: - pgdatadir = os.path.join(test_output_dir, "pgdata-vanilla") - pg_bin = PgBin(test_output_dir) - with VanillaPostgres(pgdatadir, pg_bin, 5432) as vanilla_pg: - yield vanilla_pg - - -class ZenithProxy(PgProtocol): - def __init__(self, port: int): - super().__init__(host="127.0.0.1", username="pytest", password="pytest", port=port) - self.http_port = 7001 - self._popen: Optional[subprocess.Popen[bytes]] = None - - def start_static(self, addr="127.0.0.1:5432") -> None: - assert self._popen is None - - # Start proxy - bin_proxy = os.path.join(str(zenith_binpath), 'proxy') - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.port}"]) - args.extend(["--auth-method", "password"]) - args.extend(["--static-router", addr]) - self._popen = subprocess.Popen(args) - self._wait_until_ready() - - @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) - def _wait_until_ready(self): - requests.get(f"http://{self.host}:{self.http_port}/v1/status") - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self._popen is not None: - # NOTE the process will die when we're done with tests anyway, because - # it's a child process. This is mostly to clean up in between different tests. - self._popen.kill() - - -@pytest.fixture(scope='function') -def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: - """Zenith proxy that routes directly to vanilla postgres.""" - vanilla_pg.start() - vanilla_pg.safe_psql("create user pytest with password 'pytest';") - - with ZenithProxy(4432) as proxy: - proxy.start_static() - yield proxy - - -class Postgres(PgProtocol): - """ An object representing a running postgres daemon. 
""" - def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): - super().__init__(host='localhost', port=port, username='zenith_admin') - - self.env = env - self.running = False - self.node_name: Optional[str] = None # dubious, see asserts below - self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA - self.tenant_id = tenant_id - # path to conf is /pgdatadirs/tenants///postgresql.conf - - def create( - self, - node_name: str, - branch: Optional[str] = None, - config_lines: Optional[List[str]] = None, - ) -> 'Postgres': - """ - Create the pg data directory. - Returns self. - """ - - if not config_lines: - config_lines = [] - - if branch is None: - branch = node_name - - self.env.zenith_cli.pg_create(node_name, - tenant_id=self.tenant_id, - port=self.port, - timeline_spec=branch) - self.node_name = node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name - self.pgdata_dir = os.path.join(self.env.repo_dir, path) - - if config_lines is None: - config_lines = [] - self.config(config_lines) - - return self - - def start(self) -> 'Postgres': - """ - Start the Postgres instance. - Returns self. - """ - - assert self.node_name is not None - - log.info(f"Starting postgres node {self.node_name}") - - run_result = self.env.zenith_cli.pg_start(self.node_name, - tenant_id=self.tenant_id, - port=self.port) - self.running = True - - log.info(f"stdout: {run_result.stdout}") - - return self - - def pg_data_dir_path(self) -> str: - """ Path to data directory """ - assert self.node_name - path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name - return os.path.join(self.env.repo_dir, path) - - def pg_xact_dir_path(self) -> str: - """ Path to pg_xact dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_xact') - - def pg_twophase_dir_path(self) -> str: - """ Path to pg_twophase dir """ - return os.path.join(self.pg_data_dir_path(), 'pg_twophase') - - def config_file_path(self) -> str: - """ Path to postgresql.conf """ - return os.path.join(self.pg_data_dir_path(), 'postgresql.conf') - - def adjust_for_wal_acceptors(self, wal_acceptors: str) -> 'Postgres': - """ - Adjust instance config for working with wal acceptors instead of - pageserver (pre-configured by CLI) directly. - """ - - # TODO: reuse config() - with open(self.config_file_path(), "r") as f: - cfg_lines = f.readlines() - with open(self.config_file_path(), "w") as f: - for cfg_line in cfg_lines: - # walproposer uses different application_name - if ("synchronous_standby_names" in cfg_line or - # don't ask pageserver to fetch WAL from compute - "callmemaybe_connstring" in cfg_line or - # don't repeat wal_acceptors multiple times - "wal_acceptors" in cfg_line): - continue - f.write(cfg_line) - f.write("synchronous_standby_names = 'walproposer'\n") - f.write("wal_acceptors = '{}'\n".format(wal_acceptors)) - return self - - def config(self, lines: List[str]) -> 'Postgres': - """ - Add lines to postgresql.conf. - Lines should be an array of valid postgresql.conf rows. - Returns self. - """ - - with open(self.config_file_path(), 'a') as conf: - for line in lines: - conf.write(line) - conf.write('\n') - - return self - - def stop(self) -> 'Postgres': - """ - Stop the Postgres instance if it's running. - Returns self. 
- """ - - if self.running: - assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id) - self.running = False - - return self - - def stop_and_destroy(self) -> 'Postgres': - """ - Stop the Postgres instance, then destroy it. - Returns self. - """ - - assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True) - self.node_name = None - - return self - - def create_start( - self, - node_name: str, - branch: Optional[str] = None, - config_lines: Optional[List[str]] = None, - ) -> 'Postgres': - """ - Create a Postgres instance, apply config - and then start it. - Returns self. - """ - - self.create( - node_name=node_name, - branch=branch, - config_lines=config_lines, - ).start() - - return self - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - self.stop() - - -class PostgresFactory: - """ An object representing multiple running postgres daemons. """ - def __init__(self, env: ZenithEnv): - self.env = env - self.num_instances = 0 - self.instances: List[Postgres] = [] - - def create_start(self, - node_name: str = "main", - branch: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - config_lines: Optional[List[str]] = None) -> Postgres: - - pg = Postgres( - self.env, - tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), - ) - self.num_instances += 1 - self.instances.append(pg) - - return pg.create_start( - node_name=node_name, - branch=branch, - config_lines=config_lines, - ) - - def create(self, - node_name: str = "main", - branch: Optional[str] = None, - tenant_id: Optional[uuid.UUID] = None, - config_lines: Optional[List[str]] = None) -> Postgres: - - pg = Postgres( - self.env, - tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), - ) - - self.num_instances += 1 - self.instances.append(pg) - - return pg.create( - node_name=node_name, - branch=branch, - config_lines=config_lines, - ) - - def stop_all(self) -> 'PostgresFactory': - for pg in self.instances: - pg.stop() - - return self - - -def read_pid(path: Path) -> int: - """ Read content of file into number """ - return int(path.read_text()) - - -@dataclass -class SafekeeperPort: - pg: int - http: int - - -@dataclass -class Safekeeper: - """ An object representing a running safekeeper daemon. """ - env: ZenithEnv - port: SafekeeperPort - name: str # identifier for logging - auth_token: Optional[str] = None - - def start(self) -> 'Safekeeper': - self.env.zenith_cli.safekeeper_start(self.name) - - # wait for wal acceptor start by checking its status - started_at = time.time() - while True: - try: - http_cli = self.http_client() - http_cli.check_status() - except Exception as e: - elapsed = time.time() - started_at - if elapsed > 3: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for wal acceptor start: {e}") - time.sleep(0.5) - else: - break # success - return self - - def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.name)) - self.env.zenith_cli.safekeeper_stop(self.name, immediate) - return self - - def append_logical_message(self, - tenant_id: uuid.UUID, - timeline_id: uuid.UUID, - request: Dict[str, Any]) -> Dict[str, Any]: - """ - Send JSON_CTRL query to append LogicalMessage to WAL and modify - safekeeper state. It will construct LogicalMessage from provided - prefix and message, and then will write it to WAL. 
- """ - - # "replication=0" hacks psycopg not to send additional queries - # on startup, see https://github.com/psycopg/psycopg2/pull/482 - connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id.hex} ztenantid={tenant_id.hex}'" - - with closing(psycopg2.connect(connstr)) as conn: - # server doesn't support transactions - conn.autocommit = True - with conn.cursor() as cur: - request_json = json.dumps(request) - log.info(f"JSON_CTRL request on port {self.port.pg}: {request_json}") - cur.execute("JSON_CTRL " + request_json) - all = cur.fetchall() - log.info(f"JSON_CTRL response: {all[0][0]}") - res = json.loads(all[0][0]) - assert isinstance(res, dict) - return res - - def http_client(self) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http) - - -@dataclass -class SafekeeperTimelineStatus: - acceptor_epoch: int - flush_lsn: str - - -@dataclass -class SafekeeperMetrics: - # These are metrics from Prometheus which uses float64 internally. - # As a consequence, values may differ from real original int64s. - flush_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[str, str], int] = field(default_factory=dict) - - -class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int) -> None: - super().__init__() - self.port = port - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def timeline_status(self, tenant_id: str, timeline_id: str) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id}/{timeline_id}") - res.raise_for_status() - resj = res.json() - return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], - flush_lsn=resj['flush_lsn']) - - def get_metrics(self) -> SafekeeperMetrics: - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - all_metrics_text = request_result.text - - metrics = SafekeeperMetrics() - for match in re.finditer( - r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): - metrics.flush_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE): - metrics.commit_lsn_inexact[(match.group(1), match.group(2))] = int(match.group(3)) - return metrics - - -def get_test_output_dir(request: Any) -> str: - """ Compute the working directory for an individual test. """ - test_name = request.node.name - test_dir = os.path.join(str(top_output_dir), test_name) - log.info(f'get_test_output_dir is {test_dir}') - return test_dir - - -# This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. It also solves a problem with the -# zenith_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it -# creates the repo in the test output directory. But it cannot depend on -# 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, -# it has 'session' scope and cannot access fixtures with 'function' -# scope. So it uses the get_test_output_dir() function to get the path, and -# this fixture ensures that the directory exists. That works because -# 'autouse' fixtures are run before other fixtures. 
-@pytest.fixture(scope='function', autouse=True) -def test_output_dir(request: Any) -> str: - """ Create the working directory for an individual test. """ - - # one directory per test - test_dir = get_test_output_dir(request) - log.info(f'test_output_dir is {test_dir}') - shutil.rmtree(test_dir, ignore_errors=True) - mkdir_if_needed(test_dir) - return test_dir - - -SKIP_DIRS = frozenset(('pg_wal', 'pg_stat', 'pg_stat_tmp', 'pg_subtrans', 'pg_logical')) - -SKIP_FILES = frozenset(('pg_internal.init', - 'pg.log', - 'zenith.signal', - 'postgresql.conf', - 'postmaster.opts', - 'postmaster.pid', - 'pg_control')) - - -def should_skip_dir(dirname: str) -> bool: - return dirname in SKIP_DIRS - - -def should_skip_file(filename: str) -> bool: - if filename in SKIP_FILES: - return True - # check for temp table files according to https://www.postgresql.org/docs/current/storage-file-layout.html - # i e "tBBB_FFF" - if not filename.startswith('t'): - return False - - tmp_name = filename[1:].split('_') - if len(tmp_name) != 2: - return False - - try: - list(map(int, tmp_name)) - except: - return False - return True - - -# -# Test helpers -# -def list_files_to_compare(pgdata_dir: str): - pgdata_files = [] - for root, _file, filenames in os.walk(pgdata_dir): - for filename in filenames: - rel_dir = os.path.relpath(root, pgdata_dir) - # Skip some dirs and files we don't want to compare - if should_skip_dir(rel_dir) or should_skip_file(filename): - continue - rel_file = os.path.join(rel_dir, filename) - pgdata_files.append(rel_file) - - pgdata_files.sort() - log.info(pgdata_files) - return pgdata_files - - -# pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): - - # Get the timeline ID of our branch. We need it for the 'basebackup' command - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - - # stop postgres to ensure that files won't change - pg.stop() - - # Take a basebackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir") - mkdir_if_needed(restored_dir_path) - - pg_bin = PgBin(test_output_dir) - psql_path = os.path.join(pg_bin.pg_bin_path, 'psql') - - cmd = rf""" - {psql_path} \ - --no-psqlrc \ - postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {pg.tenant_id.hex} {timeline}' \ - | tar -x -C {restored_dir_path} - """ - - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. - psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} - result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True) - - # Print captured stdout/stderr if basebackup cmd failed. 
-    if result.returncode != 0:
-        log.error('Basebackup shell command failed with:')
-        log.error(result.stdout)
-        log.error(result.stderr)
-    assert result.returncode == 0
-
-    # list files we're going to compare
-    assert pg.pgdata_dir
-    pgdata_files = list_files_to_compare(pg.pgdata_dir)
-    restored_files = list_files_to_compare(restored_dir_path)
-
-    # check that file sets are equal
-    assert pgdata_files == restored_files
-
-    # compare content of the files
-    # filecmp returns (match, mismatch, error) lists
-    # We've already filtered all mismatching files in list_files_to_compare(),
-    # so here expect that the content is identical
-    (match, mismatch, error) = filecmp.cmpfiles(pg.pgdata_dir,
-                                                restored_dir_path,
-                                                pgdata_files,
-                                                shallow=False)
-    log.info(f'filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}')
-
-    for f in mismatch:
-
-        f1 = os.path.join(pg.pgdata_dir, f)
-        f2 = os.path.join(restored_dir_path, f)
-        stdout_filename = "{}.filediff".format(f2)
-
-        with open(stdout_filename, 'w') as stdout_f:
-            subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True)
-            subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True)
-
-            cmd = 'diff {}.hex {}.hex'.format(f1, f2)
-            subprocess.run([cmd], stdout=stdout_f, shell=True)
-
-    assert (mismatch, error) == ([], [])
diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md
new file mode 100644
index 0000000000..725612853a
--- /dev/null
+++ b/test_runner/performance/README.md
@@ -0,0 +1,47 @@
+# Running locally
+
+First, make a release build. The profiling flag is optional, used only for tests that
+generate flame graphs. The `-s` flag just silences a lot of output, and makes it
+easier to see if you have compile errors without scrolling up.
+`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
+
+NOTE: the `profiling` flag only works on Linux because we use Linux-specific
+libc APIs like `libc::timer_t`.
+
+Then run the tests:
+`NEON_BIN=./target/release poetry run pytest test_runner/performance`
+
+Some handy pytest flags for local development:
+- `-x` tells pytest to stop on first error
+- `-s` shows test output
+- `-k` selects a test to run
+- `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
+
+# What performance tests do we have and how we run them
+
+Performance tests are built using the same infrastructure as our usual Python integration tests. There are some extra fixtures that help to collect performance metrics, and to run tests against both vanilla PostgreSQL and Neon for comparison.
+
+## Tests that are run against a local installation
+
+Most of the performance tests run against a local installation. This is not very representative of a production environment. Firstly, Postgres, safekeeper(s) and the pageserver have to share CPU and I/O resources, which can add noise to the results. Secondly, network overhead is eliminated.
+
+In the CI, the performance tests are run in the same environment as the other integration tests. We don't have control over the host that the CI runs on, so the environment may vary widely from one run to another, which makes the results across different runs noisy to compare.
+
+## Remote tests
+
+There are a few tests that are marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres-compatible database.
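+
+For example, a remote run could look roughly like the sketch below. This is only a sketch: it assumes the fixtures read the connection string from a `BENCHMARK_CONNSTR` environment variable and that the tests are selected via the `remote_cluster` marker; check the fixtures and `setup.cfg` for the exact names.
+`BENCHMARK_CONNSTR="postgres://user:password@remote-host:5432/neondb" poetry run pytest -m remote_cluster test_runner/performance`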
+
+## Noise
+
+Each test currently runs only once. To obtain more consistent performance numbers, a test should be repeated multiple times and the results aggregated, for example by taking the min, max, average, or median.
+
+## Results collection
+
+Local test results for the main branch, and the results of the daily performance tests, are stored in a Neon project deployed in the production environment. There is a [Grafana dashboard](https://observer.zenith.tech/d/DGKBm9Jnz/perf-test-results?orgId=1) that visualizes the results. Its main limitation is that it cannot point at a particular commit, even though the data for that is available in the database; it needs some tweaking from someone who knows Grafana well.
+
+There is also an inconsistency in test naming. A test's name should be the same across platforms, with the results differentiated by the platform field. Currently, however, the platform is sometimes baked into the test name because of the way parametrization works in pytest. For example, the dashboard has a platform switch with `neon-local-ci` and `neon-staging` variants, yet some tests under the `neon-local-ci` value are displayed as `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]` and `Test test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]`, which is confusing.
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
new file mode 100644
index 0000000000..4b109c150f
--- /dev/null
+++ b/test_runner/performance/test_branch_creation.py
@@ -0,0 +1,152 @@
+import random
+import statistics
+import threading
+import time
+import timeit
+from contextlib import closing
+from typing import List
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport
+from fixtures.compare_fixtures import NeonCompare
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import wait_for_last_record_lsn
+from fixtures.types import Lsn
+
+
+def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]):
+    neon_compare.zenbenchmark.record(
+        "branch_creation_duration_max", max(durs), "s", MetricReport.LOWER_IS_BETTER
+    )
+    neon_compare.zenbenchmark.record(
+        "branch_creation_duration_avg", statistics.mean(durs), "s", MetricReport.LOWER_IS_BETTER
+    )
+    neon_compare.zenbenchmark.record(
+        "branch_creation_duration_stdev", statistics.stdev(durs), "s", MetricReport.LOWER_IS_BETTER
+    )
+
+
+@pytest.mark.parametrize("n_branches", [20])
+# This test measures the latency of branch creation during a heavy [1] workload.
+#
+# [1]: to simulate a heavy workload, the test tweaks the GC and compaction settings
+# to increase the frequency of those tasks. The test runs `pgbench` in each new branch.
+# Each branch is created from a randomly picked source branch.
+def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "5 s", + "gc_horizon": f"{4 * 1024 ** 2}", + "checkpoint_distance": f"{2 * 1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + "compaction_threshold": "2", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) + + def run_pgbench(branch: str): + log.info(f"Start a pgbench workload on branch {branch}") + + pg = env.postgres.create_start(branch, tenant_id=tenant) + connstr = pg.connstr() + + pg_bin.run_capture(["pgbench", "-i", connstr]) + pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr]) + + pg.stop() + + env.neon_cli.create_branch("b0", tenant_id=tenant) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=("b0",), daemon=True)) + threads[-1].start() + + branch_creation_durations = [] + for i in range(n_branches): + time.sleep(1.0) + + # random a source branch + p = random.randint(0, i) + + timer = timeit.default_timer() + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + dur = timeit.default_timer() - timer + + log.info(f"Creating branch b{i+1} took {dur}s") + branch_creation_durations.append(dur) + + threads.append(threading.Thread(target=run_pgbench, args=(f"b{i+1}",), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +@pytest.mark.parametrize("n_branches", [1024]) +# Test measures the latency of branch creation when creating a lot of branches. +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): + env = neon_compare.env + + env.neon_cli.create_branch("b0") + + pg = env.postgres.create_start("b0") + neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()]) + + branch_creation_durations = [] + + for i in range(n_branches): + # random a source branch + p = random.randint(0, i) + timer = timeit.default_timer() + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + dur = timeit.default_timer() - timer + branch_creation_durations.append(dur) + + _record_branch_creation_durations(neon_compare, branch_creation_durations) + + +# Test measures the branch creation time when branching from a timeline with a lot of relations. +# +# This test measures the latency of branch creation under two scenarios +# 1. The ancestor branch is not under any workloads +# 2. The ancestor branch is under a workload (busy) +# +# To simulate the workload, the test runs a concurrent insertion on the ancestor branch right before branching. 
+def test_branch_creation_many_relations(neon_compare: NeonCompare): + env = neon_compare.env + + timeline_id = env.neon_cli.create_branch("root") + + pg = env.postgres.create_start("root") + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + for i in range(10000): + cur.execute(f"CREATE TABLE t{i} as SELECT g FROM generate_series(1, 1000) g") + + # Wait for the pageserver to finish processing all the pending WALs, + # as we don't want the LSN wait time to be included during the branch creation + flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + wait_for_last_record_lsn( + env.pageserver.http_client(), env.initial_tenant, timeline_id, flush_lsn + ) + + with neon_compare.record_duration("create_branch_time_not_busy_root"): + env.neon_cli.create_branch("child_not_busy", "root") + + # run a concurrent insertion to make the ancestor "busy" during the branch creation + thread = threading.Thread( + target=pg.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",) + ) + thread.start() + + with neon_compare.record_duration("create_branch_time_busy_root"): + env.neon_cli.create_branch("child_busy", "root") + + thread.join() diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py new file mode 100644 index 0000000000..0fe7306f87 --- /dev/null +++ b/test_runner/performance/test_branching.py @@ -0,0 +1,96 @@ +import timeit +from pathlib import Path +from typing import List + +from fixtures.benchmark_fixture import PgBenchRunResult +from fixtures.compare_fixtures import NeonCompare +from fixtures.neon_fixtures import fork_at_current_lsn +from performance.test_perf_pgbench import utc_now_timestamp + +# ----------------------------------------------------------------------- +# Start of `test_compare_child_and_root_*` tests +# ----------------------------------------------------------------------- + +# `test_compare_child_and_root_*` tests compare the performance of a branch and its child branch(s). +# A common pattern in those tests is initializing a root branch then creating a child branch(s) from the root. +# Each test then runs a similar workload for both child branch and root branch. Each measures and reports +# some latencies/metrics during the workload for performance comparison between a branch and its ancestor. 
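The three tests below all follow that skeleton. As a condensed, hypothetical sketch (it uses only the `NeonCompare` helpers that the real tests below call, and the test name is made up), the pattern looks roughly like this:

```python
from fixtures.compare_fixtures import NeonCompare


def test_compare_child_and_root_sketch(neon_compare: NeonCompare):
    env = neon_compare.env

    # Initialize the root branch and give it some data.
    env.neon_cli.create_branch("root")
    pg_root = env.postgres.create_start("root")
    pg_root.safe_psql("CREATE TABLE foo AS SELECT generate_series(1, 1000000) AS i")

    # Fork a child branch from the root and start a compute node on it.
    env.neon_cli.create_branch("child", "root")
    pg_child = env.postgres.create_start("child")

    # Run the same workload on both branches, recording one duration metric per
    # branch so the child's performance can be compared against its ancestor's.
    with neon_compare.record_duration("root_run_duration"):
        pg_root.safe_psql("SELECT count(*) FROM foo")
    with neon_compare.record_duration("child_run_duration"):
        pg_child.safe_psql("SELECT count(*) FROM foo")
```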
+ + +def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare): + env = neon_compare.env + pg_bin = neon_compare.pg_bin + + def run_pgbench_on_branch(branch: str, cmd: List[str]): + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = pg_bin.run_capture( + cmd, + ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + run_end_timestamp=run_end_timestamp, + ) + neon_compare.zenbenchmark.record_pg_bench_result(branch, res) + + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"]) + + fork_at_current_lsn(env, pg_root, "child", "root") + + pg_child = env.postgres.create_start("child") + + run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()]) + run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", pg_child.connstr()]) + + +def test_compare_child_and_root_write_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + + pg_root.safe_psql( + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", + ) + + env.neon_cli.create_branch("child", "root") + pg_child = env.postgres.create_start("child") + + with neon_compare.record_duration("root_run_duration"): + pg_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + with neon_compare.record_duration("child_run_duration"): + pg_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)") + + +def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): + env = neon_compare.env + env.neon_cli.create_branch("root") + pg_root = env.postgres.create_start("root") + + pg_root.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", + "INSERT INTO foo SELECT FROM generate_series(1,1000000)", + ] + ) + + env.neon_cli.create_branch("child", "root") + pg_child = env.postgres.create_start("child") + + with neon_compare.record_duration("root_run_duration"): + pg_root.safe_psql("SELECT count(*) from foo") + with neon_compare.record_duration("child_run_duration"): + pg_child.safe_psql("SELECT count(*) from foo") + + +# ----------------------------------------------------------------------- +# End of `test_compare_child_and_root_*` tests +# ----------------------------------------------------------------------- diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 4e73bedcc0..d6e67aa361 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,8 +1,6 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare + +from fixtures.compare_fixtures import PgCompare # @@ -15,17 +13,16 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare # 3. Disk space used # 4. 
Peak memory usage # -def test_bulk_insert(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_bulk_insert(neon_with_baseline: PgCompare): + env = neon_with_baseline - # Get the timeline ID of our branch. We need it for the 'do_gc' command with closing(env.pg.connect()) as conn: with conn.cursor() as cur: cur.execute("create table huge (i int, j int);") # Run INSERT, recording the time and I/O it takes - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('insert'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("insert"): cur.execute("insert into huge values (generate_series(1, 5000000), 0);") env.flush() diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 6fd77f3020..cef7ce0c6b 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -1,8 +1,8 @@ import timeit -from fixtures.benchmark_fixture import MetricReport -import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. # @@ -12,38 +12,31 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder # 2. Average creation time per tenant -@pytest.mark.parametrize('tenants_count', [1, 5, 10]) -@pytest.mark.parametrize('use_wal_acceptors', ['with_wa', 'without_wa']) +@pytest.mark.parametrize("tenants_count", [1, 5, 10]) def test_bulk_tenant_create( - zenith_env_builder: ZenithEnvBuilder, - use_wal_acceptors: str, + neon_env_builder: NeonEnvBuilder, tenants_count: int, zenbenchmark, ): - """Measure tenant creation time (with and without wal acceptors)""" - if use_wal_acceptors == 'with_wa': - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() time_slices = [] for i in range(tenants_count): start = timeit.default_timer() - tenant = env.create_tenant() - env.zenith_cli.create_branch( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - "main", - tenant_id=tenant) + tenant, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline( + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant + ) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
- #if use_wal_acceptors == 'with_wa': + # if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - None, # branch name, None means same as node name - tenant, + f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant ) end = timeit.default_timer() @@ -51,7 +44,9 @@ def test_bulk_tenant_create( pg_tenant.stop() - zenbenchmark.record('tenant_creation_time', - sum(time_slices) / len(time_slices), - 's', - report=MetricReport.LOWER_IS_BETTER) + zenbenchmark.record( + "tenant_creation_time", + sum(time_slices) / len(time_slices), + "s", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py new file mode 100644 index 0000000000..d39ea55fbb --- /dev/null +++ b/test_runner/performance/test_compare_pg_stats.py @@ -0,0 +1,131 @@ +import os +import threading +import time +from typing import List + +import pytest +from fixtures.compare_fixtures import PgCompare +from fixtures.pg_stats import PgStatTable +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix + + +def get_seeds_matrix(default: int = 100): + seeds = os.getenv("TEST_PG_BENCH_SEEDS_MATRIX", default=str(default)) + return list(map(int, seeds.split(","))) + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_rw_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_rw: List[PgStatTable], +): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_rw): + env.pg_bin.run_capture( + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_wo_with_pgbench_simple_update( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wo: List[PgStatTable], +): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wo): + env.pg_bin.run_capture( + ["pgbench", "-N", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_ro_with_pgbench_select_only( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_ro: List[PgStatTable], +): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_ro): + env.pg_bin.run_capture( + ["pgbench", "-S", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def 
test_compare_pg_stats_wal_with_pgbench_default( + neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wal: List[PgStatTable], +): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wal): + env.pg_bin.run_capture( + ["pgbench", f"-T{duration}", f"--random-seed={seed}", env.pg.connstr()] + ) + env.flush() + + +@pytest.mark.parametrize("n_tables", [1, 10]) +@pytest.mark.parametrize("duration", get_durations_matrix(10)) +def test_compare_pg_stats_wo_with_heavy_write( + neon_with_baseline: PgCompare, n_tables: int, duration: int, pg_stats_wo: List[PgStatTable] +): + env = neon_with_baseline + with env.pg.connect().cursor() as cur: + for i in range(n_tables): + cur.execute( + f"CREATE TABLE t{i}(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + + def start_single_table_workload(table_id: int): + start = time.time() + with env.pg.connect().cursor() as cur: + while time.time() - start < duration: + cur.execute(f"INSERT INTO t{table_id} SELECT FROM generate_series(1,1000)") + + with env.record_pg_stats(pg_stats_wo): + threads = [ + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index e04a0361cb..01b2097112 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,10 +1,7 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare from io import BufferedReader, RawIOBase -from itertools import repeat + +from fixtures.compare_fixtures import PgCompare class CopyTestData(RawIOBase): @@ -27,9 +24,9 @@ class CopyTestData(RawIOBase): self.rownum += 1 # Number of bytes to read in this call - l = min(len(self.linebuf) - self.ptr, len(b)) + l = min(len(self.linebuf) - self.ptr, len(b)) # noqa: E741 - b[:l] = self.linebuf[self.ptr:(self.ptr + l)] + b[:l] = self.linebuf[self.ptr : (self.ptr + l)] self.ptr += l return l @@ -41,8 +38,8 @@ def copy_test_data(rows: int): # # COPY performance tests. # -def test_copy(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_copy(neon_with_baseline: PgCompare): + env = neon_with_baseline # Get the timeline ID of our branch. We need it for the pageserver 'checkpoint' command with closing(env.pg.connect()) as conn: @@ -52,19 +49,19 @@ def test_copy(zenith_with_baseline: PgCompare): # Load data with COPY, recording the time and I/O it takes. # # Since there's no data in the table previously, this extends it. - with env.record_pageserver_writes('copy_extend_pageserver_writes'): - with env.record_duration('copy_extend'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_extend_pageserver_writes"): + with env.record_duration("copy_extend"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() # Delete most rows, and VACUUM to make the space available for reuse. 
- with env.record_pageserver_writes('delete_pageserver_writes'): - with env.record_duration('delete'): + with env.record_pageserver_writes("delete_pageserver_writes"): + with env.record_duration("delete"): cur.execute("delete from copytest where i % 100 <> 0;") env.flush() - with env.record_pageserver_writes('vacuum_pageserver_writes'): - with env.record_duration('vacuum'): + with env.record_pageserver_writes("vacuum_pageserver_writes"): + with env.record_duration("vacuum"): cur.execute("vacuum copytest") env.flush() @@ -72,9 +69,9 @@ def test_copy(zenith_with_baseline: PgCompare): # by the VACUUM. # # This will also clear all the VM bits. - with env.record_pageserver_writes('copy_reuse_pageserver_writes'): - with env.record_duration('copy_reuse'): - cur.copy_from(copy_test_data(1000000), 'copytest') + with env.record_pageserver_writes("copy_reuse_pageserver_writes"): + with env.record_duration("copy_reuse"): + cur.copy_from(copy_test_data(1000000), "copytest") env.flush() env.report_peak_memory_use() diff --git a/test_runner/performance/test_dup_key.py b/test_runner/performance/test_dup_key.py new file mode 100644 index 0000000000..81752ae740 --- /dev/null +++ b/test_runner/performance/test_dup_key.py @@ -0,0 +1,54 @@ +from contextlib import closing + +import pytest +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ], +) +def test_dup_key(env: PgCompare): + # Update the same page many times, then measure read performance + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("drop table if exists t, f;") + + cur.execute("SET synchronous_commit=off") + cur.execute("SET statement_timeout=0") + + # Write many updates to the same row + with env.record_duration("write"): + cur.execute("create table t (i integer, filler text);") + cur.execute("insert into t values (0);") + cur.execute( + """ +do $$ +begin + for ivar in 1..5000000 loop + update t set i = ivar, filler = repeat('a', 50); + update t set i = ivar, filler = repeat('b', 50); + update t set i = ivar, filler = repeat('c', 50); + update t set i = ivar, filler = repeat('d', 50); + rollback; + end loop; +end; +$$; +""" + ) + + # Write 3-4 MB to evict t from compute cache + cur.execute("create table f (i integer);") + cur.execute("insert into f values (generate_series(1,100000));") + + # Read + with env.record_duration("read"): + cur.execute("select * from t;") + cur.fetchall() diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 92396f6cb7..311030b99d 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,9 +1,6 @@ -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare -from fixtures.log_helper import log + +from fixtures.compare_fixtures import PgCompare # @@ -11,8 +8,8 @@ from fixtures.log_helper import log # As of this writing, we're duplicate those giant WAL records for each page, # which 
makes the delta layer about 32x larger than it needs to be. # -def test_gist_buffering_build(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_gist_buffering_build(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: @@ -24,8 +21,8 @@ def test_gist_buffering_build(zenith_with_baseline: PgCompare): ) # Build the index. - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('build'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("build"): cur.execute( "create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)" ) diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py new file mode 100644 index 0000000000..aad6ee667a --- /dev/null +++ b/test_runner/performance/test_hot_page.py @@ -0,0 +1,39 @@ +from contextlib import closing + +import pytest +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ], +) +def test_hot_page(env: PgCompare): + # Update the same page many times, then measure read performance + num_writes = 1000000 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("drop table if exists t, f;") + + # Write many updates to the same row + with env.record_duration("write"): + cur.execute("create table t (i integer);") + cur.execute("insert into t values (0);") + for i in range(num_writes): + cur.execute(f"update t set i = {i};") + + # Write 3-4 MB to evict t from compute cache + cur.execute("create table f (i integer);") + cur.execute("insert into f values (generate_series(1,100000));") + + # Read + with env.record_duration("read"): + cur.execute("select * from t;") + cur.fetchall() diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py new file mode 100644 index 0000000000..2f519e152c --- /dev/null +++ b/test_runner/performance/test_hot_table.py @@ -0,0 +1,38 @@ +from contextlib import closing + +import pytest +from fixtures.compare_fixtures import PgCompare +from pytest_lazyfixture import lazy_fixture # type: ignore + + +@pytest.mark.parametrize( + "env", + [ + # The test is too slow to run in CI, but fast enough to run with remote tests + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), + pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), + pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), + ], +) +def test_hot_table(env: PgCompare): + # Update a small table many times, then measure read performance + num_rows = 100000 # Slightly larger than shared buffers size TODO validate + num_writes = 1000000 + num_reads = 10 + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("drop table if exists t;") + + # Write many updates to a small table + with env.record_duration("write"): + cur.execute("create table t (i integer primary key);") + cur.execute(f"insert into t values (generate_series(1,{num_rows}));") + for i 
in range(num_writes): + cur.execute(f"update t set i = {i + num_rows} WHERE i = {i};") + + # Read the table + with env.record_duration("read"): + for i in range(num_reads): + cur.execute("select * from t;") + cur.fetchall() diff --git a/test_runner/performance/test_latency.py b/test_runner/performance/test_latency.py new file mode 100644 index 0000000000..9aa618650d --- /dev/null +++ b/test_runner/performance/test_latency.py @@ -0,0 +1,29 @@ +import threading + +import pytest +from fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import Postgres +from performance.test_perf_pgbench import get_scales_matrix +from performance.test_wal_backpressure import record_read_latency + + +def start_write_workload(pg: Postgres, scale: int = 10): + with pg.connect().cursor() as cur: + cur.execute(f"create table big as select generate_series(1,{scale*100_000})") + + +# Measure latency of reads on one table, while lots of writes are happening on another table. +# The fine-grained tracking of last-written LSNs helps to keep the latency low. Without it, the reads would +# often need to wait for the WAL records of the unrelated writes to be processed by the pageserver. +@pytest.mark.parametrize("scale", get_scales_matrix(1)) +def test_measure_read_latency_heavy_write_workload(neon_with_baseline: PgCompare, scale: int): + env = neon_with_baseline + pg = env.pg + + with pg.connect().cursor() as cur: + cur.execute(f"create table small as select generate_series(1,{scale*100_000})") + + write_thread = threading.Thread(target=start_write_workload, args=(pg, scale * 100)) + write_thread.start() + + record_read_latency(env, lambda: write_thread.is_alive(), "SELECT count(*) from small") diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py new file mode 100644 index 0000000000..d71fb6d12c --- /dev/null +++ b/test_runner/performance/test_layer_map.py @@ -0,0 +1,39 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Benchmark searching the layer map, when there are a lot of small layer files. +# +def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_iters = 10 + n_records = 100000 + + # We want to have a lot of lot of layer files to exercise the layer map. Make + # gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files. 
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "8192", + "compaction_period": "1 s", + "compaction_threshold": "1", + "compaction_target_size": "8192", + } + ) + + env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant) + pg = env.postgres.create_start("test_layer_map", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t(x integer)") + for i in range(n_iters): + cur.execute(f"insert into t values (generate_series(1,{n_records}))") + time.sleep(1) + + cur.execute("vacuum t") + with zenbenchmark.record_duration("test_query"): + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (n_iters * n_records,) diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index e4388ce8e2..b4a25e0edc 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,10 +1,8 @@ -from io import BytesIO import asyncio -import asyncpg -from fixtures.zenith_fixtures import ZenithEnv, Postgres, PgProtocol -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from io import BytesIO + +from fixtures.compare_fixtures import PgCompare +from fixtures.neon_fixtures import PgProtocol async def repeat_bytes(buf, repetitions: int): @@ -16,7 +14,8 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) @@ -28,7 +27,7 @@ async def copy_test_data_to_table(pg: PgProtocol, worker_id: int, table_name: st async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest_{worker_id}') + worker = copy_test_data_to_table(pg, worker_id, f"copytest_{worker_id}") workers.append(asyncio.create_task(worker)) # await all workers @@ -36,17 +35,17 @@ async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): # Load 5 different tables in parallel with COPY TO -def test_parallel_copy_different_tables(zenith_with_baseline: PgCompare, n_parallel=5): +def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() for worker_id in range(n_parallel): - cur.execute(f'CREATE TABLE copytest_{worker_id} (i int, t text)') + cur.execute(f"CREATE TABLE copytest_{worker_id} (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): asyncio.run(parallel_load_different_tables(env.pg, n_parallel)) env.flush() @@ -57,7 +56,7 @@ def test_parallel_copy_different_tables(zenith_with_baseline: PgCompare, n_paral async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, "copytest") 
workers.append(asyncio.create_task(worker)) # await all workers @@ -65,15 +64,15 @@ async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy_same_table(zenith_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline +def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute("CREATE TABLE copytest (i int, t text)") - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('load'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("load"): asyncio.run(parallel_load_same_table(env.pg, n_parallel)) env.flush() diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 5ffce3c0be..0ed3e45971 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,30 +1,223 @@ -from contextlib import closing -from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +import calendar +import enum +import os +import timeit +from datetime import datetime +from pathlib import Path +from typing import Dict, List -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.log_helper import log +import pytest +from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult +from fixtures.compare_fixtures import NeonCompare, PgCompare +from fixtures.utils import get_scale_for_db + + +@enum.unique +class PgBenchLoadType(enum.Enum): + INIT = "init" + SIMPLE_UPDATE = "simple_update" + SELECT_ONLY = "select-only" + + +def utc_now_timestamp() -> int: + return calendar.timegm(datetime.utcnow().utctimetuple()) + + +def init_pgbench(env: PgCompare, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + # calculate timestamps and durations separately + # timestamp is intended to be used for linking to grafana and logs + # duration is actually a metric and uses float instead of int for timestamp + start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + with env.record_pageserver_writes("init.pageserver_writes"): + out = env.pg_bin.run_capture(cmdline, env=environ) + env.flush() + + duration = timeit.default_timer() - t0 + end_timestamp = utc_now_timestamp() + + stderr = Path(f"{out}.stderr").read_text() + + res = PgBenchInitResult.parse_from_stderr( + stderr=stderr, + duration=duration, + start_timestamp=start_timestamp, + end_timestamp=end_timestamp, + ) + env.zenbenchmark.record_pg_bench_init_result("init", res) + + +def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): + environ: Dict[str, str] = {} + if password is not None: + environ["PGPASSWORD"] = password + + with env.record_pageserver_writes(f"{prefix}.pageserver_writes"): + run_start_timestamp = utc_now_timestamp() + t0 = timeit.default_timer() + out = env.pg_bin.run_capture(cmdline, env=environ) + run_duration = timeit.default_timer() - t0 + run_end_timestamp = utc_now_timestamp() + env.flush() + + stdout = Path(f"{out}.stdout").read_text() + + res = PgBenchRunResult.parse_from_stdout( + stdout=stdout, + run_duration=run_duration, + run_start_timestamp=run_start_timestamp, + 
run_end_timestamp=run_end_timestamp, + ) + env.zenbenchmark.record_pg_bench_result(prefix, res) # -# Run a very short pgbench test. +# Initialize a pgbench database, and run pgbench against it. # -# Collects three metrics: +# This makes runs two different pgbench workloads against the same +# initialized database, and 'duration' is the time of each run. So +# the total runtime is 2 * duration, plus time needed to initialize +# the test database. # -# 1. Time to initialize the pgbench database (pgbench -s5 -i) -# 2. Time to run 5000 pgbench transactions -# 3. Disk space used -# -def test_pgbench(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +# Currently, the # of connections is hardcoded at 4 +def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType): + env.zenbenchmark.record("scale", scale, "", MetricReport.TEST_PARAM) - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('init'): - env.pg_bin.run_capture(['pgbench', '-s5', '-i', env.pg.connstr()]) - env.flush() + password = env.pg.default_options.get("password", None) + options = "-cstatement_timeout=1h " + env.pg.default_options.get("options", "") + # drop password from the connection string by passing password=None and set password separately + connstr = env.pg.connstr(password=None, options=options) - with env.record_duration('5000_xacts'): - env.pg_bin.run_capture(['pgbench', '-c1', '-t5000', env.pg.connstr()]) - env.flush() + if workload_type == PgBenchLoadType.INIT: + # Run initialize + init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) + + if workload_type == PgBenchLoadType.SIMPLE_UPDATE: + # Run simple-update workload + run_pgbench( + env, + "simple-update", + [ + "pgbench", + "-N", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + connstr, + ], + password=password, + ) + + if workload_type == PgBenchLoadType.SELECT_ONLY: + # Run SELECT workload + run_pgbench( + env, + "select-only", + [ + "pgbench", + "-S", + "-c4", + f"-T{duration}", + "-P2", + "--progress-timestamp", + connstr, + ], + password=password, + ) env.report_size() + + +def get_durations_matrix(default: int = 45) -> List[int]: + durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default)) + rv = [] + for d in durations.split(","): + d = d.strip().lower() + if d.endswith("h"): + duration = int(d.removesuffix("h")) * 60 * 60 + elif d.endswith("m"): + duration = int(d.removesuffix("m")) * 60 + else: + duration = int(d.removesuffix("s")) + rv.append(duration) + + return rv + + +def get_scales_matrix(default: int = 10) -> List[int]: + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default)) + rv = [] + for s in scales.split(","): + s = s.strip().lower() + if s.endswith("mb"): + scale = get_scale_for_db(int(s.removesuffix("mb"))) + elif s.endswith("gb"): + scale = get_scale_for_db(int(s.removesuffix("gb")) * 1024) + else: + scale = int(s) + rv.append(scale) + + return rv + + +# Run the pgbench tests against vanilla Postgres and neon +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.INIT) + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY) + + +# Run the pgbench tests, and generate a 
flamegraph from it +# This requires that the pageserver was built with the 'profiling' feature. +# +# TODO: If the profiling is cheap enough, there's no need to run the same test +# twice, with and without profiling. But for now, run it separately, so that we +# can see how much overhead the profiling adds. +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): + neon_env_builder.pageserver_config_override = """ +profiling="page_requests" +""" + env = neon_env_builder.init_start() + env.pageserver.is_profiling_enabled_or_skip() + env.neon_cli.create_branch("empty", "main") + + neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench") + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT) + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) + + +# The following 3 tests run on an existing database as it was set up by previous tests, +# and leaves the database in a state that would be used in the next tests. +# Modifying the definition order of these functions or adding other remote tests in between will alter results. +# See usage of --sparse-ordering flag in the pytest invocation in the CI workflow +# +# Run the pgbench tests against an existing Postgres cluster +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_init(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.INIT) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE) + + +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): + run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) diff --git a/test_runner/performance/test_perf_pgbench_remote.py b/test_runner/performance/test_perf_pgbench_remote.py deleted file mode 100644 index 28472a16c8..0000000000 --- a/test_runner/performance/test_perf_pgbench_remote.py +++ /dev/null @@ -1,124 +0,0 @@ -import dataclasses -import os -import subprocess -from typing import List -from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker -import pytest -from datetime import datetime -import calendar -import timeit -import os - - -def utc_now_timestamp() -> int: - return calendar.timegm(datetime.utcnow().utctimetuple()) - - -@dataclasses.dataclass -class PgBenchRunner: - connstr: str - scale: int - transactions: int - pgbench_bin_path: str = "pgbench" - - def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]': - res = subprocess.run([self.pgbench_bin_path, *args], text=True, capture_output=True) - - if res.returncode != 0: - raise RuntimeError(f"pgbench failed. 
stdout: {res.stdout} stderr: {res.stderr}") - return res - - def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]': - args = [] - if not vacuum: - args.append("--no-vacuum") - args.extend([f"--scale={self.scale}", "--initialize", self.connstr]) - return self.invoke(args) - - def run(self, jobs: int = 1, clients: int = 1): - return self.invoke([ - f"--transactions={self.transactions}", - f"--jobs={jobs}", - f"--client={clients}", - "--progress=2", # print progress every two seconds - self.connstr, - ]) - - -@pytest.fixture -def connstr(): - res = os.getenv("BENCHMARK_CONNSTR") - if res is None: - raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") - return res - - -def get_transactions_matrix(): - transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX") - if transactions is None: - return [10**4, 10**5] - return list(map(int, transactions.split(","))) - - -def get_scales_matrix(): - scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX") - if scales is None: - return [10, 20] - return list(map(int, scales.split(","))) - - -@pytest.mark.parametrize("scale", get_scales_matrix()) -@pytest.mark.parametrize("transactions", get_transactions_matrix()) -@pytest.mark.remote_cluster -def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker, - connstr: str, - scale: int, - transactions: int): - """ - The best way is to run same pack of tests both, for local zenith - and against staging, but currently local tests heavily depend on - things available only locally e.g. zenith binaries, pageserver api, etc. - Also separate test allows to run pgbench workload against vanilla postgres - or other systems that support postgres protocol. - - Also now this is more of a liveness test because it stresses pageserver internals, - so we clearly see what goes wrong in more "real" environment. 
- """ - pg_bin = os.getenv("PG_BIN") - if pg_bin is not None: - pgbench_bin_path = os.path.join(pg_bin, "pgbench") - else: - pgbench_bin_path = "pgbench" - - runner = PgBenchRunner( - connstr=connstr, - scale=scale, - transactions=transactions, - pgbench_bin_path=pgbench_bin_path, - ) - # calculate timestamps and durations separately - # timestamp is intended to be used for linking to grafana and logs - # duration is actually a metric and uses float instead of int for timestamp - init_start_timestamp = utc_now_timestamp() - t0 = timeit.default_timer() - runner.init() - init_duration = timeit.default_timer() - t0 - init_end_timestamp = utc_now_timestamp() - - run_start_timestamp = utc_now_timestamp() - t0 = timeit.default_timer() - out = runner.run() # TODO handle failures - run_duration = timeit.default_timer() - t0 - run_end_timestamp = utc_now_timestamp() - - res = PgBenchRunResult.parse_from_output( - out=out, - init_duration=init_duration, - init_start_timestamp=init_start_timestamp, - init_end_timestamp=init_end_timestamp, - run_duration=run_duration, - run_start_timestamp=run_start_timestamp, - run_end_timestamp=run_end_timestamp, - ) - - zenbenchmark.record_pg_bench_result(res) diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index b41f2f72a8..df766d52da 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,14 +1,9 @@ -import os -from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare -from fixtures.log_helper import log - -import psycopg2.extras import random -import time -from fixtures.utils import print_gc_result +from contextlib import closing + +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import PgCompare +from fixtures.utils import query_scalar # This is a clear-box test that demonstrates the worst case scenario for the @@ -17,14 +12,14 @@ from fixtures.utils import print_gc_result # A naive pageserver implementation would create a full image layer for each # dirty segment, leading to write_amplification = segment_size / page_size, # when compared to vanilla postgres. With segment_size = 10MB, that's 1250. -def test_random_writes(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_random_writes(neon_with_baseline: PgCompare): + env = neon_with_baseline # Number of rows in the test database. 1M rows runs quickly, but implies # a small effective_checkpoint_distance, which makes the test less realistic. # Using a 300 TB database would imply a 250 MB effective_checkpoint_distance, # but it will take a very long time to run. From what I've seen so far, - # increasing n_rows doesn't have impact on the (zenith_runtime / vanilla_runtime) + # increasing n_rows doesn't have impact on the (neon_runtime / vanilla_runtime) # performance ratio. 
n_rows = 1 * 1000 * 1000 # around 36 MB table @@ -42,36 +37,46 @@ def test_random_writes(zenith_with_baseline: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: # Create the test table - with env.record_duration('init'): - cur.execute(""" + with env.record_duration("init"): + cur.execute( + """ CREATE TABLE Big( pk integer primary key, count integer default 0 ); - """) - cur.execute(f"INSERT INTO Big (pk) values (generate_series(1,{n_rows}))") + """ + ) + + # Insert n_rows in batches to avoid query timeouts + rows_inserted = 0 + while rows_inserted < n_rows: + rows_to_insert = min(1000 * 1000, n_rows - rows_inserted) + low = rows_inserted + 1 + high = rows_inserted + rows_to_insert + cur.execute(f"INSERT INTO Big (pk) values (generate_series({low},{high}))") + rows_inserted += rows_to_insert # Get table size (can't be predicted because padding and alignment) - cur.execute("SELECT pg_relation_size('Big');") - row = cur.fetchone() - table_size = row[0] - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + table_size = query_scalar(cur, "SELECT pg_relation_size('Big')") + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) # Decide how much to write, based on knowledge of pageserver implementation. - # Avoiding segment collisions maximizes (zenith_runtime / vanilla_runtime). + # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime). segment_size = 10 * 1024 * 1024 n_segments = table_size // segment_size n_writes = load_factor * n_segments // 3 # The closer this is to 250 MB, the more realistic the test is. effective_checkpoint_distance = table_size * n_writes // n_rows - env.zenbenchmark.record("effective_checkpoint_distance", - effective_checkpoint_distance, - 'bytes', - MetricReport.TEST_PARAM) + env.zenbenchmark.record( + "effective_checkpoint_distance", + effective_checkpoint_distance, + "bytes", + MetricReport.TEST_PARAM, + ) # Update random keys - with env.record_duration('run'): + with env.record_duration("run"): for it in range(n_iterations): for i in range(n_writes): key = random.randint(1, n_rows) diff --git a/test_runner/performance/test_read_trace.py b/test_runner/performance/test_read_trace.py new file mode 100644 index 0000000000..a5bd0b8de6 --- /dev/null +++ b/test_runner/performance/test_read_trace.py @@ -0,0 +1,31 @@ +from contextlib import closing + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# This test demonstrates how to collect a read trace. It's useful until +# it gets replaced by a test that actually does stuff with the trace. 
+def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant, _ = env.neon_cli.create_tenant( + conf={ + "trace_read_requests": "true", + } + ) + + timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant) + pg = env.postgres.create_start("test_trace_replay", "main", tenant) + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t (i integer);") + cur.execute(f"insert into t values (generate_series(1,{10000}));") + cur.execute("select count(*) from t;") + + # Stop pg so we drop the connection and flush the traces + pg.stop() + + trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline) + assert trace_path.exists() diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py new file mode 100644 index 0000000000..c681c50ff5 --- /dev/null +++ b/test_runner/performance/test_seqscans.py @@ -0,0 +1,50 @@ +# Test sequential scan speed +# +from contextlib import closing + +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.compare_fixtures import PgCompare +from fixtures.log_helper import log + + +@pytest.mark.parametrize( + "rows,iters,workers", + [ + # The test table is large enough (3-4 MB) that it doesn't fit in the compute node + # cache, so the seqscans go to the page server. But small enough that it fits + # into memory in the page server. + pytest.param(100000, 100, 0), + # Also test with a larger table, with and without parallelism + pytest.param(10000000, 1, 0), + pytest.param(10000000, 1, 4), + ], +) +def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int): + env = neon_with_baseline + + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t (i integer);") + cur.execute(f"insert into t values (generate_series(1,{rows}));") + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + shared_buffers = row[0] + table_size = row[1] + log.info(f"shared_buffers is {shared_buffers}, table size {table_size}") + assert int(shared_buffers) < int(table_size) + env.zenbenchmark.record("table_size", table_size, "bytes", MetricReport.TEST_PARAM) + + cur.execute(f"set max_parallel_workers_per_gather = {workers}") + + with env.record_duration("run"): + for i in range(iters): + cur.execute("select count(*) from t;") diff --git a/test_runner/performance/test_small_seqscans.py b/test_runner/performance/test_small_seqscans.py deleted file mode 100644 index b98018ad97..0000000000 --- a/test_runner/performance/test_small_seqscans.py +++ /dev/null @@ -1,41 +0,0 @@ -# Test sequential scan speed -# -# The test table is large enough (3-4 MB) that it doesn't fit in the compute node -# cache, so the seqscans go to the page server. But small enough that it fits -# into memory in the page server. 
-from contextlib import closing -from dataclasses import dataclass -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare -import pytest - - -@pytest.mark.parametrize('rows', [ - pytest.param(100000), - pytest.param(1000000, marks=pytest.mark.slow), -]) -def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int): - env = zenith_with_baseline - - with closing(env.pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('create table t (i integer);') - cur.execute(f'insert into t values (generate_series(1,{rows}));') - - # Verify that the table is larger than shared_buffers - cur.execute(''' - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize - from pg_settings where name = 'shared_buffers' - ''') - row = cur.fetchone() - shared_buffers = row[0] - table_size = row[1] - log.info(f"shared_buffers is {shared_buffers}, table size {table_size}") - assert int(shared_buffers) < int(table_size) - env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) - - with env.record_duration('run'): - for i in range(1000): - cur.execute('select count(*) from t;') diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py new file mode 100644 index 0000000000..e91b180154 --- /dev/null +++ b/test_runner/performance/test_startup.py @@ -0,0 +1,51 @@ +from contextlib import closing + +import pytest +from fixtures.benchmark_fixture import NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder + + +# This test sometimes runs for longer than the global 5 minute timeout. +@pytest.mark.timeout(600) +def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + # Start + env.neon_cli.create_branch("test_startup") + with zenbenchmark.record_duration("startup_time"): + pg = env.postgres.create_start("test_startup") + pg.safe_psql("select 1;") + + # Restart + pg.stop_and_destroy() + with zenbenchmark.record_duration("restart_time"): + pg.create_start("test_startup") + pg.safe_psql("select 1;") + + # Fill up + num_rows = 1000000 # 30 MB + num_tables = 100 + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + for i in range(num_tables): + cur.execute(f"create table t_{i} (i integer);") + cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));") + + # Read + with zenbenchmark.record_duration("read_time"): + pg.safe_psql("select * from t_0;") + + # Read again + with zenbenchmark.record_duration("second_read_time"): + pg.safe_psql("select * from t_0;") + + # Restart + pg.stop_and_destroy() + with zenbenchmark.record_duration("restart_with_data"): + pg.create_start("test_startup") + pg.safe_psql("select 1;") + + # Read + with zenbenchmark.record_duration("read_after_restart"): + pg.safe_psql("select * from t_0;") diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py new file mode 100644 index 0000000000..dd840acd25 --- /dev/null +++ b/test_runner/performance/test_wal_backpressure.py @@ -0,0 +1,285 @@ +import statistics +import threading +import time +import timeit +from typing import Any, Callable, List + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import NeonCompare, PgCompare, 
VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.types import Lsn +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix + + +@pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) +# This fixture constructs multiple `PgCompare` interfaces using a builder pattern. +# The builder parameters are encoded in the fixture's param. +# For example, to build a `NeonCompare` interface, the corresponding fixture's param should have +# a format of `neon_{safekeepers_enable_fsync}`. +# Note that, here "_" is used to separate builder parameters. +def pg_compare(request) -> PgCompare: + x = request.param.split("_") + + if x[0] == "vanilla": + # `VanillaCompare` interface + fixture = request.getfixturevalue("vanilla_compare") + assert isinstance(fixture, VanillaCompare) + + return fixture + else: + assert ( + len(x) == 2 + ), f"request param ({request.param}) should have a format of \ + `neon_{{safekeepers_enable_fsync}}`" + + # `NeonCompare` interface + neon_env_builder = request.getfixturevalue("neon_env_builder") + assert isinstance(neon_env_builder, NeonEnvBuilder) + + zenbenchmark = request.getfixturevalue("zenbenchmark") + assert isinstance(zenbenchmark, NeonBenchmarker) + + pg_bin = request.getfixturevalue("pg_bin") + assert isinstance(pg_bin, PgBin) + + neon_env_builder.safekeepers_enable_fsync = x[1] == "on" + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) + + branch_name = request.node.name + return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + + +def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): + """Start an intensive write workload across multiple tables. + + ## Single table workload: + At each step, insert new `new_rows_each_update` rows. + The variable `new_rows_each_update` is equal to `scale * 100_000`. 
+ The number of steps is determined by `num_iters` variable.""" + new_rows_each_update = scale * 100_000 + + def start_single_table_workload(table_id: int): + for _ in range(num_iters): + with env.pg.connect().cursor() as cur: + cur.execute( + f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})" + ) + + with env.record_duration("run_duration"): + threads = [ + threading.Thread(target=start_single_table_workload, args=(i,)) for i in range(n_tables) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("n_tables", [5]) +@pytest.mark.parametrize("scale", get_scales_matrix(5)) +@pytest.mark.parametrize("num_iters", [10]) +def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, num_iters: int): + env = pg_compare + + # Initializes test tables + with env.pg.connect().cursor() as cur: + for i in range(n_tables): + cur.execute( + f"CREATE TABLE t{i}(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + cur.execute(f"INSERT INTO t{i} (key) VALUES (0)") + + workload_thread = threading.Thread( + target=start_heavy_write_workload, args=(env, n_tables, scale, num_iters) + ) + workload_thread.start() + + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) + record_thread.start() + + record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT * from t0 where key = 0") + workload_thread.join() + record_thread.join() + + +def start_pgbench_simple_update_workload(env: PgCompare, duration: int): + with env.record_duration("run_duration"): + env.pg_bin.run_capture( + [ + "pgbench", + "-j10", + "-c10", + "-N", + f"-T{duration}", + env.pg.connstr(options="-csynchronous_commit=off"), + ] + ) + env.flush() + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("scale", get_scales_matrix(100)) +@pytest.mark.parametrize("duration", get_durations_matrix()) +def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int): + env = pg_compare + + # initialize pgbench tables + env.pg_bin.run_capture(["pgbench", f"-s{scale}", "-i", env.pg.connstr()]) + env.flush() + + workload_thread = threading.Thread( + target=start_pgbench_simple_update_workload, args=(env, duration) + ) + workload_thread.start() + + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: workload_thread.is_alive()) + ) + record_thread.start() + + record_read_latency( + env, lambda: workload_thread.is_alive(), "SELECT * from pgbench_accounts where aid = 1" + ) + workload_thread.join() + record_thread.join() + + +def start_pgbench_intensive_initialization(env: PgCompare, scale: int, done_event: threading.Event): + with env.record_duration("run_duration"): + # Needs to increase the statement timeout (default: 120s) because the + # initialization step can be slow with a large scale. 
+ env.pg_bin.run_capture( + [ + "pgbench", + f"-s{scale}", + "-i", + "-Idtg", + env.pg.connstr(options="-cstatement_timeout=600s"), + ] + ) + + done_event.set() + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("scale", get_scales_matrix(1000)) +def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): + env = pg_compare + with env.pg.connect().cursor() as cur: + cur.execute("CREATE TABLE foo as select generate_series(1,100000)") + + workload_done_event = threading.Event() + + workload_thread = threading.Thread( + target=start_pgbench_intensive_initialization, args=(env, scale, workload_done_event) + ) + workload_thread.start() + + record_thread = threading.Thread( + target=record_lsn_write_lag, args=(env, lambda: not workload_done_event.is_set()) + ) + record_thread.start() + + record_read_latency(env, lambda: not workload_done_event.is_set(), "SELECT count(*) from foo") + workload_thread.join() + record_thread.join() + + +def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_interval: float = 1.0): + if not isinstance(env, NeonCompare): + return + + lsn_write_lags: List[Any] = [] + last_received_lsn = Lsn(0) + last_pg_flush_lsn = Lsn(0) + + with env.pg.connect().cursor() as cur: + cur.execute("CREATE EXTENSION neon") + + while run_cond(): + cur.execute( + """ + select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn), + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn)), + pg_current_wal_flush_lsn(), + received_lsn + from backpressure_lsns(); + """ + ) + + res = cur.fetchone() + assert isinstance(res, tuple) + lsn_write_lags.append(res[0]) + + curr_received_lsn = Lsn(res[3]) + lsn_process_speed = (curr_received_lsn - last_received_lsn) / (1024**2) + last_received_lsn = curr_received_lsn + + curr_pg_flush_lsn = Lsn(res[2]) + lsn_produce_speed = (curr_pg_flush_lsn - last_pg_flush_lsn) / (1024**2) + last_pg_flush_lsn = curr_pg_flush_lsn + + log.info( + f"received_lsn_lag={res[1]}, pg_flush_lsn={res[2]}, received_lsn={res[3]}, lsn_process_speed={lsn_process_speed:.2f}MB/s, lsn_produce_speed={lsn_produce_speed:.2f}MB/s" + ) + + time.sleep(pool_interval) + + env.zenbenchmark.record( + "lsn_write_lag_max", + float(max(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_avg", + float(statistics.mean(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + env.zenbenchmark.record( + "lsn_write_lag_stdev", + float(statistics.stdev(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER, + ) + + +def record_read_latency( + env: PgCompare, run_cond: Callable[[], bool], read_query: str, read_interval: float = 1.0 +): + read_latencies = [] + + with env.pg.connect().cursor() as cur: + while run_cond(): + try: + t1 = timeit.default_timer() + cur.execute(read_query) + t2 = timeit.default_timer() + + log.info( + f"Executed read query {read_query}, got {cur.fetchall()}, read time {t2-t1:.2f}s" + ) + read_latencies.append(t2 - t1) + except Exception as err: + log.error(f"Got error when executing the read query: {err}") + + time.sleep(read_interval) + + env.zenbenchmark.record( + "read_latency_max", max(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_avg", statistics.mean(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) + env.zenbenchmark.record( + "read_latency_stdev", statistics.stdev(read_latencies), "s", MetricReport.LOWER_IS_BETTER + ) diff --git 
a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 49232bf6d3..30c217e392 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -10,31 +10,30 @@ # in LSN order, writing the oldest layer first. That creates a new 10 MB image # layer to be created for each of those small updates. This is the Write # Amplification problem at its finest. -import os from contextlib import closing -from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare -from fixtures.log_helper import log + +from fixtures.compare_fixtures import PgCompare -def test_write_amplification(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_write_amplification(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: - with env.record_pageserver_writes('pageserver_writes'): - with env.record_duration('run'): + with env.record_pageserver_writes("pageserver_writes"): + with env.record_duration("run"): # NOTE: Because each iteration updates every table already created, # the runtime and write amplification is O(n^2), where n is the # number of iterations. for i in range(25): - cur.execute(f''' + cur.execute( + f""" CREATE TABLE tbl{i} AS SELECT g as i, 'long string to consume some space' || g as t FROM generate_series(1, 100000) g - ''') + """ + ) cur.execute(f"create index on tbl{i} (i);") for j in range(1, i): cur.execute(f"delete from tbl{j} where i = {i}") diff --git a/test_runner/pg_clients/csharp/npgsql/.dockerignore b/test_runner/pg_clients/csharp/npgsql/.dockerignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.dockerignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/.gitignore b/test_runner/pg_clients/csharp/npgsql/.gitignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile new file mode 100644 index 0000000000..a78bc2f3bc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -0,0 +1,14 @@ +FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build +WORKDIR /source + +COPY *.csproj . +RUN dotnet restore + +COPY . . +RUN dotnet publish -c release -o /app --no-restore + +FROM mcr.microsoft.com/dotnet/runtime:6.0 +WORKDIR /app +COPY --from=build /app . 
+ +ENTRYPOINT ["dotnet", "csharp-npgsql.dll"] diff --git a/test_runner/pg_clients/csharp/npgsql/Program.cs b/test_runner/pg_clients/csharp/npgsql/Program.cs new file mode 100644 index 0000000000..17c2d5b81d --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Program.cs @@ -0,0 +1,19 @@ +using Npgsql; + +var host = Environment.GetEnvironmentVariable("NEON_HOST"); +var database = Environment.GetEnvironmentVariable("NEON_DATABASE"); +var user = Environment.GetEnvironmentVariable("NEON_USER"); +var password = Environment.GetEnvironmentVariable("NEON_PASSWORD"); + +var connString = $"Host={host};Username={user};Password={password};Database={database}"; + +await using var conn = new NpgsqlConnection(connString); +await conn.OpenAsync(); + +await using (var cmd = new NpgsqlCommand("SELECT 1", conn)) +await using (var reader = await cmd.ExecuteReaderAsync()) +{ + while (await reader.ReadAsync()) + Console.WriteLine(reader.GetInt32(0)); +} +await conn.CloseAsync(); diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj new file mode 100644 index 0000000000..7c1f90c1fc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -0,0 +1,14 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/.gitignore @@ -0,0 +1 @@ + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile new file mode 100644 index 0000000000..daad99c3a1 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -0,0 +1,10 @@ +FROM openjdk:17 +WORKDIR /source + +COPY . . + +WORKDIR /app +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.4.0.jar && \ + javac -d /app /source/Example.java + +CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/java/jdbc/Example.java b/test_runner/pg_clients/java/jdbc/Example.java new file mode 100644 index 0000000000..410a971649 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/Example.java @@ -0,0 +1,31 @@ +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.Properties; + +public class Example +{ + public static void main( String[] args ) throws Exception + { + String host = System.getenv("NEON_HOST"); + String database = System.getenv("NEON_DATABASE"); + String user = System.getenv("NEON_USER"); + String password = System.getenv("NEON_PASSWORD"); + + String url = "jdbc:postgresql://%s/%s".formatted(host, database); + Properties props = new Properties(); + props.setProperty("user", user); + props.setProperty("password", password); + + Connection conn = DriverManager.getConnection(url, props); + Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery("SELECT 1"); + while (rs.next()) + { + System.out.println(rs.getString(1)); + } + rs.close(); + st.close(); + } +} diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile new file mode 100644 index 0000000000..10662f92d5 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . 
+ +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "asyncpg_example.py"] diff --git a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py new file mode 100755 index 0000000000..4d9dfb09c1 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py @@ -0,0 +1,29 @@ +#! /usr/bin/env python3 + +import asyncio +import os + +import asyncpg + + +async def run(**kwargs) -> asyncpg.Record: + conn = await asyncpg.connect( + **kwargs, + statement_cache_size=0, # Prepared statements doesn't work pgbouncer + ) + rv = await conn.fetchrow("SELECT 1") + await conn.close() + + return rv + + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + + row = asyncio.run(run(**kwargs)) + + print(row[0]) diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt new file mode 100644 index 0000000000..edc57ecc81 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -0,0 +1 @@ +asyncpg==0.25.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile new file mode 100644 index 0000000000..eddf64df5b --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . + +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "pg8000_example.py"] diff --git a/test_runner/pg_clients/python/pg8000/README.md b/test_runner/pg_clients/python/pg8000/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py new file mode 100755 index 0000000000..b1d77af5bb --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -0,0 +1,22 @@ +#! 
/usr/bin/env python3 + +import os + +import pg8000.dbapi + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + conn = pg8000.dbapi.connect( + **kwargs, + ssl_context=True, + ) + + cursor = conn.cursor() + cursor.execute("SELECT 1") + row = cursor.fetchone() + print(row[0]) + conn.close() diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt new file mode 100644 index 0000000000..1577712150 --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -0,0 +1 @@ +pg8000==1.29.1 diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile new file mode 100644 index 0000000000..8f9477bd6a --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -0,0 +1,11 @@ +FROM swift:5.6 AS build +RUN apt-get -q update && apt-get -q install -y libssl-dev +WORKDIR /source + +COPY . . +RUN swift build --configuration release + +FROM swift:5.6 +WORKDIR /app +COPY --from=build /source/.build/release/release . 
+CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved new file mode 100644 index 0000000000..478e31000e --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -0,0 +1,41 @@ +{ + "pins" : [ + { + "identity" : "bluesocket", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSocket.git", + "state" : { + "revision" : "dd924c3bc2c1c144c42b8dda3896f1a03115ded4", + "version" : "2.0.2" + } + }, + { + "identity" : "bluesslservice", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSSLService", + "state" : { + "revision" : "c249988fb748749739144e7f554710552acdc0bd", + "version" : "2.0.1" + } + }, + { + "identity" : "postgresclientkit", + "kind" : "remoteSourceControl", + "location" : "https://github.com/codewinsdotcom/PostgresClientKit.git", + "state" : { + "branch" : "v1.4.3", + "revision" : "beafedaea6dc9f04712e9a8547b77f47c406a47e" + } + }, + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser", + "state" : { + "revision" : "6b2aa2748a7881eebb9f84fb10c01293e15b52ca", + "version" : "0.5.0" + } + } + ], + "version" : 2 +} diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift new file mode 100644 index 0000000000..0d40b28572 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -0,0 +1,17 @@ +// swift-tools-version:5.6 +import PackageDescription + +let package = Package( + name: "PostgresClientKitExample", + dependencies: [ + .package( + url: "https://github.com/codewinsdotcom/PostgresClientKit.git", + revision: "v1.4.3" + ) + ], + targets: [ + .target( + name: "PostgresClientKitExample", + dependencies: [ "PostgresClientKit" ]) + ] +) diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift new file mode 100644 index 0000000000..c7518dd88c --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift @@ -0,0 +1,38 @@ +import Foundation + +import PostgresClientKit + +do { + var configuration = PostgresClientKit.ConnectionConfiguration() + + let env = ProcessInfo.processInfo.environment + if let host = env["NEON_HOST"] { + configuration.host = host + } + if let database = env["NEON_DATABASE"] { + configuration.database = database + } + if let user = env["NEON_USER"] { + configuration.user = user + } + if let password = env["NEON_PASSWORD"] { + configuration.credential = .scramSHA256(password: password) + } + + let connection = try PostgresClientKit.Connection(configuration: configuration) + defer { connection.close() } + + let text = "SELECT 1;" + let statement = try connection.prepareStatement(text: text) + defer { statement.close() } + + let cursor = try statement.execute(parameterValues: [ ]) + defer { cursor.close() } + + for row in cursor { + let columns = try row.get().columns + print(columns[0]) + } +} catch { + print(error) +} diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py new file mode 100644 index 0000000000..6ffe3bf918 --- /dev/null +++ 
b/test_runner/pg_clients/test_pg_clients.py @@ -0,0 +1,54 @@ +import shutil +from pathlib import Path +from tempfile import NamedTemporaryFile + +import pytest +from fixtures.neon_fixtures import RemotePostgres +from fixtures.utils import subprocess_capture + + +@pytest.mark.remote_cluster +@pytest.mark.parametrize( + "client", + [ + "csharp/npgsql", + "java/jdbc", + "python/asyncpg", + pytest.param( + "python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281 + marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way"), + ), + pytest.param( + "swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 + marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"), + ), + "typescript/postgresql-client", + ], +) +def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str): + conn_options = remote_pg.conn_options() + + env_file = None + with NamedTemporaryFile(mode="w", delete=False) as f: + env_file = f.name + f.write( + f""" + NEON_HOST={conn_options["host"]} + NEON_DATABASE={conn_options["dbname"]} + NEON_USER={conn_options["user"]} + NEON_PASSWORD={conn_options["password"]} + """ + ) + + image_tag = client.lower() + docker_bin = shutil.which("docker") + if docker_bin is None: + raise RuntimeError("docker is required for running this test") + + build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"] + subprocess_capture(test_output_dir, build_cmd, check=True) + + run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] + basepath = subprocess_capture(test_output_dir, run_cmd, check=True) + + assert Path(f"{basepath}.stdout").read_text().strip() == "1" diff --git a/test_runner/pg_clients/typescript/postgresql-client/.dockerignore b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/.gitignore b/test_runner/pg_clients/typescript/postgresql-client/.gitignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile new file mode 100644 index 0000000000..b57147503f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -0,0 +1,7 @@ +FROM node:16 +WORKDIR /source + +COPY . . +RUN npm clean-install + +CMD ["/source/index.js"] \ No newline at end of file diff --git a/test_runner/pg_clients/typescript/postgresql-client/index.js b/test_runner/pg_clients/typescript/postgresql-client/index.js new file mode 100755 index 0000000000..af4899baab --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/index.js @@ -0,0 +1,25 @@ +#! 
/usr/bin/env node + +import {Connection} from 'postgresql-client'; + +const params = { + "host": process.env.NEON_HOST, + "database": process.env.NEON_DATABASE, + "user": process.env.NEON_USER, + "password": process.env.NEON_PASSWORD, + "ssl": true, +} +for (const key in params) { + if (params[key] === undefined) { + delete params[key]; + } +} + +const connection = new Connection(params); +await connection.connect(); +const result = await connection.query( + 'select 1' +); +const rows = result.rows; +await connection.close(); +console.log(rows[0][0]); diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json new file mode 100644 index 0000000000..bb5b4a1378 --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -0,0 +1,262 @@ +{ + "name": "typescript", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "dependencies": { + "postgresql-client": "^2.1.3" + } + }, + "node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "dependencies": { + "doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + }, + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "node_modules/postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "dependencies": { + "obuf": "~1.1.2" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/power-tasks": { + "version": "0.8.0", + "resolved": 
"https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/putil-merge": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "dependencies": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + }, + "engines": { + "node": ">=14.0" + } + }, + "node_modules/ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "peerDependencies": { + "typescript": ">=4.0.0" + } + }, + "node_modules/typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + } + }, + "dependencies": { + "debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "requires": { + "ms": "2.1.2" + } + }, + "doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==" + }, + "lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "requires": { + "doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + } + }, + "ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": 
"sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "requires": { + "obuf": "~1.1.2" + } + }, + "postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + } + }, + "power-tasks": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + } + }, + "putil-merge": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==" + }, + "putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==" + }, + "putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==" + }, + "strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "requires": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + } + }, + "ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "requires": {} + }, + "typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true + } + } +} diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json new file mode 100644 index 0000000000..5d8ca23a7f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -0,0 +1,6 @@ +{ + "type": "module", + "dependencies": { + "postgresql-client": "^2.1.3" + } +} diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py new file mode 100644 index 0000000000..d7aebfb938 --- /dev/null +++ b/test_runner/regress/test_ancestor_branch.py @@ -0,0 +1,104 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import 
TimelineId +from fixtures.utils import query_scalar + + +# +# Create ancestor branches off the main branch. +# +def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Override defaults, 1M gc_horizon and 4M checkpoint_distance. + # Extend compaction_period and gc_period to disable background compaction and gc. + tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "10 m", + "gc_horizon": "1048576", + "checkpoint_distance": "4194304", + "compaction_period": "10 m", + "compaction_threshold": "2", + "compaction_target_size": "4194304", + } + ) + + pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)")) + + pg_branch0 = env.postgres.create_start("main", tenant_id=tenant) + branch0_cur = pg_branch0.connect().cursor() + branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id")) + log.info(f"b0 timeline {branch0_timeline}") + + # Create table, and insert 100k rows. + branch0_lsn = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"b0 at lsn {branch0_lsn}") + + branch0_cur.execute("CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)") + branch0_cur.execute( + """ + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch0:' || g + FROM generate_series(1, 100000) g + """ + ) + lsn_100 = query_scalar(branch0_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 100k rows: {lsn_100}") + + # Create branch1. + env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) + pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant) + log.info("postgres is running on 'branch1' branch") + + branch1_cur = pg_branch1.connect().cursor() + branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) + log.info(f"b1 timeline {branch1_timeline}") + + branch1_lsn = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"b1 at lsn {branch1_lsn}") + + # Insert 100k rows. + branch1_cur.execute( + """ + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch1:' || g + FROM generate_series(1, 100000) g + """ + ) + lsn_200 = query_scalar(branch1_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 200k rows: {lsn_200}") + + # Create branch2. + env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) + pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant) + log.info("postgres is running on 'branch2' branch") + branch2_cur = pg_branch2.connect().cursor() + + branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) + log.info(f"b2 timeline {branch2_timeline}") + + branch2_lsn = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"b2 at lsn {branch2_lsn}") + + # Insert 100k rows. + branch2_cur.execute( + """ + INSERT INTO foo + SELECT '00112233445566778899AABBCCDDEEFF' || ':branch2:' || g + FROM generate_series(1, 100000) g + """ + ) + lsn_300 = query_scalar(branch2_cur, "SELECT pg_current_wal_insert_lsn()") + log.info(f"LSN after 300k rows: {lsn_300}") + + # Run compaction on branch1. 
+ compact = f"compact {tenant} {branch1_timeline}" + log.info(compact) + pageserver_http.timeline_compact(tenant, branch1_timeline) + + assert query_scalar(branch0_cur, "SELECT count(*) FROM foo") == 100000 + + assert query_scalar(branch1_cur, "SELECT count(*) FROM foo") == 200000 + + assert query_scalar(branch2_cur, "SELECT count(*) FROM foo") == 300000 diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py new file mode 100644 index 0000000000..8443aa029f --- /dev/null +++ b/test_runner/regress/test_auth.py @@ -0,0 +1,75 @@ +from contextlib import closing + +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException +from fixtures.types import TenantId + + +def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + ps = env.pageserver + + tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) + tenant_http_client = env.pageserver.http_client(tenant_token) + invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate()) + invalid_tenant_http_client = env.pageserver.http_client(invalid_tenant_token) + + management_token = env.auth_keys.generate_management_token() + management_http_client = env.pageserver.http_client(management_token) + + # this does not invoke auth check and only decodes jwt and checks it for validity + # check both tokens + ps.safe_psql("set FOO", password=tenant_token) + ps.safe_psql("set FOO", password=management_token) + + new_timeline_id = env.neon_cli.create_branch( + "test_pageserver_auth", tenant_id=env.initial_tenant + ) + + # tenant can create branches + tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) + # console can create branches for tenant + management_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) + + # fail to create branch using token with different tenant_id + with pytest.raises( + PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied" + ): + invalid_tenant_http_client.timeline_create( + tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id + ) + + # create tenant using management token + management_http_client.tenant_create() + + # fail to create tenant using tenant token + with pytest.raises( + PageserverApiException, + match="Forbidden: Attempt to access management api with tenant scope. 
Permission denied", + ): + tenant_http_client.tenant_create() + + +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + branch = "test_compute_auth_to_pageserver" + env.neon_cli.create_branch(branch) + pg = env.postgres.create_start(branch) + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/regress/test_backpressure.py similarity index 71% rename from test_runner/batch_others/test_backpressure.py rename to test_runner/regress/test_backpressure.py index 23af5b90ed..a81fa380a9 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -1,14 +1,13 @@ -from contextlib import closing, contextmanager -import psycopg2.extras -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log -import os -import time -import asyncpg -from fixtures.zenith_fixtures import Postgres import threading +import time +from contextlib import closing, contextmanager -pytest_plugins = ("fixtures.zenith_fixtures") +import psycopg2.extras +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, Postgres + +pytest_plugins = "fixtures.neon_fixtures" @contextmanager @@ -25,7 +24,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info("checks started") with pg_cur(pg) as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? 
cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") res = cur.fetchone() @@ -45,7 +44,8 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv with pg_cur(pg) as cur: while not stop_event.is_set(): try: - cur.execute(''' + cur.execute( + """ select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn) as received_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn) as disk_consistent_lsn_lag, pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn) as remote_consistent_lsn_lag, @@ -53,16 +53,19 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),disk_consistent_lsn)), pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),remote_consistent_lsn)) from backpressure_lsns(); - ''') + """ + ) res = cur.fetchone() received_lsn_lag = res[0] disk_consistent_lsn_lag = res[1] remote_consistent_lsn_lag = res[2] - log.info(f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " - f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " - f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})") + log.info( + f"received_lsn_lag = {received_lsn_lag} ({res[3]}), " + f"disk_consistent_lsn_lag = {disk_consistent_lsn_lag} ({res[4]}), " + f"remote_consistent_lsn_lag = {remote_consistent_lsn_lag} ({res[5]})" + ) # Since feedback from pageserver is not immediate, we should allow some lag overflow lag_overflow = 5 * 1024 * 1024 # 5MB @@ -72,7 +75,9 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv if max_replication_flush_lag_bytes > 0: assert disk_consistent_lsn_lag < max_replication_flush_lag_bytes + lag_overflow if max_replication_apply_lag_bytes > 0: - assert remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + assert ( + remote_consistent_lsn_lag < max_replication_apply_lag_bytes + lag_overflow + ) time.sleep(polling_interval) @@ -80,7 +85,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info(f"backpressure check query failed: {e}") stop_event.set() - log.info('check thread stopped') + log.info("check thread stopped") # This test illustrates how to tune backpressure to control the lag @@ -91,14 +96,15 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv # If backpressure is enabled and tuned properly, insertion will be throttled, but the query will not timeout. 
-def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") +def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch("test_backpressure", "main") + env.neon_cli.create_branch("test_backpressure") - pg = env.postgres.create_start('test_backpressure', - config_lines=['max_replication_write_lag=30MB']) + pg = env.postgres.create_start( + "test_backpressure", config_lines=["max_replication_write_lag=30MB"] + ) log.info("postgres is running on 'test_backpressure' branch") # setup check thread @@ -132,23 +138,29 @@ def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): rows_inserted += 100000 except Exception as e: if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - assert False, f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" + assert ( + False + ), f"Exception {e} while inserting rows, but WAL lag is within configured threshold. That means backpressure is not tuned properly" else: - assert False, f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), f"Exception {e} while inserting rows and WAL lag overflowed configured threshold. That means backpressure doesn't work." log.info(f"inserted {rows_inserted} rows") if check_thread.is_alive(): - log.info('stopping check thread') + log.info("stopping check thread") check_stop_event.set() check_thread.join() - log.info('check thread stopped') + log.info("check thread stopped") else: - assert False, "WAL lag overflowed configured threshold. That means backpressure doesn't work." + assert ( + False + ), "WAL lag overflowed configured threshold. That means backpressure doesn't work." -#TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings -#TODO test_backpressure_remote_consistent_lsn_lag +# TODO test_backpressure_disk_consistent_lsn_lag. Play with pageserver's checkpoint settings +# TODO test_backpressure_remote_consistent_lsn_lag diff --git a/test_runner/regress/test_basebackup_error.py b/test_runner/regress/test_basebackup_error.py new file mode 100644 index 0000000000..94d3999d17 --- /dev/null +++ b/test_runner/regress/test_basebackup_error.py @@ -0,0 +1,18 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv + + +# +# Test error handling, if the 'basebackup' command fails in the middle +# of building the tar archive. 
+# +def test_basebackup_error(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_basebackup_error", "empty") + pageserver_http = env.pageserver.http_client() + + # Introduce failpoint + pageserver_http.configure_failpoints(("basebackup-before-control-file", "return")) + + with pytest.raises(Exception, match="basebackup-before-control-file"): + env.postgres.create_start("test_basebackup_error") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py new file mode 100644 index 0000000000..12debe50eb --- /dev/null +++ b/test_runner/regress/test_branch_and_gc.py @@ -0,0 +1,170 @@ +import threading +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +# Test the GC implementation when running with branching. +# This test reproduces the issue https://github.com/neondatabase/neon/issues/707. +# +# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows: +# ... +# p -> has an image layer xx_p with p < lsn1 +# ... +# lsn1 +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# Consider running a GC iteration such that the GC horizon is between p and lsn1 +# ... +# p -> has an image layer xx_p with p < lsn1 +# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end') +# ... +# GC_h -> is a gc horizon such that p < GC_h < lsn1 +# ... +# lsn1 +# ... +# D_end -> is a delta layer D's end +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# As described in the issue #707, the image layer xx_p will be deleted as +# its range is below the GC horizon and there exists a newer image layer yy_q (q > p). +# However, removing xx_p will corrupt any delta layers that depend on xx_p that +# are not deleted by GC. For example, the delta layer D is corrupted in the +# above example because D depends on the image layer xx_p for value reconstruction. +# +# Because the delta layer D covering lsn1 is corrupted, creating a branch +# starting from lsn1 should return an error as follows: +# could not find data for key ... at LSN ..., for request at LSN ... 
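The retention rule implied by the layer diagram above can be condensed into a toy model: an image layer below the GC horizon may only be dropped if no retained delta layer still needs it for value reconstruction. The sketch below is a deliberate simplification (plain dataclasses, a single key space), not the pageserver's GC code; it just makes the issue #707 scenario executable.

```python
# Toy model of the retention rule discussed above. Layers are bare records;
# the real pageserver tracks far more state than this.
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class ImageLayer:
    lsn: int  # LSN at which this image layer materializes the full page set


@dataclass(frozen=True)
class DeltaLayer:
    start_lsn: int  # reconstruction starts from the newest image at or below this
    end_lsn: int


def removable_images(
    images: List[ImageLayer], deltas: List[DeltaLayer], gc_horizon: int
) -> List[ImageLayer]:
    # A delta layer is retained if it ends at or above the horizon.
    retained = [d for d in deltas if d.end_lsn >= gc_horizon]
    removable = []
    for img in images:
        if img.lsn >= gc_horizon:
            continue  # inside the horizon, never a GC candidate
        has_newer_image = any(other.lsn > img.lsn for other in images)
        # Does some retained delta reconstruct on top of *this* image, i.e. is
        # `img` the newest image at or below that delta's start?
        needed = any(
            d.start_lsn >= img.lsn
            and not any(img.lsn < other.lsn <= d.start_lsn for other in images)
            for d in retained
        )
        if has_newer_image and not needed:
            removable.append(img)
    return removable


# The scenario from the comment above: xx_p below the horizon, yy_q above it,
# and delta D straddling the horizon. xx_p must NOT be removable, because D
# still reconstructs values on top of it.
xx_p, yy_q = ImageLayer(lsn=100), ImageLayer(lsn=300)
delta_d = DeltaLayer(start_lsn=120, end_lsn=250)
assert removable_images([xx_p, yy_q], [delta_d], gc_horizon=200) == []
```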
+def test_branch_and_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1024 ** 2}", + # set the target size to be large to allow the image layer to cover the whole key space + "compaction_target_size": f"{1024 ** 3}", + # tweak the default settings to allow quickly create image layers and L1 layers + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "1 s", + } + ) + + timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) + pg_main = env.postgres.create_start("test_main", tenant_id=tenant) + + main_cur = pg_main.connect().cursor() + + main_cur.execute( + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + lsn1 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + log.info(f"LSN1: {lsn1}") + + main_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + lsn2 = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + log.info(f"LSN2: {lsn2}") + + # Set the GC horizon so that lsn1 is inside the horizon, which means + # we can create a new branch starting from lsn1. + pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) + + env.neon_cli.create_branch( + "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 + ) + pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant) + + branch_cur = pg_branch.connect().cursor() + branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)") + + assert query_scalar(branch_cur, "SELECT count(*) FROM foo") == 200000 + + +# This test simulates a race condition happening when branch creation and GC are performed concurrently. +# +# Suppose we want to create a new timeline 't' from a source timeline 's' starting +# from a lsn 'lsn'. Upon creating 't', if we don't hold the GC lock and compare 'lsn' with +# the latest GC information carefully, it's possible for GC to accidentally remove data +# needed by the new timeline. +# +# In this test, GC is requested before the branch creation but is delayed to happen after branch creation. +# As a result, when doing GC for the source timeline, we don't have any information about +# the upcoming new branches, so it's possible to remove data that may be needed by the new branches. +# It's the branch creation task's job to make sure the starting 'lsn' is not out of scope +# and prevent creating branches with invalid starting LSNs. +# +# For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447. 
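The ordering requirement in the last paragraph — validate the requested start LSN against the latest GC cutoff while GC for the source timeline is excluded — can be summarized in a short sketch. It is written in Python for readability even though the real check lives in the pageserver, and the lock, attribute, and method names (`GC_LOCK`, `latest_gc_cutoff_lsn`, `register_branch`) are hypothetical:

```python
# Illustrative sketch of the race-free branch creation the comment calls for:
# take the GC lock, re-check the requested start LSN against the latest GC
# cutoff, and only then register the new timeline.
import threading

GC_LOCK = threading.Lock()


def create_branch(source_timeline, new_branch_name: str, start_lsn: int):
    with GC_LOCK:  # GC for this timeline cannot run while we validate/register
        cutoff = source_timeline.latest_gc_cutoff_lsn
        if start_lsn < cutoff:
            raise ValueError(
                f"invalid branch start lsn: {start_lsn:#x} is earlier "
                f"than gc cutoff {cutoff:#x}"
            )
        # Registering inside the lock means the next GC iteration sees the new
        # branch and keeps the data it needs at start_lsn.
        return source_timeline.register_branch(new_branch_name, start_lsn)
```

Holding the lock across both the check and the registration is what closes the race: once the branch is registered, the next GC iteration sees it and retains the data it needs.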
+def test_branch_creation_before_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1024 ** 2}", + # set the target size to be large to allow the image layer to cover the whole key space + "compaction_target_size": f"{1024 ** 3}", + # tweak the default settings to allow quickly create image layers and L1 layers + "compaction_period": "1 s", + "compaction_threshold": "2", + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + ) + + b0 = env.neon_cli.create_branch("b0", tenant_id=tenant) + pg0 = env.postgres.create_start("b0", tenant_id=tenant) + res = pg0.safe_psql_many( + queries=[ + "CREATE TABLE t(key serial primary key)", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + "SELECT pg_current_wal_insert_lsn()", + "INSERT INTO t SELECT FROM generate_series(1, 100000)", + ] + ) + lsn = Lsn(res[2][0][0]) + + # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the + # branch creation task but the individual timeline GC iteration happens *after* + # the branch creation task. + pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) + + def do_gc(): + pageserver_http_client.timeline_gc(tenant, b0, 0) + + thread = threading.Thread(target=do_gc, daemon=True) + thread.start() + + # because of network latency and other factors, GC iteration might be processed + # after the `create_branch` request. Add a sleep here to make sure that GC is + # always processed before. + time.sleep(1.0) + + # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. + with pytest.raises(Exception, match="invalid branch start lsn: .*"): + env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) + + thread.join() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py new file mode 100644 index 0000000000..0e2a8b346b --- /dev/null +++ b/test_runner/regress/test_branch_behind.py @@ -0,0 +1,123 @@ +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import Lsn, TimelineId +from fixtures.utils import print_gc_result, query_scalar + + +# +# Create a couple of branches off the main branch, at a historical point in time. 
+# +def test_branch_behind(neon_env_builder: NeonEnvBuilder): + # Disable pitr, because here we want to test branch creation after GC + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + + # Branch at the point where only 100 rows were inserted + env.neon_cli.create_branch("test_branch_behind") + pgmain = env.postgres.create_start("test_branch_behind") + log.info("postgres is running on 'test_branch_behind' branch") + + main_cur = pgmain.connect().cursor() + + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + + # Create table, and insert the first 100 rows + main_cur.execute("CREATE TABLE foo (t text)") + + # keep some early lsn to test branch creation on out of date lsn + gced_lsn = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + lsn_a = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + log.info(f"LSN after 100 rows: {lsn_a}") + + # Insert some more rows. (This generates enough WAL to fill a few segments.) + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + """ + ) + lsn_b = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + log.info(f"LSN after 200100 rows: {lsn_b}") + + # Branch at the point where only 100 rows were inserted + env.neon_cli.create_branch( + "test_branch_behind_hundred", "test_branch_behind", ancestor_start_lsn=lsn_a + ) + + # Insert many more rows. This generates enough WAL to fill a few segments. + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + """ + ) + lsn_c = Lsn(query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()")) + + log.info(f"LSN after 400100 rows: {lsn_c}") + + # Branch at the point where only 200100 rows were inserted + env.neon_cli.create_branch( + "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b + ) + + pg_hundred = env.postgres.create_start("test_branch_behind_hundred") + pg_more = env.postgres.create_start("test_branch_behind_more") + + # On the 'hundred' branch, we should see only 100 rows + hundred_cur = pg_hundred.connect().cursor() + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 + + # On the 'more' branch, we should see 100200 rows + more_cur = pg_more.connect().cursor() + assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 + + # All the rows are visible on the main branch + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 + + # Check bad lsn's for branching + + # branch at segment boundary + env.neon_cli.create_branch( + "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000") + ) + pg = env.postgres.create_start("test_branch_segment_boundary") + assert pg.safe_psql("SELECT 1")[0][0] == 1 + + # branch at pre-initdb lsn + with pytest.raises(Exception, match="invalid branch start lsn: .*"): + env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) + + # branch at pre-ancestor lsn + with pytest.raises(Exception, match="less than timeline ancestor lsn"): + env.neon_cli.create_branch( + "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") + ) + + # check that we cannot create branch based on garbage collected data + with 
env.pageserver.http_client() as pageserver_http: + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) + + with pytest.raises(Exception, match="invalid branch start lsn: .*"): + # this gced_lsn is pretty random, so if gc is disabled this woudln't fail + env.neon_cli.create_branch( + "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn + ) + + # check that after gc everything is still there + assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100 + + assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100 + + assert query_scalar(main_cur, "SELECT count(*) FROM foo") == 400100 diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py new file mode 100644 index 0000000000..3b78700e9f --- /dev/null +++ b/test_runner/regress/test_branching.py @@ -0,0 +1,128 @@ +import random +import threading +import time +from typing import List + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +from fixtures.types import Lsn +from fixtures.utils import query_scalar +from performance.test_perf_pgbench import get_scales_matrix + + +# Test branch creation +# +# This test spawns pgbench in a thread in the background, and creates a branch while +# pgbench is running. Then it launches pgbench on the new branch, and creates another branch. +# Repeat `n_branches` times. +# +# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end +# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat', +# each branch is created from the root. +@pytest.mark.parametrize("n_branches", [10]) +@pytest.mark.parametrize("scale", get_scales_matrix(1)) +@pytest.mark.parametrize("ty", ["cascade", "flat"]) +def test_branching_with_pgbench( + neon_simple_env: NeonEnv, pg_bin: PgBin, n_branches: int, scale: int, ty: str +): + env = neon_simple_env + + # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "5 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_target_size": f"{1024 ** 2}", + # set PITR interval to be small, so we can do GC + "pitr_interval": "5 s", + } + ) + + def run_pgbench(pg: Postgres): + connstr = pg.connstr() + + log.info(f"Start a pgbench workload on pg {connstr}") + + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", "-T15", connstr]) + + env.neon_cli.create_branch("b0", tenant_id=tenant) + pgs: List[Postgres] = [] + pgs.append(env.postgres.create_start("b0", tenant_id=tenant)) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=(pgs[0],), daemon=True)) + threads[-1].start() + + thread_limit = 4 + + for i in range(n_branches): + # random a delay between [0, 5] + delay = random.random() * 5 + time.sleep(delay) + log.info(f"Sleep {delay}s") + + # If the number of concurrent threads exceeds a threshold, wait for + # all the threads to finish before spawning a new one. Because the + # regression tests in this directory are run concurrently in CI, we + # want to avoid the situation that one test exhausts resources for + # other tests. 
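+        # A commented sketch of an alternative throttle, assuming the same
+        # `thread_limit` bound: a bounded executor would cap concurrency
+        # implicitly, e.g.
+        #
+        #     with concurrent.futures.ThreadPoolExecutor(max_workers=thread_limit) as pool:
+        #         pool.submit(run_pgbench, pgs[-1])
+        #
+        # The explicit join below is kept instead so that a new batch of
+        # branches and pgbench runs starts only after the previous batch has
+        # fully finished.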
+ if len(threads) >= thread_limit: + for thread in threads: + thread.join() + threads = [] + + if ty == "cascade": + env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + else: + env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + + pgs.append(env.postgres.create_start("b{}".format(i + 1), tenant_id=tenant)) + + threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1],), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + for pg in pgs: + res = pg.safe_psql("SELECT count(*) from pgbench_accounts") + assert res[0] == (100000 * scale,) + + +# Test branching from an "unnormalized" LSN. +# +# Context: +# When doing basebackup for a newly created branch, pageserver generates +# 'pg_control' file to bootstrap WAL segment by specifying the redo position +# a "normalized" LSN based on the timeline's starting LSN: +# +# checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; +# +# This test checks if the pageserver is able to handle a "unnormalized" starting LSN. +# +# Related: see discussion in https://github.com/neondatabase/neon/pull/2143#issuecomment-1209092186 +def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBin): + XLOG_BLCKSZ = 8192 + + env = neon_simple_env + + env.neon_cli.create_branch("b0") + pg0 = env.postgres.create_start("b0") + + pg_bin.run_capture(["pgbench", "-i", pg0.connstr()]) + + with pg0.cursor() as cur: + curr_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # Specify the `start_lsn` as a number that is divided by `XLOG_BLCKSZ` + # and is smaller than `curr_lsn`. + start_lsn = Lsn((int(curr_lsn) - XLOG_BLCKSZ) // XLOG_BLCKSZ * XLOG_BLCKSZ) + + log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...") + env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn) + pg1 = env.postgres.create_start("b1") + + pg_bin.run_capture(["pgbench", "-i", pg1.connstr()]) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py new file mode 100644 index 0000000000..b747af4d09 --- /dev/null +++ b/test_runner/regress/test_broken_timeline.py @@ -0,0 +1,166 @@ +import concurrent.futures +import os +from typing import List, Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.types import TenantId, TimelineId + + +# Test restarting page server, while safekeeper and compute node keep +# running. +def test_broken_timeline(neon_env_builder: NeonEnvBuilder): + # One safekeeper is enough for this test. 
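+    # Four tenants are created with identical data; with the pageserver stopped,
+    # the first is left intact, the second gets its metadata file overwritten,
+    # the third has its layer files removed, and the fourth has its layer files
+    # overwritten with garbage, so each failure mode can be checked separately
+    # after the pageserver restarts.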
+ neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] + + for n in range(4): + tenant_id, timeline_id = env.neon_cli.create_tenant() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") + pg.stop() + tenant_timelines.append((tenant_id, timeline_id, pg)) + + # Stop the pageserver + env.pageserver.stop() + + # Leave the first timeline alone, but corrupt the others in different ways + (tenant0, timeline0, pg0) = tenant_timelines[0] + log.info(f"Timeline {tenant0}/{timeline0} is left intact") + + (tenant1, timeline1, pg1) = tenant_timelines[1] + metadata_path = f"{env.repo_dir}/tenants/{tenant1}/timelines/{timeline1}/metadata" + f = open(metadata_path, "w") + f.write("overwritten with garbage!") + f.close() + log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") + + (tenant2, timeline2, pg2) = tenant_timelines[2] + timeline_path = f"{env.repo_dir}/tenants/{tenant2}/timelines/{timeline2}/" + for filename in os.listdir(timeline_path): + if filename.startswith("00000"): + # Looks like a layer file. Remove it + os.remove(f"{timeline_path}/{filename}") + log.info( + f"Timeline {tenant2}/{timeline2} got its layer files removed (no remote storage enabled)" + ) + + (tenant3, timeline3, pg3) = tenant_timelines[3] + timeline_path = f"{env.repo_dir}/tenants/{tenant3}/timelines/{timeline3}/" + for filename in os.listdir(timeline_path): + if filename.startswith("00000"): + # Looks like a layer file. Corrupt it + f = open(f"{timeline_path}/{filename}", "w") + f.write("overwritten with garbage!") + f.close() + log.info(f"Timeline {tenant3}/{timeline3} got its layer files spoiled") + + env.pageserver.start() + + # Tenant 0 should still work + pg0.start() + assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 + + # But all others are broken + + # First timeline would not get loaded into pageserver due to corrupt metadata file + with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err: + pg1.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Second timeline has no ancestors, only the metadata file and no layer files + # We don't have the remote storage enabled, which means timeline is in an incorrect state, + # it's not loaded at all + with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err: + pg2.start() + log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}") + + # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline + for n in range(3, 4): + (bad_tenant, bad_timeline, pg) = tenant_timelines[n] + with pytest.raises(Exception, match="extracting base backup failed") as err: + pg.start() + log.info( + f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}" + ) + + +def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant_id, _ = env.neon_cli.create_tenant() + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [ + executor.submit( + env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id + ) + for 
i in range(4) + ] + for future in futures: + future.result() + + +def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant() + + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] + + # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. + pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) + with pytest.raises(Exception, match="before-checkpoint-new-timeline"): + _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) + + # Restart the page server + env.neon_cli.pageserver_stop(immediate=True) + env.neon_cli.pageserver_start() + + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" + + +def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + tenant_id, _ = env.neon_cli.create_tenant() + + timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" + old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + initial_timeline_dirs = [d for d in timelines_dir.iterdir()] + + # Introduce failpoint when creating a new timeline uninit mark, before any other files were created + pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) + with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): + _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id) + + # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
+ # "New" timeline is not present in the list, allowing pageserver to retry the same request + new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + assert ( + new_tenant_timelines == old_tenant_timelines + ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" + + timeline_dirs = [d for d in timelines_dir.iterdir()] + assert ( + timeline_dirs == initial_timeline_dirs + ), "pageserver should clean its temp timeline files on timeline creation failure" diff --git a/test_runner/regress/test_build_info_metric.py b/test_runner/regress/test_build_info_metric.py new file mode 100644 index 0000000000..b75b5bd775 --- /dev/null +++ b/test_runner/regress/test_build_info_metric.py @@ -0,0 +1,19 @@ +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import NeonEnvBuilder, NeonProxy + + +def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonProxy): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + parsed_metrics = {} + + parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics()) + parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str()) + parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics()) + + for component, metrics in parsed_metrics.items(): + sample = metrics.query_one("libmetrics_build_info") + + assert "revision" in sample.labels + assert len(sample.labels["revision"]) > 0 diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py new file mode 100644 index 0000000000..f47e4a99bf --- /dev/null +++ b/test_runner/regress/test_clog_truncate.py @@ -0,0 +1,70 @@ +import os +import time + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +# +# Test compute node start after clog truncation +# +def test_clog_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_clog_truncate", "empty") + + # set aggressive autovacuum to make sure that truncation will happen + config = [ + "autovacuum_max_workers=10", + "autovacuum_vacuum_threshold=0", + "autovacuum_vacuum_insert_threshold=0", + "autovacuum_vacuum_cost_delay=0", + "autovacuum_vacuum_cost_limit=10000", + "autovacuum_naptime =1s", + "autovacuum_freeze_max_age=100000", + ] + + pg = env.postgres.create_start("test_clog_truncate", config_lines=config) + log.info("postgres is running on test_clog_truncate branch") + + # Install extension containing function needed for test + pg.safe_psql("CREATE EXTENSION neon_test_utils") + + # Consume many xids to advance clog + with pg.cursor() as cur: + cur.execute("select test_consume_xids(1000*1000*10);") + log.info("xids consumed") + + # call a checkpoint to trigger TruncateSubtrans + cur.execute("CHECKPOINT;") + + # ensure WAL flush + cur.execute("select txid_current()") + log.info(cur.fetchone()) + + # wait for autovacuum to truncate the pg_xact + # XXX Is it worth to add a timeout here? + pg_xact_0000_path = os.path.join(pg.pg_xact_dir_path(), "0000") + log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") + + while os.path.isfile(pg_xact_0000_path): + log.info(f"file exists. 
wait for truncation: {pg_xact_0000_path=}") + time.sleep(5) + + # checkpoint to advance latest lsn + with pg.cursor() as cur: + cur.execute("CHECKPOINT;") + lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") + + # create new branch after clog truncation and start a compute node on it + log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") + env.neon_cli.create_branch( + "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation + ) + pg2 = env.postgres.create_start("test_clog_truncate_new") + log.info("postgres is running on test_clog_truncate_new branch") + + # check that new node doesn't contain truncated segment + pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), "0000") + log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}") + assert os.path.isfile(pg_xact_0000_path_new) is False diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py new file mode 100644 index 0000000000..22f245f79b --- /dev/null +++ b/test_runner/regress/test_close_fds.py @@ -0,0 +1,53 @@ +import os.path +import shutil +import subprocess +import threading +import time +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + + +def lsof_path() -> str: + path_output = shutil.which("lsof") + if path_output is None: + raise RuntimeError("lsof not found in PATH") + else: + return path_output + + +# Makes sure that `pageserver.pid` is only held by `pageserve` command, not other commands. +# This is to test the changes in https://github.com/neondatabase/neon/pull/1834. +def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): + env = neon_simple_env + + def start_workload(): + env.neon_cli.create_branch("test_lsof_pageserver_pid") + pg = env.postgres.create_start("test_lsof_pageserver_pid") + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x") + cur.execute("update foo set x=x+1") + + workload_thread = threading.Thread(target=start_workload, args=(), daemon=True) + workload_thread.start() + + path = os.path.join(env.repo_dir, "pageserver.pid") + lsof = lsof_path() + while workload_thread.is_alive(): + res = subprocess.run( + [lsof, path], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # parse the `lsof` command's output to get only the list of commands + commands = [line.split(" ")[0] for line in res.stdout.strip().split("\n")[1:]] + if len(commands) > 0: + log.info(f"lsof commands: {commands}") + assert commands == ["pageserve"] + + time.sleep(1.0) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py new file mode 100644 index 0000000000..306aa84040 --- /dev/null +++ b/test_runner/regress/test_compatibility.py @@ -0,0 +1,356 @@ +import os +import shutil +import subprocess +from pathlib import Path +from typing import Any + +import pytest +import toml # TODO: replace with tomllib for Python >= 3.11 +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonCli, + NeonEnvBuilder, + PageserverHttpClient, + PgBin, + PortDistributor, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn +from pytest import FixtureRequest + +# +# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. 
+# - `test_create_snapshot` a script wrapped in a test that creates a data snapshot. +# - `test_backward_compatibility` checks that the current version of Neon can start/read/interract with a data snapshot created by the previous version. +# The path to the snapshot is configured by COMPATIBILITY_SNAPSHOT_DIR environment variable. +# If the breakage is intentional, the test can be xfaild with setting ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE=true. +# - `test_forward_compatibility` checks that a snapshot created by the current version can be started/read/interracted by the previous version of Neon. +# Paths to Neon and Postgres are configured by COMPATIBILITY_NEON_BIN and COMPATIBILITY_POSTGRES_DISTRIB_DIR environment variables. +# If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. +# +# The file contains a couple of helper functions: +# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). +# - check_neon_works performs the test itself, feel free to add more checks there. +# + + +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(before="test_forward_compatibility") +def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
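+    # The `compatibility_snapshot_pg14` directory produced below (repo plus
+    # dump.sql) is what COMPATIBILITY_SNAPSHOT_DIR is expected to point at when
+    # test_backward_compatibility is run by a later version of Neon.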
+ neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it + + +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(after="test_create_snapshot") +def test_backward_compatibility( + pg_bin: PgBin, + port_distributor: PortDistributor, + test_output_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, + request: FixtureRequest, +): + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + ) + + breaking_changes_allowed = ( + os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" + ) + try: + check_neon_works( + test_output_dir / "compatibility_snapshot" / "repo", + neon_binpath, + pg_distrib_dir, + pg_version, + port_distributor, + test_output_dir, + pg_bin, + request, + ) + except Exception: + if breaking_changes_allowed: + pytest.xfail( + "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE env var" + ) + else: + raise + + assert ( + not breaking_changes_allowed + ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + + +@pytest.mark.xdist_group("compatibility") +@pytest.mark.order(after="test_create_snapshot") +def test_forward_compatibility( + test_output_dir: Path, + port_distributor: PortDistributor, + pg_version: str, + request: FixtureRequest, +): + compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN") + assert compatibility_neon_bin_env is not None, ( + "COMPATIBILITY_NEON_BIN is not set. 
It should be set to a path with Neon binaries " + "(ideally generated by the previous version of Neon)" + ) + compatibility_neon_bin = Path(compatibility_neon_bin_env).resolve() + + compatibility_postgres_distrib_dir_env = os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR") + assert ( + compatibility_postgres_distrib_dir_env is not None + ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" + compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() + + compatibility_snapshot_dir = ( + test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" + ) + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + ) + + breaking_changes_allowed = ( + os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" + ) + try: + check_neon_works( + test_output_dir / "compatibility_snapshot" / "repo", + compatibility_neon_bin, + compatibility_postgres_distrib_dir, + pg_version, + port_distributor, + test_output_dir, + PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version), + request, + ) + except Exception: + if breaking_changes_allowed: + pytest.xfail( + "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE env var" + ) + else: + raise + + assert ( + not breaking_changes_allowed + ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + + +def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistributor): + assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" + assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory" + assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql" + + log.info(f"Copying snapshot from {from_dir} to {to_dir}") + shutil.copytree(from_dir, to_dir) + + repo_dir = to_dir / "repo" + + # Remove old logs to avoid confusion in test artifacts + for logfile in repo_dir.glob("**/*.log"): + logfile.unlink() + + # Remove tenants data for compute + for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): + shutil.rmtree(tenant) + + # Remove wal-redo temp directory + for tenant in (repo_dir / "tenants").glob("*"): + shutil.rmtree(tenant / "wal-redo-datadir.___temp") + + # Update paths and ports in config files + pageserver_toml = repo_dir / "pageserver.toml" + pageserver_config = toml.load(pageserver_toml) + pageserver_config["remote_storage"]["local_path"] = repo_dir / "local_fs_remote_storage" + pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port( + pageserver_config["listen_http_addr"] + ) + pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( + pageserver_config["listen_pg_addr"] + ) + pageserver_config["broker_endpoints"] = [ + port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"] + ] + + with pageserver_toml.open("w") as f: + toml.dump(pageserver_config, f) + + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["etcd_broker"]["broker_endpoints"] = [ + port_distributor.replace_with_new_port(ep) + for ep in snapshot_config["etcd_broker"]["broker_endpoints"] + ] + 
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( + snapshot_config["pageserver"]["listen_http_addr"] + ) + snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port( + snapshot_config["pageserver"]["listen_pg_addr"] + ) + for sk in snapshot_config["safekeepers"]: + sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"]) + sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"]) + + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # Ensure that snapshot doesn't contain references to the original path + rv = subprocess.run( + [ + "grep", + "--recursive", + "--binary-file=without-match", + "--files-with-matches", + "test_create_snapshot/repo", + str(repo_dir), + ], + capture_output=True, + text=True, + ) + assert ( + rv.returncode != 0 + ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + + +def check_neon_works( + repo_dir: Path, + neon_binpath: Path, + pg_distrib_dir: Path, + pg_version: str, + port_distributor: PortDistributor, + test_output_dir: Path, + pg_bin: PgBin, + request: FixtureRequest, +): + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["neon_distrib_dir"] = str(neon_binpath) + snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # TODO: replace with NeonEnvBuilder / NeonEnv + config: Any = type("NeonEnvStub", (object,), {}) + config.rust_log_override = None + config.repo_dir = repo_dir + config.pg_version = pg_version + config.initial_tenant = snapshot_config["default_tenant_id"] + config.neon_binpath = neon_binpath + config.pg_distrib_dir = pg_distrib_dir + + cli = NeonCli(config) + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) + + pg_port = port_distributor.get_port() + cli.pg_start("main", port=pg_port) + request.addfinalizer(lambda: cli.pg_stop("main")) + + connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" + pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + initial_dump_differs = dump_differs( + repo_dir.parent / "dump.sql", + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + + # Check that project can be recovered from WAL + # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + auth_token = snapshot_config["pageserver"]["auth_token"] + pageserver_http = PageserverHttpClient( + port=pageserver_port, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + auth_token=auth_token, + ) + + shutil.rmtree(repo_dir / "local_fs_remote_storage") + pageserver_http.timeline_delete(tenant_id, timeline_id) + pageserver_http.timeline_create(tenant_id, timeline_id) + pg_bin.run( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] + ) + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + dump_from_wal_differs = dump_differs( + test_output_dir / "dump.sql", + test_output_dir / "dump-from-wal.sql", + test_output_dir / 
"dump-from-wal.filediff", + ) + + # Check that we can interract with the data + pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + + assert not dump_from_wal_differs, "dump from WAL differs" + assert not initial_dump_differs, "initial dump differs" + + +def dump_differs(first: Path, second: Path, output: Path) -> bool: + """ + Runs diff(1) command on two SQL dumps and write the output to the given output file. + Returns True if the dumps differ, False otherwise. + """ + + with output.open("w") as stdout: + rv = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + "--ignore-matching-lines=^--", # Ignore changes in comments + "--ignore-blank-lines", + str(first), + str(second), + ], + stdout=stdout, + ) + + return rv.returncode != 0 diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py new file mode 100644 index 0000000000..01b64b8b17 --- /dev/null +++ b/test_runner/regress/test_compute_ctl.py @@ -0,0 +1,203 @@ +import os +from subprocess import TimeoutExpired + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin + + +# Test that compute_ctl works and prints "--sync-safekeepers" logs. +def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + ctl = ComputeCtl(env) + + env.neon_cli.create_branch("test_compute_ctl", "main") + pg = env.postgres.create_start("test_compute_ctl") + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + + with open(pg.config_file_path(), "r") as f: + cfg_lines = f.readlines() + cfg_map = {} + for line in cfg_lines: + if "=" in line: + k, v = line.split("=") + cfg_map[k] = v.strip("\n '\"") + log.info(f"postgres config: {cfg_map}") + pgdata = pg.pg_data_dir_path() + pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") + + pg.stop_and_destroy() + + spec = ( + """ +{ + "format_version": 1.0, + + "timestamp": "2021-05-23T18:25:43.511Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + + "cluster": { + "cluster_id": "test-cluster-42", + "name": "Neon Test", + "state": "restarted", + "roles": [ + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "neon.safekeepers", + "value": """ + + f'"{cfg_map["neon.safekeepers"]}"' + + """, + "vartype": "string" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "shared_buffers", + "value": "32768", + "vartype": "integer" + }, + { + "name": "port", + "value": """ + + f'"{cfg_map["port"]}"' + + """, + "vartype": "integer" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "wal_sender_timeout", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "maintenance_work_mem", + "value": "65536", + "vartype": "integer" + }, + { + "name": "max_parallel_workers", + "value": "8", + "vartype": "integer" + }, + { + "name": "max_worker_processes", + "value": "8", + 
"vartype": "integer" + }, + { + "name": "neon.tenant_id", + "value": """ + + f'"{cfg_map["neon.tenant_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "neon.timeline_id", + "value": """ + + f'"{cfg_map["neon.timeline_id"]}"' + + """, + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": """ + + f'"{cfg_map["neon.pageserver_connstring"]}"' + + """, + "vartype": "string" + } + ] + }, + "delta_operations": [ + ] +} +""" + ) + + ps_connstr = cfg_map["neon.pageserver_connstring"] + log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") + + # run compute_ctl and wait for 10s + try: + ctl.raw_cli( + ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path], + timeout=10, + ) + except TimeoutExpired as exc: + ctl_logs = exc.stderr.decode("utf-8") + log.info("compute_ctl output:\n" + ctl_logs) + + start = "starting safekeepers syncing" + end = "safekeepers synced at LSN" + start_pos = ctl_logs.index(start) + assert start_pos != -1 + end_pos = ctl_logs.index(end, start_pos) + assert end_pos != -1 + sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] + log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) + + # assert that --sync-safekeepers logs are present in the output + assert "connecting with node" in sync_safekeepers_logs + assert "connected with node" in sync_safekeepers_logs + assert "proposer connected to quorum (2)" in sync_safekeepers_logs + assert "got votes from majority (2)" in sync_safekeepers_logs + assert "sending elected msg to node" in sync_safekeepers_logs diff --git a/test_runner/batch_others/test_config.py b/test_runner/regress/test_config.py similarity index 52% rename from test_runner/batch_others/test_config.py rename to test_runner/regress/test_config.py index fd2b3b4e99..3477d96b89 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,30 +1,32 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv # # Test starting Postgres with custom options # -def test_config(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_config", "empty") +def test_config(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") # change config - pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) - log.info('postgres is running on test_config branch') + pg = env.postgres.create_start("test_config", config_lines=["log_min_messages=debug1"]) + log.info("postgres is running on test_config branch") with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute(''' + cur.execute( + """ SELECT setting FROM pg_settings WHERE source != 'default' AND source != 'override' AND name = 'log_min_messages' - ''') + """ + ) # check that config change was applied - assert cur.fetchone() == ('debug1', ) + assert cur.fetchone() == ("debug1",) diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py new file mode 100644 index 0000000000..e94c9a2bd0 --- /dev/null +++ b/test_runner/regress/test_crafted_wal_end.py @@ -0,0 +1,70 @@ +import 
pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft + +# Restart nodes with WAL end having specially crafted shape, like last record +# crossing segment boundary, to test decoding issues. + + +@pytest.mark.parametrize( + "wal_type", + [ + "simple", + "last_wal_record_xlog_switch", + "last_wal_record_xlog_switch_ends_on_page_boundary", + "last_wal_record_crossing_segment", + "wal_record_crossing_segment_followed_by_small_one", + ], +) +def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_crafted_wal_end") + + pg = env.postgres.create("test_crafted_wal_end") + wal_craft = WalCraft(env) + pg.config(wal_craft.postgres_config()) + pg.start() + res = pg.safe_psql_many( + queries=[ + "CREATE TABLE keys(key int primary key)", + "INSERT INTO keys SELECT generate_series(1, 100)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[-1][0] == (5050,) + + wal_craft.in_existing(wal_type, pg.connstr()) + + log.info("Restarting all safekeepers and pageservers") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries") + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT INTO keys SELECT generate_series(101, 200)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (5050,) + assert res[-1][0] == (20100,) + + log.info("Restarting all safekeepers and pageservers (again)") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries (again)") + res = pg.safe_psql_many( + queries=[ + "SELECT SUM(key) FROM keys", + "INSERT INTO keys SELECT generate_series(201, 300)", + "SELECT SUM(key) FROM keys", + ] + ) + assert res[0][0] == (20100,) + assert res[-1][0] == (45150,) diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py new file mode 100644 index 0000000000..036e50e6e8 --- /dev/null +++ b/test_runner/regress/test_createdropdb.py @@ -0,0 +1,104 @@ +import os +import pathlib + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from fixtures.utils import query_scalar + + +# +# Test CREATE DATABASE when there have been relmapper changes +# +def test_createdb(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_createdb", "empty") + + pg = env.postgres.create_start("test_createdb") + log.info("postgres is running on 'test_createdb' branch") + + with pg.cursor() as cur: + # Cause a 'relmapper' change in the original branch + cur.execute("VACUUM FULL pg_class") + + cur.execute("CREATE DATABASE foodb") + + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + + # Create a branch + env.neon_cli.create_branch("test_createdb2", "test_createdb", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createdb2") + + # Test that you can connect to the new database on both branches + for db in (pg, pg2): + with db.cursor(dbname="foodb") as cur: + # Check database size in both branches + cur.execute( + """ + select pg_size_pretty(pg_database_size('foodb')), + pg_size_pretty( + sum(pg_relation_size(oid, 'main')) + +sum(pg_relation_size(oid, 'vm')) + +sum(pg_relation_size(oid, 'fsm')) + ) FROM pg_class where relisshared is false + """ + ) + res = cur.fetchone() + assert res is not None + # check that dbsize equals sum of all 
relation sizes, excluding shared ones + # This is how we define dbsize in neon for now + assert res[0] == res[1] + + +# +# Test DROP DATABASE +# +def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch("test_dropdb", "empty") + pg = env.postgres.create_start("test_dropdb") + log.info("postgres is running on 'test_dropdb' branch") + + with pg.cursor() as cur: + cur.execute("CREATE DATABASE foodb") + + lsn_before_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + + dboid = query_scalar(cur, "SELECT oid FROM pg_database WHERE datname='foodb';") + + with pg.cursor() as cur: + cur.execute("DROP DATABASE foodb") + + cur.execute("CHECKPOINT") + + lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + + # Create two branches before and after database drop. + env.neon_cli.create_branch( + "test_before_dropdb", "test_dropdb", ancestor_start_lsn=lsn_before_drop + ) + pg_before = env.postgres.create_start("test_before_dropdb") + + env.neon_cli.create_branch( + "test_after_dropdb", "test_dropdb", ancestor_start_lsn=lsn_after_drop + ) + pg_after = env.postgres.create_start("test_after_dropdb") + + # Test that database exists on the branch before drop + pg_before.connect(dbname="foodb").close() + + # Test that database subdir exists on the branch before drop + assert pg_before.pgdata_dir + dbpath = pathlib.Path(pg_before.pgdata_dir) / "base" / str(dboid) + log.info(dbpath) + + assert os.path.isdir(dbpath) is True + + # Test that database subdir doesn't exist on the branch after drop + assert pg_after.pgdata_dir + dbpath = pathlib.Path(pg_after.pgdata_dir) / "base" / str(dboid) + log.info(dbpath) + + assert os.path.isdir(dbpath) is False + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py new file mode 100644 index 0000000000..c5f8246f5b --- /dev/null +++ b/test_runner/regress/test_createuser.py @@ -0,0 +1,28 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +# +# Test CREATE USER to check shared catalog restore +# +def test_createuser(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_createuser", "empty") + pg = env.postgres.create_start("test_createuser") + log.info("postgres is running on 'test_createuser' branch") + + with pg.cursor() as cur: + # Cause a 'relmapper' change in the original branch + cur.execute("CREATE USER testuser with password %s", ("testpwd",)) + + cur.execute("CHECKPOINT") + + lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") + + # Create a branch + env.neon_cli.create_branch("test_createuser2", "test_createuser", ancestor_start_lsn=lsn) + pg2 = env.postgres.create_start("test_createuser2") + + # Test that you can connect to new branch as a new user + assert pg2.safe_psql("select current_user", user="testuser") == [("testuser",)] diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py new file mode 100644 index 0000000000..4551ff97e0 --- /dev/null +++ b/test_runner/regress/test_fsm_truncate.py @@ -0,0 +1,10 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_fsm_truncate") + pg = env.postgres.create_start("test_fsm_truncate") + 
pg.safe_psql( + "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;" + ) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py new file mode 100644 index 0000000000..fc515e5878 --- /dev/null +++ b/test_runner/regress/test_fullbackup.py @@ -0,0 +1,70 @@ +import os +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.types import Lsn, TimelineId +from fixtures.utils import query_scalar, subprocess_capture + +num_rows = 1000 + + +# Ensure that regular postgres can start from fullbackup +def test_fullbackup( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, + pg_distrib_dir: Path, +): + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_fullbackup") + pgmain = env.postgres.create_start("test_fullbackup") + log.info("postgres is running on 'test_fullbackup' branch") + + with pgmain.cursor() as cur: + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) + cur.execute("CHECKPOINT") + + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} + + # Get and unpack fullbackup from pageserver + restored_dir_path = env.repo_dir / "restored_datadir" + os.mkdir(restored_dir_path, 0o750) + query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + subprocess_capture(env.repo_dir, ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + + # HACK + # fullbackup returns neon specific pg_control and first WAL segment + # use resetwal to overwrite it + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] + pg_bin.run_capture(cmd, env=psql_env) + + # Restore from the backup and find the data we inserted + port = port_distributor.get_port() + with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: + # TODO make port an optional argument + vanilla_pg.configure( + [ + f"port={port}", + ] + ) + vanilla_pg.start() + num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] + assert num_rows == num_rows_found diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py new file mode 100644 index 0000000000..332bef225f --- /dev/null +++ b/test_runner/regress/test_gc_aggressive.py @@ -0,0 +1,89 @@ +import asyncio +import concurrent.futures +import random + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres +from fixtures.types import TimelineId +from fixtures.utils import query_scalar + +# Test configuration +# +# Create a table with {num_rows} rows, and perform {updates_to_perform} random +# UPDATEs on it, using 
{num_connections} separate connections. +num_connections = 10 +num_rows = 100000 +updates_to_perform = 10000 + +updates_performed = 0 + + +# Run random UPDATEs on test table +async def update_table(pg: Postgres): + global updates_performed + pg_conn = await pg.connect_async() + + while updates_performed < updates_to_perform: + updates_performed += 1 + id = random.randrange(1, num_rows) + await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}") + + +# Perform aggressive GC with 0 horizon +async def gc(env: NeonEnv, timeline: TimelineId): + pageserver_http = env.pageserver.http_client() + + loop = asyncio.get_running_loop() + + with concurrent.futures.ThreadPoolExecutor() as pool: + while updates_performed < updates_to_perform: + await loop.run_in_executor( + pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + ) + + +# At the same time, run UPDATEs and GC +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: TimelineId): + workers = [] + for worker_id in range(num_connections): + workers.append(asyncio.create_task(update_table(pg))) + workers.append(asyncio.create_task(gc(env, timeline))) + + # await all workers + await asyncio.gather(*workers) + + +# +# Aggressively force GC, while running queries. +# +# (repro for https://github.com/neondatabase/neon/issues/1047) +# +def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): + + # Disable pitr, because here we want to test branch creation after GC + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_aggressive", "main") + pg = env.postgres.create_start("test_gc_aggressive") + log.info("postgres is running on test_gc_aggressive branch") + + with pg.cursor() as cur: + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + # Create table, and insert the first 100 rows + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + f""" + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, {num_rows}) g + """ + ) + cur.execute("CREATE INDEX ON foo(id)") + + asyncio.run(update_and_gc(env, pg, timeline)) + + cur.execute("SELECT COUNT(*), SUM(counter) FROM foo") + r = cur.fetchone() + assert r is not None + assert r == (num_rows, updates_to_perform) diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py new file mode 100644 index 0000000000..22b77d2cf1 --- /dev/null +++ b/test_runner/regress/test_gc_cutoff.py @@ -0,0 +1,39 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin + + +# Test gc_cutoff +# +# This test sets fail point at the end of GC, and checks that pageserver +# normally restarts after it. Also, there should be GC ERRORs in the log, +# but the fixture checks the log for any unexpected ERRORs after every +# test anyway, so it doesn't need any special attention here. 
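+# The "exit" failpoint action used below is assumed to terminate the pageserver
+# process once GC reaches the failpoint, which is why the pgbench loop restarts
+# the pageserver and re-arms the failpoint after each failed iteration.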
+def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test + tenant_id, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "10 s", + "gc_horizon": f"{1024 ** 2}", + "checkpoint_distance": f"{1024 ** 2}", + "compaction_period": "5 s", + # set PITR interval to be small, so we can do GC + "pitr_interval": "1 s", + "compaction_threshold": "3", + "image_creation_threshold": "2", + } + ) + pg = env.postgres.create_start("main", tenant_id=tenant_id) + connstr = pg.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) + + for i in range(5): + try: + pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr]) + except Exception: + env.pageserver.stop() + env.pageserver.start() + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py new file mode 100644 index 0000000000..ced5e18406 --- /dev/null +++ b/test_runner/regress/test_import.py @@ -0,0 +1,280 @@ +import json +import os +import re +import shutil +import tarfile +from contextlib import closing +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + Postgres, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import subprocess_capture + + +@pytest.mark.timeout(600) +def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): + # Put data in vanilla pg + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql( + """create table t as select 'long string to consume some space' || g + from generate_series(1,300000) g""" + ) + assert vanilla_pg.safe_psql("select count(*) from t") == [(300000,)] + + # Take basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + vanilla_pg.safe_psql("CHECKPOINT") + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ] + ) + + # Make corrupt base tar with missing pg_control + unpacked_base = os.path.join(basebackup_dir, "unpacked-base") + corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") + os.mkdir(unpacked_base, 0o750) + subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + os.remove(os.path.join(unpacked_base, "global/pg_control")) + subprocess_capture( + str(test_output_dir), + ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), + cwd=unpacked_base, + ) + + # Get start_lsn and end_lsn + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + node_name = "import_from_vanilla" + tenant = TenantId.generate() + timeline = TimelineId.generate() + + # Set up pageserver for import + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + 
env.pageserver.http_client().tenant_create(tenant) + + def import_tar(base, wal): + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + str(tenant), + "--timeline-id", + str(timeline), + "--node-name", + node_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + "--pg-version", + env.pg_version, + ] + ) + + # Importing corrupt backup fails + with pytest.raises(Exception): + import_tar(corrupt_base_tar, wal_tar) + + # Importing correct backup works + import_tar(base_tar, wal_tar) + + # Wait for data to land in s3 + client = env.pageserver.http_client() + wait_for_last_record_lsn(client, tenant, timeline, Lsn(end_lsn)) + wait_for_upload(client, tenant, timeline, Lsn(end_lsn)) + + # Check it worked + pg = env.postgres.create_start(node_name, tenant_id=tenant) + assert pg.safe_psql("select count(*) from t") == [(300000,)] + + +@pytest.mark.timeout(600) +def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") + pg = env.postgres.create_start("test_import_from_pageserver_small") + + num_rows = 3000 + lsn = _generate_data(num_rows, pg) + _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + + +@pytest.mark.timeout(1800) +# TODO: temporarily disable `test_import_from_pageserver_multisegment` test, enable +# the test back after finding the failure cause. +# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build") +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255") +def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") + pg = env.postgres.create_start("test_import_from_pageserver_multisegment") + + # For `test_import_from_pageserver_multisegment`, we want to make sure that the data + # is large enough to create multi-segment files. Typically, a segment file's size is + # at most 1GB. A large number of inserted rows (`30000000`) is used to increase the + # DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097. + num_rows = 30000000 + lsn = _generate_data(num_rows, pg) + + logical_size = env.pageserver.http_client().timeline_detail(env.initial_tenant, timeline)[ + "current_logical_size" + ] + log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") + assert logical_size > 1024**3 # = 1GB + + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir) + + # Check if the backup data contains multiple segment files + cnt_seg_files = 0 + segfile_re = re.compile("[0-9]+\\.[0-9]+") + with tarfile.open(tar_output_file, "r") as tar_f: + for f in tar_f.getnames(): + if segfile_re.search(f) is not None: + cnt_seg_files += 1 + log.info(f"Found a segment file: {f} in the backup archive file") + assert cnt_seg_files > 0 + + +def _generate_data(num_rows: int, pg: Postgres) -> Lsn: + """Generate a table with `num_rows` rows. 
+ + Returns: + the latest insert WAL's LSN""" + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute( + f"""CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) + cur.execute("CHECKPOINT") + + cur.execute("SELECT pg_current_wal_insert_lsn()") + res = cur.fetchone() + assert res is not None and isinstance(res[0], str) + return Lsn(res[0]) + + +def _import( + expected_num_rows: int, + lsn: Lsn, + env: NeonEnv, + pg_bin: PgBin, + timeline: TimelineId, + pg_distrib_dir: Path, +) -> str: + """Test importing backup data to the pageserver. + + Args: + expected_num_rows: the expected number of rows of the test table in the backup data + lsn: the backup's base LSN + + Returns: + path to the backup archive file""" + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. + psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} + + # Get a fullbackup from pageserver + query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + + # Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + env.pageserver.stop() + + dir_to_clear = Path(env.repo_dir) / "tenants" + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + # start the pageserver again + env.pageserver.start() + + # Import using another tenant_id, because we use the same pageserver. + # TODO Create another pageserver to make test more realistic. 
+    tenant = TenantId.generate()
+
+    # Import to pageserver
+    node_name = "import_from_pageserver"
+    client = env.pageserver.http_client()
+    client.tenant_create(tenant)
+    env.neon_cli.raw_cli(
+        [
+            "timeline",
+            "import",
+            "--tenant-id",
+            str(tenant),
+            "--timeline-id",
+            str(timeline),
+            "--node-name",
+            node_name,
+            "--base-lsn",
+            str(lsn),
+            "--base-tarfile",
+            os.path.join(tar_output_file),
+            "--pg-version",
+            env.pg_version,
+        ]
+    )
+
+    # Wait for data to land in s3
+    wait_for_last_record_lsn(client, tenant, timeline, lsn)
+    wait_for_upload(client, tenant, timeline, lsn)
+
+    # Check it worked
+    pg = env.postgres.create_start(node_name, tenant_id=tenant)
+    assert pg.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
+
+    # Take another fullbackup
+    query = f"fullbackup { tenant} {timeline} {lsn}"
+    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
+    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
+    new_tar_output_file = result_basepath + ".stdout"
+
+    # Check it's the same as the first fullbackup
+    # TODO pageserver should be checking checksum
+    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+
+    # Check that gc works
+    pageserver_http = env.pageserver.http_client()
+    pageserver_http.timeline_gc(tenant, timeline, 0)
+
+    return tar_output_file
diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py
new file mode 100644
index 0000000000..f14265f6fd
--- /dev/null
+++ b/test_runner/regress/test_large_schema.py
@@ -0,0 +1,83 @@
+import os
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+# This test creates a large number of tables, which results in a large catalog.
+# Right now Neon serializes a directory as a single key-value storage entry, so
+# this leads to layers filled mostly by one key.
+# The original Neon implementation of checkpoint and compaction is not able to split such a key,
+# which leads to large (several gigabytes) layer files (both ephemeral and delta layers).
+# It may cause problems with uploading to S3 and also degrade performance because of ephemeral file swapping.
+#
+def test_large_schema(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    pg = env.postgres.create_start("main")
+
+    conn = pg.connect()
+    cur = conn.cursor()
+
+    tables = 2  # 10 is too much for debug build
+    partitions = 1000
+    for i in range(1, tables + 1):
+        print(f"iteration {i} / {tables}")
+
+        # Restart compute. The restart is actually not strictly needed.
+        # It is done mostly because this test originally tries to model the problem reported by Ketteq.
+        pg.stop()
+        # Kill and restart the pageserver.
+        # env.pageserver.stop(immediate=True)
+        # env.pageserver.start()
+        pg.start()
+
+        retry_sleep = 0.5
+        max_retries = 200
+        retries = 0
+        while True:
+            try:
+                conn = pg.connect()
+                cur = conn.cursor()
+                cur.execute(f"CREATE TABLE if not exists t_{i}(pk integer) partition by range (pk)")
+                for j in range(1, partitions + 1):
+                    cur.execute(
+                        f"create table if not exists p_{i}_{j} partition of t_{i} for values from ({j}) to ({j + 1})"
+                    )
+                cur.execute(f"insert into t_{i} values (generate_series(1,{partitions}))")
+                cur.execute("vacuum full")
+                conn.close()
+
+            except Exception as error:
+                # It's normal that it takes some time for the pageserver to
+                # restart, and for the connection to fail until it does. It
+                # should eventually recover, so retry until it succeeds.
+                print(f"failed: {error}")
+                if retries < max_retries:
+                    retries += 1
+                    print(f"retry {retries} / {max_retries}")
+                    time.sleep(retry_sleep)
+                    continue
+                else:
+                    raise
+            break
+
+    conn = pg.connect()
+    cur = conn.cursor()
+
+    for i in range(1, tables + 1):
+        cur.execute(f"SELECT count(*) FROM t_{i}")
+        assert cur.fetchone() == (partitions,)
+
+    cur.execute("set enable_sort=off")
+    cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")
+
+    # Check layer file sizes
+    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = pg.safe_psql("show neon.timeline_id")[0][0]
+    timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id)
+    for filename in os.listdir(timeline_path):
+        if filename.startswith("00000"):
+            log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
+            assert os.path.getsize(timeline_path + filename) < 512_000_000
diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py
new file mode 100644
index 0000000000..e8ba0e7d91
--- /dev/null
+++ b/test_runner/regress/test_layer_writers_fail.py
@@ -0,0 +1,92 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnv, NeonPageserver
+
+
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703")
+def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1024 ** 2}",
+            # set the target size to be large to allow the image layer to cover the whole key space
+            "compaction_target_size": f"{1024 ** 3}",
+            # tweak the default settings to allow quick creation of image layers and L1 layers
+            "compaction_period": "1 s",
+            "compaction_threshold": "2",
+            "image_creation_threshold": "1",
+        }
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    pg.safe_psql_many(
+        [
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
+            """INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g""",
+        ]
+    )
+
+    pageserver_http.configure_failpoints(("image-layer-writer-fail-before-finish", "return"))
+    with pytest.raises(Exception, match="image-layer-writer-fail-before-finish"):
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    new_temp_layer_files = list(
+        filter(
+            lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
+            [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
+        )
+    )
+
+    assert (
+        len(new_temp_layer_files) == 0
+    ), "pageserver should clean its temporary new image layer files on failure"
+
+
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2703")
+def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    pageserver_http = env.pageserver.http_client()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            # small checkpoint distance to create more delta layer files
+            "checkpoint_distance": f"{1024 ** 2}",
+            # set the target size to be large to allow the image layer to cover the whole key space
+            "compaction_target_size": f"{1024 ** 3}",
+            # tweak the default settings to allow quick creation of image layers and L1 layers
+            "compaction_period": "1 s",
+            "compaction_threshold": "2",
+            "image_creation_threshold": "1",
+        }
+    )
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    pg.safe_psql_many(
+        [
+            "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)",
+            """INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g""",
+        ]
+    )
+
+    pageserver_http.configure_failpoints(("delta-layer-writer-fail-before-finish", "return"))
+    # Note: we cannot test whether the exception is exactly 'delta-layer-writer-fail-before-finish',
+    # since our code does this in a loop and we may not get that exact error for our request.
+    with pytest.raises(Exception):
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    new_temp_layer_files = list(
+        filter(
+            lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
+            [path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
+        )
+    )
+
+    assert (
+        len(new_temp_layer_files) == 0
+    ), "pageserver should clean its temporary new delta layer files on failure"
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
new file mode 100644
index 0000000000..c5a49a6704
--- /dev/null
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -0,0 +1,69 @@
+from datetime import timedelta
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
+from fixtures.utils import query_scalar
+
+
+#
+# Test pageserver get_lsn_by_timestamp API
+#
+def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
+    pgmain = env.postgres.create_start("test_lsn_mapping")
+    log.info("postgres is running on 'test_lsn_mapping' branch")
+
+    cur = pgmain.connect().cursor()
+    # Create table, and insert rows, each in a separate transaction
+    # Disable synchronous_commit to make this initialization go faster.
+    #
+    # Each row contains the current insert LSN and the current timestamp when
+    # the row was inserted.
+    cur.execute("SET synchronous_commit=off")
+    cur.execute("CREATE TABLE foo (x integer)")
+    tbl = []
+    for i in range(1000):
+        cur.execute(f"INSERT INTO foo VALUES({i})")
+        # Get the timestamp at UTC
+        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
+        tbl.append([i, after_timestamp])
+
+    # Execute one more transaction with synchronous_commit enabled, to flush
+    # all the previous transactions
+    cur.execute("INSERT INTO foo VALUES (-1)")
+
+    # Wait until WAL is received by pageserver
+    wait_for_last_flush_lsn(env, pgmain, env.initial_tenant, new_timeline_id)
+
+    with env.pageserver.http_client() as client:
+        # Check edge cases: timestamp in the future
+        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+        )
+        assert result == "future"
+
+        # timestamp too far in the past
+        probe_timestamp = tbl[0][1] - timedelta(hours=10)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+        )
+        assert result == "past"
+
+        # Probe a bunch of timestamps in the valid range
+        for i in range(1, len(tbl), 100):
+            probe_timestamp = tbl[i][1]
+            lsn = client.timeline_get_lsn_by_timestamp(
+                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            )
+            # Call get_lsn_by_timestamp to get the LSN
+            # Launch a new read-only node at that LSN, and check that only the rows
+            # that were supposed to be committed at that point in time are visible.
+ pg_here = env.postgres.create_start( + branch_name="test_lsn_mapping", node_name="test_lsn_mapping_read", lsn=lsn + ) + assert pg_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i + + pg_here.stop_and_destroy() diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/regress/test_multixact.py similarity index 63% rename from test_runner/batch_others/test_multixact.py rename to test_runner/regress/test_multixact.py index 6a2afd2ede..635beb16b7 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,5 +1,6 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from fixtures.utils import query_scalar # @@ -8,22 +9,23 @@ from fixtures.log_helper import log # it only checks next_multixact_id field in restored pg_control, # since we don't have functions to check multixact internals. # -def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch("test_multixact", "empty") - pg = env.postgres.create_start('test_multixact') +def test_multixact(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch("test_multixact", "empty") + pg = env.postgres.create_start("test_multixact") log.info("postgres is running on 'test_multixact' branch") - pg_conn = pg.connect() - cur = pg_conn.cursor() - - cur.execute(''' + cur = pg.connect().cursor() + cur.execute( + """ CREATE TABLE t1(i int primary key); INSERT INTO t1 select * from generate_series(1, 100); - ''') + """ + ) - cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint()') - next_multixact_id_old = cur.fetchone()[0] + next_multixact_id_old = query_scalar( + cur, "SELECT next_multixact_id FROM pg_control_checkpoint()" + ) # Lock entries using parallel connections in a round-robin fashion. nclients = 20 @@ -41,18 +43,20 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): for i in range(5000): conn = connections[i % nclients] conn.commit() - conn.cursor().execute('select * from t1 for key share') + conn.cursor().execute("select * from t1 for key share") # We have multixacts now. We can close the connections. 
for c in connections: c.close() # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") cur.execute( - 'SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()') + "SELECT next_multixact_id, pg_current_wal_insert_lsn() FROM pg_control_checkpoint()" + ) res = cur.fetchone() + assert res is not None next_multixact_id = res[0] lsn = res[1] @@ -60,15 +64,13 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn) - pg_new = env.postgres.create_start('test_multixact_new') + env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) + pg_new = env.postgres.create_start("test_multixact_new") log.info("postgres is running on 'test_multixact_new' branch") - pg_new_conn = pg_new.connect() - cur_new = pg_new_conn.cursor() - - cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint()') - next_multixact_id_new = cur_new.fetchone()[0] + next_multixact_id_new = pg_new.safe_psql( + "SELECT next_multixact_id FROM pg_control_checkpoint()" + )[0][0] # Check that we restored pg_controlfile correctly assert next_multixact_id_new == next_multixact_id diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py new file mode 100644 index 0000000000..d146f78c3a --- /dev/null +++ b/test_runner/regress/test_neon_cli.py @@ -0,0 +1,133 @@ +from typing import cast + +import requests +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, +) +from fixtures.types import TenantId, TimelineId + + +def helper_compare_timeline_list( + pageserver_http_client: PageserverHttpClient, env: NeonEnv, initial_tenant: TenantId +): + """ + Compare timelines list returned by CLI and directly via API. + Filters out timelines created by other tests. 
+ """ + + timelines_api = sorted( + map( + lambda t: TimelineId(t["timeline_id"]), + pageserver_http_client.timeline_list(initial_tenant), + ) + ) + + timelines_cli = env.neon_cli.list_timelines() + assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) + + cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) + assert timelines_api == cli_timeline_ids + + +def test_cli_timeline_list(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + + # Initial sanity check + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Create a branch for us + main_timeline_id = env.neon_cli.create_branch("test_cli_branch_list_main") + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Create a nested branch + nested_timeline_id = env.neon_cli.create_branch( + "test_cli_branch_list_nested", "test_cli_branch_list_main" + ) + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Check that all new branches are visible via CLI + timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] + + assert main_timeline_id in timelines_cli + assert nested_timeline_id in timelines_cli + + +def helper_compare_tenant_list(pageserver_http_client: PageserverHttpClient, env: NeonEnv): + tenants = pageserver_http_client.tenant_list() + tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) + + res = env.neon_cli.list_tenants() + tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) + + assert tenants_api == tenants_cli + + +def test_cli_tenant_list(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + # Initial sanity check + helper_compare_tenant_list(pageserver_http_client, env) + + # Create new tenant + tenant1, _ = env.neon_cli.create_tenant() + + # check tenant1 appeared + helper_compare_tenant_list(pageserver_http_client, env) + + # Create new tenant + tenant2, _ = env.neon_cli.create_tenant() + + # check tenant2 appeared + helper_compare_tenant_list(pageserver_http_client, env) + + res = env.neon_cli.list_tenants() + tenants = sorted(map(lambda t: TenantId(t.split()[0]), res.stdout.splitlines())) + + assert env.initial_tenant in tenants + assert tenant1 in tenants + assert tenant2 in tenants + + +def test_cli_tenant_create(neon_simple_env: NeonEnv): + env = neon_simple_env + tenant_id, _ = env.neon_cli.create_tenant() + timelines = env.neon_cli.list_timelines(tenant_id) + + # an initial timeline should be created upon tenant creation + assert len(timelines) == 1 + assert timelines[0][0] == DEFAULT_BRANCH_NAME + + +def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + # Connect to sk port on v4 loopback + res = requests.get(f"http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status") + assert res.ok + + # FIXME Test setup is using localhost:xx in ps config. + # Perhaps consider switching test suite to v4 loopback. 
+ + # Connect to ps port on v4 loopback + # res = requests.get(f'http://127.0.0.1:{env.pageserver.service_port.http}/v1/status') + # assert res.ok + + +def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + # Stop default ps/sk + env.neon_cli.pageserver_stop() + env.neon_cli.safekeeper_stop() + + # Default start + res = env.neon_cli.raw_cli(["start"]) + res.check_returncode() + + # Default stop + res = env.neon_cli.raw_cli(["stop"]) + res.check_returncode() diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/regress/test_next_xid.py similarity index 67% rename from test_runner/batch_others/test_next_xid.py rename to test_runner/regress/test_next_xid.py index 625abc39d3..698ea0e1d3 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -1,27 +1,22 @@ -import pytest -import random import time -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # Test restarting page server, while safekeeper and compute node keep # running. -def test_next_xid(zenith_env_builder: ZenithEnvBuilder): - # One safekeeper is enough for this test. - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() +def test_next_xid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() - pg = env.postgres.create_start('main') + pg = env.postgres.create_start("main") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE t(x integer)') + cur.execute("CREATE TABLE t(x integer)") iterations = 32 for i in range(1, iterations + 1): - print(f'iteration {i} / {iterations}') + print(f"iteration {i} / {iterations}") # Kill and restart the pageserver. pg.stop() @@ -43,10 +38,10 @@ def test_next_xid(zenith_env_builder: ZenithEnvBuilder): # It's normal that it takes some time for the pageserver to # restart, and for the connection to fail until it does. It # should eventually recover, so retry until it succeeds. 
- print(f'failed: {error}') + print(f"failed: {error}") if retries < max_retries: retries += 1 - print(f'retry {retries} / {max_retries}') + print(f"retry {retries} / {max_retries}") time.sleep(retry_sleep) continue else: @@ -56,4 +51,4 @@ def test_next_xid(zenith_env_builder: ZenithEnvBuilder): conn = pg.connect() cur = conn.cursor() cur.execute("SELECT count(*) FROM t") - assert cur.fetchone() == (iterations, ) + assert cur.fetchone() == (iterations,) diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py new file mode 100644 index 0000000000..73933021a4 --- /dev/null +++ b/test_runner/regress/test_normal_work.py @@ -0,0 +1,52 @@ +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient + + +def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start("main", tenant_id=tenant_id) + # we rely upon autocommit after each statement + res_1 = pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + "SELECT sum(key) FROM t", + ] + ) + + assert res_1[-1][0] == (5000050000,) + # TODO check detach on live instance + log.info("stopping compute") + pg.stop() + log.info("compute stopped") + + pg.start() + res_2 = pg.safe_psql("SELECT sum(key) FROM t") + assert res_2[0] == (5000050000,) + + pg.stop() + pageserver_http.tenant_detach(tenant_id) + + +@pytest.mark.parametrize("num_timelines,num_safekeepers", [(3, 1)]) +def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): + """ + Basic test: + * create new tenant with a timeline + * write some data + * ensure that it was successfully written + * restart compute + * check that the data is there + * stop compute + * detach tenant + + Repeat check for several tenants/timelines. + """ + + neon_env_builder.num_safekeepers = num_safekeepers + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + for _ in range(num_timelines): + check_tenant(env, pageserver_http) diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py similarity index 61% rename from test_runner/batch_others/test_old_request_lsn.py rename to test_runner/regress/test_old_request_lsn.py index d09fb24913..3e387bb6cc 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,7 +1,7 @@ -from contextlib import closing - -from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import TimelineId +from fixtures.utils import print_gc_result, query_scalar # @@ -14,49 +14,56 @@ from fixtures.log_helper import log # just a hint that the page hasn't been modified since that LSN, and the page # server should return the latest page version regardless of the LSN. 
# -def test_old_request_lsn(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_old_request_lsn", "empty") - pg = env.postgres.create_start('test_old_request_lsn') - log.info('postgres is running on test_old_request_lsn branch') +def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): + # Disable pitr, because here we want to test branch creation after GC + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_old_request_lsn", "main") + pg = env.postgres.create_start("test_old_request_lsn") + log.info("postgres is running on test_old_request_lsn branch") pg_conn = pg.connect() cur = pg_conn.cursor() # Get the timeline ID of our branch. We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] + timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - psconn = env.pageserver.connect() - pscur = psconn.cursor() + pageserver_http = env.pageserver.http_client() # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers. - cur.execute('CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (id int4 PRIMARY KEY, val int, t text)") + cur.execute( + """ INSERT INTO foo SELECT g, 1, 'long string to consume some space' || g FROM generate_series(1, 100000) g - ''') + """ + ) # Verify that the table is larger than shared_buffers, so that the SELECT below # will cause GetPage requests. - cur.execute(''' + cur.execute( + """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize from pg_settings where name = 'shared_buffers' - ''') + """ + ) row = cur.fetchone() - log.info(f'shared_buffers is {row[0]}, table size {row[1]}') + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute('VACUUM foo') + cur.execute("VACUUM foo") # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") + gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) + for j in range(100): - cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;') + cur.execute("UPDATE foo SET val = val + 1 WHERE id = 1;") # All (or at least most of) the updates should've been on the same page, so # that we haven't had to evict any dirty pages for a long time. 
Now run diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py new file mode 100644 index 0000000000..ab321eeb02 --- /dev/null +++ b/test_runner/regress/test_pageserver_api.py @@ -0,0 +1,187 @@ +import subprocess +from pathlib import Path +from typing import Optional + +from fixtures.neon_fixtures import ( + DEFAULT_BRANCH_NAME, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until + + +# test that we cannot override node id after init +def test_pageserver_init_node_id( + neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path +): + repo_dir = neon_simple_env.repo_dir + pageserver_config = repo_dir / "pageserver.toml" + pageserver_bin = neon_binpath / "pageserver" + + def run_pageserver(args): + return subprocess.run( + [str(pageserver_bin), "-D", str(repo_dir), *args], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # remove initial config and stop existing pageserver + pageserver_config.unlink() + neon_simple_env.pageserver.stop() + + bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + assert ( + bad_init.returncode == 1 + ), "pageserver should not be able to init new config without the node id" + assert "missing id" in bad_init.stderr + assert not pageserver_config.exists(), "config file should not be created after init error" + + completed_init = run_pageserver( + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + completed_init.returncode == 0 + ), "pageserver should be able to create a new config with the node id given" + assert pageserver_config.exists(), "config file should be created successfully" + + bad_reinit = run_pageserver( + ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] + ) + assert ( + bad_reinit.returncode == 1 + ), "pageserver should not be able to init new config without the node id" + assert "already exists, cannot init it" in bad_reinit.stderr + + bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) + assert bad_update.returncode == 1, "pageserver should not allow updating node id" + assert "has node id already, it cannot be overridden" in bad_update.stderr + + +def check_client(client: PageserverHttpClient, initial_tenant: TenantId): + client.check_status() + + # check initial tenant is there + assert initial_tenant in {TenantId(t["id"]) for t in client.tenant_list()} + + # create new tenant and check it is also there + tenant_id = TenantId.generate() + client.tenant_create(tenant_id) + assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} + + timelines = client.timeline_list(tenant_id) + assert len(timelines) == 0, "initial tenant should not have any timelines" + + # create timeline + timeline_id = TimelineId.generate() + client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + + timelines = client.timeline_list(tenant_id) + assert len(timelines) > 0 + + # check it is there + assert timeline_id in {TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)} + for timeline in timelines: + timeline_id = TimelineId(timeline["timeline_id"]) + timeline_details = client.timeline_detail( + tenant_id=tenant_id, + timeline_id=timeline_id, + include_non_incremental_logical_size=True, + ) + + assert TenantId(timeline_details["tenant_id"]) == tenant_id + assert TimelineId(timeline_details["timeline_id"]) == timeline_id 
+ + +def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() + + timeline_details = client.timeline_detail( + tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True + ) + + assert ( + timeline_details.get("wal_source_connstr") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_lsn") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + assert ( + timeline_details.get("last_received_msg_ts") is None + ), "Should not be able to connect to WAL streaming without PG compute node running" + + +def expect_updated_msg_lsn( + client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + prev_msg_lsn: Optional[Lsn], +) -> Lsn: + timeline_details = client.timeline_detail(tenant_id, timeline_id=timeline_id) + + # a successful `timeline_details` response must contain the below fields + assert "wal_source_connstr" in timeline_details.keys() + assert "last_received_msg_lsn" in timeline_details.keys() + assert "last_received_msg_ts" in timeline_details.keys() + + assert ( + timeline_details["last_received_msg_lsn"] is not None + ), "the last received message's LSN is empty" + + last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) + assert ( + prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn + + +# Test the WAL-receiver related fields in the response to `timeline_details` API call +# +# These fields used to be returned by a separate API call, but they're part of +# `timeline_details` now. +def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) + + # Wait to make sure that we get a latest WAL receiver data. + # We need to wait here because it's possible that we don't have access to + # the latest WAL yet, when the `timeline_detail` API is first called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), + ) + + # Make a DB modification then expect getting a new WAL receiver's data. 
+ pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), + ) + + +def test_pageserver_http_api_client(neon_simple_env: NeonEnv): + env = neon_simple_env + with env.pageserver.http_client() as client: + check_client(client, env.initial_tenant) + + +def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + + management_token = env.auth_keys.generate_management_token() + + with env.pageserver.http_client(auth_token=management_token) as client: + check_client(client, env.initial_tenant) diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py similarity index 65% rename from test_runner/batch_others/test_pageserver_catchup.py rename to test_runner/regress/test_pageserver_catchup.py index 97dc0f3260..cba3203591 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/regress/test_pageserver_catchup.py @@ -1,37 +1,35 @@ -import pytest -import random -import time - -from contextlib import closing -from multiprocessing import Process, Value -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder # Test safekeeper sync and pageserver catch up # while initial compute node is down and pageserver is lagging behind safekeepers. # Ensure that basebackup after restart of all components is correct # and new compute node contains all data. -def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() +def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main") - pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') + env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down") + # Make shared_buffers large to ensure we won't query pageserver while it is down. + pg = env.postgres.create_start( + "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"] + ) pg_conn = pg.connect() cur = pg_conn.cursor() # Create table, and insert some rows. - cur.execute('CREATE TABLE foo (t text)') - cur.execute(''' + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (10000, ) + assert cur.fetchone() == (10000,) # Stop and restart pageserver. 
This is a more or less graceful shutdown, although # the page server doesn't currently have a shutdown routine so there's no difference @@ -40,11 +38,13 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil # insert some more rows # since pageserver is shut down, these will be only on safekeepers - cur.execute(''' + cur.execute( + """ INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 10000) g - ''') + """ + ) # stop safekeepers gracefully env.safekeepers[0].stop() @@ -59,11 +59,11 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil env.safekeepers[2].start() # restart compute node - pg.stop_and_destroy().create_start('test_pageserver_catchup_while_compute_down') + pg.stop_and_destroy().create_start("test_pageserver_catchup_while_compute_down") # Ensure that basebackup went correct and pageserver returned all data pg_conn = pg.connect() cur = pg_conn.cursor() cur.execute("SELECT count(*) FROM foo") - assert cur.fetchone() == (20000, ) + assert cur.fetchone() == (20000,) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py new file mode 100644 index 0000000000..eac5e6e61d --- /dev/null +++ b/test_runner/regress/test_pageserver_restart.py @@ -0,0 +1,126 @@ +from contextlib import closing + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test restarting page server, while safekeeper and compute node keep +# running. +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_pageserver_restart") + pg = env.postgres.create_start("test_pageserver_restart") + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Stop the pageserver gracefully and restart it. + env.pageserver.stop() + env.pageserver.start() + + # Stopping the pageserver breaks the connection from the postgres backend to + # the page server, and causes the next query on the connection to fail. Start a new + # postgres connection too, to avoid that error. (Ideally, the compute node would + # handle that and retry internally, without propagating the error to the user, but + # currently it doesn't...) + pg_conn = pg.connect() + cur = pg_conn.cursor() + + cur.execute("SELECT count(*) FROM foo") + assert cur.fetchone() == (100000,) + + # Stop the page server by force, and restart it + env.pageserver.stop() + env.pageserver.start() + + +# Test that repeatedly kills and restarts the page server, while the +# safekeeper and compute node keep running. 
+@pytest.mark.timeout(540) +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + # Use a tiny checkpoint distance, to create a lot of layers quickly. + # That allows us to stress the compaction and layer flushing logic more. + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) + pg = env.postgres.create_start("test_pageserver_chaos", tenant_id=tenant) + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (id int, t text, updates int)") + cur.execute("CREATE INDEX ON foo (id)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 'long string to consume some space' || g, 0 + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + # Update the whole table, then immediately kill and restart the pageserver + for i in range(1, 15): + pg.safe_psql("UPDATE foo set updates = updates + 1") + + # This kills the pageserver immediately, to simulate a crash + env.pageserver.stop(immediate=True) + env.pageserver.start() + + # Stopping the pageserver breaks the connection from the postgres backend to + # the page server, and causes the next query on the connection to fail. Start a new + # postgres connection too, to avoid that error. (Ideally, the compute node would + # handle that and retry internally, without propagating the error to the user, but + # currently it doesn't...) + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Check that all the updates are visible + num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0] + assert num_updates == i * 100000 diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py similarity index 64% rename from test_runner/batch_others/test_parallel_copy.py rename to test_runner/regress/test_parallel_copy.py index 6f87bc4a36..59f19026cc 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,9 +1,8 @@ -from io import BytesIO import asyncio -import asyncpg -import subprocess -from fixtures.zenith_fixtures import ZenithEnv, Postgres +from io import BytesIO + from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, Postgres async def repeat_bytes(buf, repetitions: int): @@ -15,19 +14,25 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str) buf = BytesIO() for i in range(1000): buf.write( - f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode()) + f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode() + ) buf.seek(0) copy_input = repeat_bytes(buf.read(), 5000) pg_conn = await pg.connect_async() + + # PgProtocol.connect_async sets statement_timeout to 2 minutes. 
+ # That's not enough for this test, on a slow system in debug mode. + await pg_conn.execute("SET statement_timeout='300s'") + await pg_conn.copy_to_table(table_name, source=copy_input) async def parallel_load_same_table(pg: Postgres, n_parallel: int): workers = [] for worker_id in range(n_parallel): - worker = copy_test_data_to_table(pg, worker_id, f'copytest') + worker = copy_test_data_to_table(pg, worker_id, "copytest") workers.append(asyncio.create_task(worker)) # await all workers @@ -35,16 +40,16 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5): - env = zenith_simple_env - env.zenith_cli.create_branch("test_parallel_copy", "empty") - pg = env.postgres.create_start('test_parallel_copy') +def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): + env = neon_simple_env + env.neon_cli.create_branch("test_parallel_copy", "empty") + pg = env.postgres.create_start("test_parallel_copy") log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = pg.connect() cur = conn.cursor() - cur.execute(f'CREATE TABLE copytest (i int, t text)') + cur.execute("CREATE TABLE copytest (i int, t text)") # Run COPY TO to load the table with parallel connections. asyncio.run(parallel_load_same_table(pg, n_parallel)) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py new file mode 100644 index 0000000000..5eb1ebb3de --- /dev/null +++ b/test_runner/regress/test_pg_regress.py @@ -0,0 +1,179 @@ +# +# This file runs pg_regress-based tests. +# +from pathlib import Path + +import pytest +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content + + +# Run the main PostgreSQL regression tests, in src/test/regress. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. +@pytest.mark.timeout(1800) +def test_pg_regress( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): + env = neon_simple_env + + env.neon_cli.create_branch("test_pg_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_pg_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "parallel_schedule" + pg_regress = build_path / "pg_regress" + + pg_regress_command = [ + str(pg_regress), + '--bindir=""', + "--use-existing", + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--schedule={schedule}", + f"--inputdir={src_path}", + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. 
+ with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) + + +# Run the PostgreSQL "isolation" tests, in src/test/isolation. +# +# This runs for a long time, especially in debug mode, so use a larger-than-default +# timeout. +@pytest.mark.timeout(1800) +def test_isolation( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): + env = neon_simple_env + + env.neon_cli.create_branch("test_isolation", "empty") + # Connect to postgres and create a database called "regression". + # isolation tests use prepared transactions, so enable them + pg = env.postgres.create_start("test_isolation", config_lines=["max_prepared_transactions=100"]) + pg.safe_psql("CREATE DATABASE isolation_regression") + + # Create some local directories for pg_isolation_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_isolation_regress will need. + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation" + src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "isolation_schedule" + pg_isolation_regress = build_path / "pg_isolation_regress" + + pg_isolation_regress_command = [ + str(pg_isolation_regress), + "--use-existing", + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--inputdir={src_path}", + f"--schedule={schedule}", + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. + # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + + +# Run extra Neon-specific pg_regress-based tests. The tests and their +# schedule file are in the sql_regress/ directory. +def test_sql_regress( + neon_simple_env: NeonEnv, + test_output_dir: Path, + pg_bin, + capsys, + base_dir: Path, + pg_distrib_dir: Path, +): + env = neon_simple_env + + env.neon_cli.create_branch("test_sql_regress", "empty") + # Connect to postgres and create a database called "regression". + pg = env.postgres.create_start("test_sql_regress") + pg.safe_psql("CREATE DATABASE regression") + + # Create some local directories for pg_regress to run in. + runpath = test_output_dir / "regress" + (runpath / "testtablespace").mkdir(parents=True) + + # Compute all the file locations that pg_regress will need. + # This test runs neon specific tests + build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress" + src_path = base_dir / "test_runner/sql_regress" + bindir = pg_distrib_dir / f"v{env.pg_version}/bin" + schedule = src_path / "parallel_schedule" + pg_regress = build_path / "pg_regress" + + pg_regress_command = [ + str(pg_regress), + "--use-existing", + f"--bindir={bindir}", + f"--dlpath={build_path}", + f"--schedule={schedule}", + f"--inputdir={src_path}", + ] + + env_vars = { + "PGPORT": str(pg.default_options["port"]), + "PGUSER": pg.default_options["user"], + "PGHOST": pg.default_options["host"], + } + + # Run the command. 
+ # We don't capture the output. It's not too chatty, and it always + # logs the exact same data to `regression.out` anyway. + with capsys.disabled(): + pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) + + # checkpoint one more time to ensure that the lsn we get is the latest one + pg.safe_psql("CHECKPOINT") + pg.safe_psql("select pg_current_wal_insert_lsn()")[0][0] + + # Check that we restore the content of the datadir correctly + check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py new file mode 100644 index 0000000000..d8b7256577 --- /dev/null +++ b/test_runner/regress/test_pitr_gc.py @@ -0,0 +1,74 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import TimelineId +from fixtures.utils import print_gc_result, query_scalar + + +# +# Check pitr_interval GC behavior. +# Insert some data, run GC and create a branch in the past. +# +def test_pitr_gc(neon_env_builder: NeonEnvBuilder): + # Set pitr interval such that we need to keep the data + neon_env_builder.pageserver_config_override = ( + "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + ) + + env = neon_env_builder.init_start() + pgmain = env.postgres.create_start("main") + log.info("postgres is running on 'main' branch") + + main_pg_conn = pgmain.connect() + main_cur = main_pg_conn.cursor() + timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id")) + + # Create table + main_cur.execute("CREATE TABLE foo (t text)") + + for i in range(10000): + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space'; + """ + ) + + if i == 99: + # keep some early lsn to test branch creation after GC + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") + res = main_cur.fetchone() + assert res is not None + lsn_a = res[0] + xid_a = res[1] + log.info(f"LSN after 100 rows: {lsn_a} xid {xid_a}") + + main_cur.execute("SELECT pg_current_wal_insert_lsn(), txid_current()") + res = main_cur.fetchone() + assert res is not None + + debug_lsn = res[0] + debug_xid = res[1] + log.info(f"LSN after 10000 rows: {debug_lsn} xid {debug_xid}") + + # run GC + with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_compact(env.initial_tenant, timeline) + # perform aggressive GC. Data still should be kept because of the PITR setting. 
+ gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + print_gc_result(gc_result) + + # Branch at the point where only 100 rows were inserted + # It must have been preserved by PITR setting + env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a) + + pg_hundred = env.postgres.create_start("test_pitr_gc_hundred") + + # On the 'hundred' branch, we should see only 100 rows + hundred_pg_conn = pg_hundred.connect() + hundred_cur = hundred_pg_conn.cursor() + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) + + # All the rows are visible on the main branch + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (10000,) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py new file mode 100644 index 0000000000..b4647ebbe9 --- /dev/null +++ b/test_runner/regress/test_proxy.py @@ -0,0 +1,143 @@ +import json +import subprocess +from urllib.parse import urlparse + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres + + +def test_proxy_select_1(static_proxy): + static_proxy.safe_psql("select 1", options="project=generic-project-name") + + +def test_password_hack(static_proxy): + user = "borat" + password = "password" + static_proxy.safe_psql( + f"create role {user} with login password '{password}'", options="project=irrelevant" + ) + + # Note the format of `magic`! + magic = f"project=irrelevant;{password}" + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + + # Must also check that invalid magic won't be accepted. + with pytest.raises(psycopg2.errors.OperationalError): + magic = "broken" + static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + + +def get_session_id_from_uri_line(uri_prefix, uri_line): + assert uri_prefix in uri_line + + url_parts = urlparse(uri_line) + psql_session_id = url_parts.path[1:] + assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars." + link_auth_uri_prefix = uri_line[: -len(url_parts.path)] + # invariant: the prefix must match the uri_prefix. + assert ( + link_auth_uri_prefix == uri_prefix + ), f"Line='{uri_line}' should contain a http auth link of form '{uri_prefix}/'." + # invariant: the entire link_auth_uri should be on its own line, module spaces. 
+ assert " ".join(uri_line.split(" ")) == f"{uri_prefix}/{psql_session_id}" + + return psql_session_id + + +def create_and_send_db_info(local_vanilla_pg, psql_session_id, mgmt_port): + pg_user = "proxy" + pg_password = "password" + + local_vanilla_pg.start() + query = f"create user {pg_user} with login superuser password '{pg_password}'" + local_vanilla_pg.safe_psql(query) + + port = local_vanilla_pg.default_options["port"] + host = local_vanilla_pg.default_options["host"] + dbname = local_vanilla_pg.default_options["dbname"] + + db_info_dict = { + "session_id": psql_session_id, + "result": { + "Success": { + "host": host, + "port": port, + "dbname": dbname, + "user": pg_user, + "password": pg_password, + } + }, + } + db_info_str = json.dumps(db_info_dict) + cmd_args = [ + "psql", + "-h", + "127.0.0.1", # localhost + "-p", + f"{mgmt_port}", + "-c", + db_info_str, + ] + + log.info(f"Sending to proxy the user and db info: {' '.join(cmd_args)}") + p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE) + out, err = p.communicate() + assert "ok" in str(out) + + +async def get_uri_line_from_process_welcome_notice(link_auth_uri_prefix, proc): + """ + Returns the line from the welcome notice from proc containing link_auth_uri_prefix. + :param link_auth_uri_prefix: the uri prefix used to indicate the line of interest + :param proc: the process to read the welcome message from. + :return: a line containing the full link authentication uri. + """ + max_num_lines_of_welcome_message = 15 + for attempt in range(max_num_lines_of_welcome_message): + raw_line = await proc.stderr.readline() + line = raw_line.decode("utf-8").strip() + if link_auth_uri_prefix in line: + return line + assert False, f"did not find line containing '{link_auth_uri_prefix}'" + + +@pytest.mark.asyncio +async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + """ + Test copied and modified from: test_project_psql_link_auth test from cloud/tests_e2e/tests/test_project.py + Step 1. establish connection to the proxy + Step 2. retrieve session_id: + Step 2.1: read welcome message + Step 2.2: parse session_id + Step 3. create a vanilla_pg and send user and db info via command line (using Popen) a psql query via mgmt port to proxy. + Step 4. assert that select 1 has been executed correctly. + """ + + psql = PSQL( + host=link_proxy.host, + port=link_proxy.proxy_port, + ) + proc = await psql.run("select 42") + + uri_prefix = link_proxy.link_auth_uri_prefix + line_str = await get_uri_line_from_process_welcome_notice(uri_prefix, proc) + + psql_session_id = get_session_id_from_uri_line(uri_prefix, line_str) + log.info(f"Parsed psql_session_id='{psql_session_id}' from Neon welcome message.") + + create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port) + + assert proc.stdout is not None + out = (await proc.stdout.read()).decode("utf-8").strip() + assert out == "42" + + +# Pass extra options to the server. 
+def test_proxy_options(static_proxy): + with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn: + with conn.cursor() as cur: + cur.execute("SHOW proxytest.option") + value = cur.fetchall()[0][0] + assert value == "value" diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py new file mode 100644 index 0000000000..beaae0351b --- /dev/null +++ b/test_runner/regress/test_read_validation.py @@ -0,0 +1,197 @@ +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar +from psycopg2.errors import IoError, UndefinedTable + +pytest_plugins = "fixtures.neon_fixtures" + +extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] + + +# +# Validation of reading different page versions +# +def test_read_validation(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation", "empty") + + pg = env.postgres.create_start("test_read_validation") + log.info("postgres is running on 'test_read_validation' branch") + + with closing(pg.connect()) as con: + with con.cursor() as c: + + for e in extensions: + c.execute("create extension if not exists {};".format(e)) + + c.execute("create table foo (c int) with (autovacuum_enabled = false)") + c.execute("insert into foo values (1)") + + c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") + first = c.fetchone() + assert first is not None + + relfilenode = query_scalar(c, "select relfilenode from pg_class where relname = 'foo'") + + c.execute("insert into foo values (2);") + c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));") + second = c.fetchone() + + assert first != second, "Failed to update page" + + log.info("Test table is populated, validating buffer cache") + + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) + assert cache_entries > 0, "No buffers cached for the test relation" + + c.execute( + "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( + relfilenode + ) + ) + reln = c.fetchone() + assert reln is not None + + log.info("Clear buffer cache to ensure no stale pages are brought into the cache") + + c.execute("select clear_buffer_cache()") + + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) + assert cache_entries == 0, "Failed to clear buffer cache" + + log.info("Cache is clear, reading stale page version") + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( + first[0] + ) + ) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn" + + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) + assert cache_entries == 0, "relation buffers detected after invalidation" + + log.info("Cache is clear, reading latest page version without cache") + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + ) + direct_latest = c.fetchone() + assert second == direct_latest, "Failed fetch page at latest lsn" + + cache_entries = query_scalar( + c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + ) + assert cache_entries == 0, "relation 
buffers detected after invalidation" + + log.info( + "Cache is clear, reading stale page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn using oid" + + log.info( + "Cache is clear, reading latest page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( + reln[0], reln[1], reln[2] + ) + ) + direct_latest = c.fetchone() + assert second == direct_latest, "Failed fetch page at latest lsn" + + c.execute("drop table foo;") + + log.info( + "Relation dropped, attempting reading stale page version without cache using relation identifiers" + ) + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( + reln[0], reln[1], reln[2], first[0] + ) + ) + direct_first = c.fetchone() + assert first == direct_first, "Failed fetch page at historic lsn using oid" + + log.info("Validation page inspect won't allow reading pages of dropped relations") + try: + c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") + assert False, "query should have failed" + except UndefinedTable as e: + log.info("Caught an expected failure: {}".format(e)) + + +def test_read_validation_neg(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation_neg", "empty") + + pg = env.postgres.create_start("test_read_validation_neg") + log.info("postgres is running on 'test_read_validation_neg' branch") + + with closing(pg.connect()) as con: + with con.cursor() as c: + + for e in extensions: + c.execute("create extension if not exists {};".format(e)) + + log.info("read a page of a missing relation") + try: + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + ) + assert False, "query should have failed" + except UndefinedTable as e: + log.info("Caught an expected failure: {}".format(e)) + + c.execute("create table foo (c int) with (autovacuum_enabled = false)") + c.execute("insert into foo values (1)") + + log.info("read a page at lsn 0") + try: + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + ) + assert False, "query should have failed" + except IoError as e: + log.info("Caught an expected failure: {}".format(e)) + + log.info("Pass NULL as an input") + expected = (None, None, None) + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + ) + assert c.fetchone() == expected, "Expected null output" + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + ) + assert c.fetchone() == expected, "Expected null output" + + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + ) + assert c.fetchone() == expected, "Expected null output" + + # This check is currently failing, reading beyond EOF is returning a 0-page + log.info("Read beyond EOF") + c.execute( + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + ) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py new file mode 100644 index 
0000000000..dfa57aec25 --- /dev/null +++ b/test_runner/regress/test_readonly_node.py @@ -0,0 +1,153 @@ +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +# +# Create read-only compute nodes, anchored at historical points in time. +# +# This is very similar to the 'test_branch_behind' test, but instead of +# creating branches, creates read-only nodes. +# +def test_readonly_node(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_readonly_node", "empty") + pgmain = env.postgres.create_start("test_readonly_node") + log.info("postgres is running on 'test_readonly_node' branch") + + main_pg_conn = pgmain.connect() + main_cur = main_pg_conn.cursor() + + # Create table, and insert the first 100 rows + main_cur.execute("CREATE TABLE foo (t text)") + + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + main_cur.execute("SELECT pg_current_wal_insert_lsn()") + lsn_a = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 100 rows: " + lsn_a) + + # Insert some more rows. (This generates enough WAL to fill a few segments.) + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + """ + ) + lsn_b = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 200100 rows: " + lsn_b) + + # Insert many more rows. This generates enough WAL to fill a few segments. + main_cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 200000) g + """ + ) + + lsn_c = query_scalar(main_cur, "SELECT pg_current_wal_insert_lsn()") + log.info("LSN after 400100 rows: " + lsn_c) + + # Create first read-only node at the point where only 100 rows were inserted + pg_hundred = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_hundred", lsn=lsn_a + ) + + # And another at the point where 200100 rows were inserted + pg_more = env.postgres.create_start( + branch_name="test_readonly_node", node_name="test_readonly_node_more", lsn=lsn_b + ) + + # On the 'hundred' node, we should see only 100 rows + hundred_pg_conn = pg_hundred.connect() + hundred_cur = hundred_pg_conn.cursor() + hundred_cur.execute("SELECT count(*) FROM foo") + assert hundred_cur.fetchone() == (100,) + + # On the 'more' node, we should see 100200 rows + more_pg_conn = pg_more.connect() + more_cur = more_pg_conn.cursor() + more_cur.execute("SELECT count(*) FROM foo") + assert more_cur.fetchone() == (200100,) + + # All the rows are visible on the main branch + main_cur.execute("SELECT count(*) FROM foo") + assert main_cur.fetchone() == (400100,) + + # Check creating a node at segment boundary + pg = env.postgres.create_start( + branch_name="test_readonly_node", + node_name="test_branch_segment_boundary", + lsn=Lsn("0/3000000"), + ) + cur = pg.connect().cursor() + cur.execute("SELECT 1") + assert cur.fetchone() == (1,) + + # Create node at pre-initdb lsn + with pytest.raises(Exception, match="invalid basebackup lsn"): + # compute node startup with invalid LSN should fail + env.postgres.create_start( + branch_name="test_readonly_node", + node_name="test_readonly_node_preinitdb", + lsn=Lsn("0/42"), + ) + + +# Similar test, but with more data, and we force checkpoints +def 
test_timetravel(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http_client = env.pageserver.http_client() + env.neon_cli.create_branch("test_timetravel", "empty") + pg = env.postgres.create_start("test_timetravel") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + with pg.cursor() as cur: + cur.execute( + """ + CREATE TABLE testtab(id serial primary key, iteration int, data text); + INSERT INTO testtab (iteration, data) SELECT 0, 'data' FROM generate_series(1, 100000); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((0, current_lsn)) + + for i in range(1, 5): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET iteration = {i}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((i, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to force a new layer file + pageserver_http_client.timeline_checkpoint(tenant_id, timeline_id) + + ##### Restart pageserver + env.postgres.stop_all() + env.pageserver.stop() + env.pageserver.start() + + for (i, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="test_timetravel", node_name=f"test_old_lsn_{i}", lsn=lsn + ) + with pg_old.cursor() as cur: + assert query_scalar(cur, f"select count(*) from testtab where iteration={i}") == 100000 + assert query_scalar(cur, f"select count(*) from testtab where iteration<>{i}") == 0 diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py new file mode 100644 index 0000000000..e70b1351ba --- /dev/null +++ b/test_runner/regress/test_recovery.py @@ -0,0 +1,58 @@ +import time +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Test pageserver recovery after crash +# +def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): + # Override default checkpointer settings to run it more often + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" + + env = neon_env_builder.init() + env.pageserver.is_testing_enabled_or_skip() + + neon_env_builder.start() + + # Create a branch for us + env.neon_cli.create_branch("test_pageserver_recovery", "main") + + pg = env.postgres.create_start("test_pageserver_recovery") + log.info("postgres is running on 'test_pageserver_recovery' branch") + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + with env.pageserver.http_client() as pageserver_http: + # Create and initialize test table + cur.execute("CREATE TABLE foo(x bigint)") + cur.execute("INSERT INTO foo VALUES (generate_series(1,100000))") + + # Sleep for some time to let checkpoint create image layers + time.sleep(2) + + # Configure failpoints + pageserver_http.configure_failpoints( + [ + ("flush-frozen-before-sync", "sleep(2000)"), + ("checkpoint-after-sync", "exit"), + ] + ) + + # Do some updates until pageserver is crashed + try: + while True: + cur.execute("update foo set x=x+1") + except Exception as err: + log.info(f"Expected server crash {err}") + + log.info("Wait before server restart") + env.pageserver.stop() + env.pageserver.start() + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select count(*) from foo") + assert cur.fetchone() == (100000,) diff --git 
a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py new file mode 100644 index 0000000000..4fb5a5406d --- /dev/null +++ b/test_runner/regress/test_remote_storage.py @@ -0,0 +1,143 @@ +# It's possible to run any regular test with the local fs remote storage via +# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... + +import os +import shutil +import time +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_no_in_progress_downloads_for_tenant, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar, wait_until + + +# +# Tests that a piece of data is backed up and restored correctly: +# +# 1. Initial pageserver +# * starts a pageserver with remote storage, stores specific data in its tables +# * triggers a checkpoint (which produces a local data scheduled for backup), gets the corresponding timeline id +# * polls the timeline status to ensure it's copied remotely +# * inserts more data in the pageserver and repeats the process, to check multiple checkpoints case +# * stops the pageserver, clears all local directories +# +# 2. Second pageserver +# * starts another pageserver, connected to the same remote storage +# * timeline_attach is called for the same timeline id +# * timeline status is polled until it's downloaded +# * queries the specific data, ensuring that it matches the one stored before +# +# The tests are done for all types of remote storage pageserver supports. +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_remote_storage_backup_and_restore( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, + # and this test needs SK to write data to pageserver, so it will be visible + neon_env_builder.safekeepers_id_start = 12 + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_storage_backup_and_restore", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + checkpoint_numbers = range(1, 3) + + for checkpoint_number in checkpoint_numbers: + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info(f"waiting for checkpoint {checkpoint_number} upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) 
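+        # NB: `client` and `pageserver_http` are both created via
+        # env.pageserver.http_client() above, so they talk to the same
+        # pageserver HTTP API; either handle would work for these calls.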
+ log.info(f"upload of checkpoint {checkpoint_number} is done") + + ##### Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + env.pageserver.stop() + + dir_to_clear = Path(env.repo_dir) / "tenants" + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + # Introduce failpoint in download + pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) + + client.tenant_attach(tenant_id) + + # is there a better way to assert that failpoint triggered? + time.sleep(10) + + # assert cannot attach timeline that is scheduled for download + with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): + client.tenant_attach(tenant_id) + + tenant_status = client.tenant_status(tenant_id) + log.info("Tenant status with active failpoint: %s", tenant_status) + assert tenant_status["has_in_progress_downloads"] is True + + # trigger temporary download files removal + env.pageserver.stop() + env.pageserver.start() + + client.tenant_attach(tenant_id) + + log.info("waiting for timeline redownload") + wait_until( + number_of_iterations=20, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + ) + + detail = client.timeline_detail(tenant_id, timeline_id) + log.info("Timeline detail after attach completed: %s", detail) + assert ( + Lsn(detail["last_record_lsn"]) >= current_lsn + ), "current db Lsn should should not be less than the one stored on remote storage" + assert not detail["awaits_download"] + + pg = env.postgres.create_start("main") + with pg.cursor() as cur: + for checkpoint_number in checkpoint_numbers: + assert ( + query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") + == f"{data_secret}|{checkpoint_number}" + ) diff --git a/test_runner/regress/test_setup.py b/test_runner/regress/test_setup.py new file mode 100644 index 0000000000..3d1471621b --- /dev/null +++ b/test_runner/regress/test_setup.py @@ -0,0 +1,17 @@ +"""Tests for the code in test fixtures""" + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that pageserver and safekeeper can restart quickly. +# This is a regression test, see https://github.com/neondatabase/neon/issues/2247 +def test_fixture_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + for i in range(3): + env.pageserver.stop() + env.pageserver.start() + + for i in range(3): + env.safekeepers[0].stop() + env.safekeepers[0].start() diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/regress/test_subxacts.py similarity index 51% rename from test_runner/batch_others/test_subxacts.py rename to test_runner/regress/test_subxacts.py index bed1c4be63..42234bf535 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,38 +1,40 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content # Test subtransactions # # The pg_subxact SLRU is not preserved on restarts, and doesn't need to be # maintained in the pageserver, so subtransactions are not very exciting for -# Zenith. They are included in the commit record though and updated in the +# Neon. They are included in the commit record though and updated in the # CLOG. 
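+# What matters here is that a datadir restored from the pageserver matches the
+# original one even after a workload that burns through many subtransaction
+# XIDs (see check_restored_datadir_content at the end of the test).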
-def test_subxacts(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch("test_subxacts", "empty") - pg = env.postgres.create_start('test_subxacts') +def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch("test_subxacts", "empty") + pg = env.postgres.create_start("test_subxacts") log.info("postgres is running on 'test_subxacts' branch") pg_conn = pg.connect() cur = pg_conn.cursor() - cur.execute(''' + cur.execute( + """ CREATE TABLE t1(i int, j int); - ''') + """ + ) - cur.execute('select pg_switch_wal();') + cur.execute("select pg_switch_wal();") # Issue 100 transactions, with 1000 subtransactions in each. for i in range(100): - cur.execute('begin') + cur.execute("begin") for j in range(1000): - cur.execute(f'savepoint sp{j}') - cur.execute(f'insert into t1 values ({i}, {j})') - cur.execute('commit') + cur.execute(f"savepoint sp{j}") + cur.execute(f"insert into t1 values ({i}, {j})") + cur.execute("commit") # force wal flush - cur.execute('checkpoint') + cur.execute("checkpoint") # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py new file mode 100644 index 0000000000..46a945a58b --- /dev/null +++ b/test_runner/regress/test_tenant_conf.py @@ -0,0 +1,135 @@ +from contextlib import closing + +import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_tenant_config(neon_env_builder: NeonEnvBuilder): + """Test per tenant configuration""" + # set some non-default global config + neon_env_builder.pageserver_config_override = """ +page_cache_size=444; +wait_lsn_timeout='111 s'; +tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" + + env = neon_env_builder.init_start() + + # Check that we raise on misspelled configs + invalid_conf_key = "some_invalid_setting_name_blah_blah_123" + try: + env.neon_cli.create_tenant( + conf={ + invalid_conf_key: "20000", + } + ) + except Exception as e: + assert invalid_conf_key in str(e) + else: + raise AssertionError("Expected validation error") + + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "20000", + "gc_period": "30sec", + } + ) + + env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) + env.postgres.create_start( + "test_tenant_conf", + "main", + tenant, + ) + + # check the configuration of the default tenant + # it should match global configuration + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + log.info(f"show {env.initial_tenant}") + pscur.execute(f"show {env.initial_tenant}") + res = pscur.fetchone() + assert all( + i in res.items() + for i in { + "checkpoint_distance": 10000, + "compaction_target_size": 1048576, + "compaction_period": 20, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 100, + "image_creation_threshold": 3, + "pitr_interval": 2592000, + }.items() + ) + + # check the configuration of the new tenant + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + pscur.execute(f"show {tenant}") + res = pscur.fetchone() + log.info(f"res: {res}") + assert all( + i in res.items() + for i in { + "checkpoint_distance": 20000, + "compaction_target_size": 1048576, + 
"compaction_period": 20, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 30, + "image_creation_threshold": 3, + "pitr_interval": 2592000, + }.items() + ) + + # update the config and ensure that it has changed + env.neon_cli.config_tenant( + tenant_id=tenant, + conf={ + "checkpoint_distance": "15000", + "gc_period": "80sec", + }, + ) + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + pscur.execute(f"show {tenant}") + res = pscur.fetchone() + log.info(f"after config res: {res}") + assert all( + i in res.items() + for i in { + "checkpoint_distance": 15000, + "compaction_target_size": 1048576, + "compaction_period": 20, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 80, + "image_creation_threshold": 3, + "pitr_interval": 2592000, + }.items() + ) + + # restart the pageserver and ensure that the config is still correct + env.pageserver.stop() + env.pageserver.start() + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + pscur.execute(f"show {tenant}") + res = pscur.fetchone() + log.info(f"after restart res: {res}") + assert all( + i in res.items() + for i in { + "checkpoint_distance": 15000, + "compaction_target_size": 1048576, + "compaction_period": 20, + "compaction_threshold": 10, + "gc_horizon": 67108864, + "gc_period": 80, + "image_creation_threshold": 3, + "pitr_interval": 2592000, + }.items() + ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py new file mode 100644 index 0000000000..dc4cd2e37e --- /dev/null +++ b/test_runner/regress/test_tenant_detach.py @@ -0,0 +1,79 @@ +from threading import Thread + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PageserverHttpClient +from fixtures.types import TenantId, TimelineId + + +def do_gc_target( + pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" + try: + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) + except Exception as e: + log.error("do_gc failed: %s", e) + + +def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # first check for non existing tenant + tenant_id = TenantId.generate() + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Tenant not found for id {tenant_id}", + ): + pageserver_http.tenant_detach(tenant_id) + + # create new nenant + tenant_id, timeline_id = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + # we rely upon autocommit after each statement + pg.safe_psql_many( + queries=[ + "CREATE TABLE t(key int primary key, value text)", + "INSERT INTO t SELECT generate_series(1,100000), 'payload'", + ] + ) + + # gc should not try to even start + with pytest.raises( + expected_exception=PageserverApiException, match="gc target timeline does not exist" + ): + bogus_timeline_id = TimelineId.generate() + pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) + + # try to concurrently run gc and detach + gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) + 
gc_thread.start() + + last_error = None + for i in range(3): + try: + pageserver_http.tenant_detach(tenant_id) + except Exception as e: + last_error = e + log.error(f"try {i} error detaching tenant: {e}") + continue + else: + break + # else is called if the loop finished without reaching "break" + else: + pytest.fail(f"could not detach tenant: {last_error}") + + gc_thread.join(timeout=10) + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + + with pytest.raises( + expected_exception=PageserverApiException, match=f"Tenant {tenant_id} not found" + ): + pageserver_http.timeline_gc(tenant_id, timeline_id, 0) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py new file mode 100644 index 0000000000..aec45307f7 --- /dev/null +++ b/test_runner/regress/test_tenant_relocation.py @@ -0,0 +1,493 @@ +import os +import threading +from contextlib import closing, contextmanager +from pathlib import Path +from typing import Any, Dict, Optional, Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + PortDistributor, + Postgres, + assert_no_in_progress_downloads_for_tenant, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until + + +def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): + assert abs(a - b) / a < margin_ratio, abs(a - b) / a + + +@contextmanager +def new_pageserver_service( + new_pageserver_dir: Path, + pageserver_bin: Path, + remote_storage_mock_path: Path, + pg_port: int, + http_port: int, + broker: Optional[Etcd], + pg_distrib_dir: Path, +): + """ + cannot use NeonPageserver yet because it depends on neon cli + which currently lacks support for multiple pageservers + """ + # actually run new pageserver + cmd = [ + str(pageserver_bin), + "--workdir", + str(new_pageserver_dir), + "--update-config", + f"-c listen_pg_addr='localhost:{pg_port}'", + f"-c listen_http_addr='localhost:{http_port}'", + f"-c pg_distrib_dir='{pg_distrib_dir}'", + "-c id=2", + f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}", + ] + if broker is not None: + cmd.append( + f"-c broker_endpoints=['{broker.client_url()}']", + ) + pageserver_client = PageserverHttpClient( + port=http_port, + auth_token=None, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + ) + try: + pageserver_process = start_in_background( + cmd, new_pageserver_dir, "pageserver.log", pageserver_client.check_status + ) + except Exception as e: + log.error(e) + pageserver_process.kill() + raise Exception(f"Failed to start pageserver as {cmd}, reason: {e}") + + log.info("new pageserver started") + try: + yield pageserver_process + finally: + log.info("stopping new pageserver") + pageserver_process.kill() + + +@contextmanager +def pg_cur(pg): + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + yield cur + + +def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Event): + log.info("load started") + + inserted_ctr = 0 + failed = False + while not stop_event.is_set(): + try: + with pg_cur(pg) as cur: + cur.execute("INSERT INTO load VALUES ('some payload')") + inserted_ctr += 1 + except: # noqa: E722 + if not failed: + log.info("load failed") + failed = True + load_ok_event.clear() + 
else: + if failed: + with pg_cur(pg) as cur: + # if we recovered after failure verify that we have correct number of rows + log.info("recovering at %s", inserted_ctr) + cur.execute("SELECT count(*) FROM load") + # it seems that sometimes transaction gets committed before we can acknowledge + # the result, so sometimes selected value is larger by one than we expect + assert cur.fetchone()[0] - inserted_ctr <= 1 + log.info("successfully recovered %s", inserted_ctr) + failed = False + load_ok_event.set() + log.info("load thread stopped") + + +def populate_branch( + pg: Postgres, + tenant_id: TenantId, + ps_http: PageserverHttpClient, + create_table: bool, + expected_sum: Optional[int], +) -> Tuple[TimelineId, Lsn]: + # insert some data + with pg_cur(pg) as cur: + cur.execute("SHOW neon.timeline_id") + timeline_id = TimelineId(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline_id) + + log.info( + "pg_current_wal_flush_lsn(): %s", + Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")), + ) + log.info( + "timeline detail %s", + ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id), + ) + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + if create_table: + cur.execute("CREATE TABLE t(key int, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") + if expected_sum is not None: + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (expected_sum,) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + return timeline_id, current_lsn + + +def ensure_checkpoint( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + current_lsn: Lsn, +): + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +def check_timeline_attached( + new_pageserver_http_client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + old_timeline_detail: Dict[str, Any], + old_current_lsn: Lsn, +): + # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint + new_timeline_detail = new_pageserver_http_client.timeline_detail(tenant_id, timeline_id) + + # when load is active these checks can break because lsns are not static + # so let's check with some margin + assert_abs_margin_ratio( + int(Lsn(new_timeline_detail["disk_consistent_lsn"])), + int(Lsn(old_timeline_detail["disk_consistent_lsn"])), + 0.03, + ) + + assert_abs_margin_ratio( + int(Lsn(new_timeline_detail["disk_consistent_lsn"])), int(old_current_lsn), 0.03 + ) + + +def switch_pg_to_new_pageserver( + env: NeonEnv, + pg: Postgres, + new_pageserver_port: int, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Path: + pg.stop() + + pg_config_file_path = Path(pg.config_file_path()) + pg_config_file_path.open("a").write( + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" + ) + + pg.start() + + timeline_to_detach_local_path = ( + env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + ) + files_before_detach = os.listdir(timeline_to_detach_local_path) + assert ( + "metadata" in files_before_detach + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but 
got: {files_before_detach}" + assert ( + len(files_before_detach) >= 2 + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}" + + return timeline_to_detach_local_path + + +def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: Path): + with pg_cur(pg) as cur: + # check that data is still there + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (sum_before_migration,) + # check that we can write new data + cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (sum_before_migration + 1500500,) + + assert not os.path.exists( + old_local_path + ), f"After detach, local timeline dir {old_local_path} should be removed" + + +@pytest.mark.parametrize( + "method", + [ + # A minor migration involves no storage breaking changes. + # It is done by attaching the tenant to a new pageserver. + "minor", + # A major migration involves exporting a postgres datadir + # basebackup and importing it into the new pageserver. + # This kind of migration can tolerate breaking changes + # to storage format + "major", + ], +) +@pytest.mark.parametrize("with_load", ["with_load", "without_load"]) +def test_tenant_relocation( + neon_env_builder: NeonEnvBuilder, + port_distributor: PortDistributor, + test_output_dir: Path, + neon_binpath: Path, + base_dir: Path, + method: str, + with_load: str, +): + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + + # create folder for remote storage mock + remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" + + # we use two branches to check that they are both relocated + # first branch is used for load, compute for second one is used to + # check that data is not lost + + pageserver_http = env.pageserver.http_client() + + tenant_id, initial_timeline_id = env.neon_cli.create_tenant( + TenantId("74ee8b079a0e437eb0afea7d26a07209") + ) + log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) + + env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) + pg_main = env.postgres.create_start( + branch_name="test_tenant_relocation_main", tenant_id=tenant_id + ) + + timeline_id_main, current_lsn_main = populate_branch( + pg_main, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=True, + expected_sum=500500, + ) + + env.neon_cli.create_branch( + new_branch_name="test_tenant_relocation_second", + ancestor_branch_name="test_tenant_relocation_main", + ancestor_start_lsn=current_lsn_main, + tenant_id=tenant_id, + ) + pg_second = env.postgres.create_start( + branch_name="test_tenant_relocation_second", tenant_id=tenant_id + ) + + timeline_id_second, current_lsn_second = populate_branch( + pg_second, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=False, + expected_sum=1001000, + ) + + # wait until pageserver receives that data + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) + timeline_detail_main = pageserver_http.timeline_detail(tenant_id, timeline_id_main) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) + timeline_detail_second = pageserver_http.timeline_detail(tenant_id, timeline_id_second) + + if with_load == "with_load": + # create load table + with pg_cur(pg_main) as cur: + cur.execute("CREATE TABLE load(value text)") + + load_stop_event = 
threading.Event() + load_ok_event = threading.Event() + load_thread = threading.Thread( + target=load, + args=(pg_main, load_stop_event, load_ok_event), + daemon=True, # To make sure the child dies when the parent errors + ) + load_thread.start() + + # this requirement introduces a problem + # if user creates a branch during migration + # it wont appear on the new pageserver + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) + + ensure_checkpoint( + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) + + log.info("inititalizing new pageserver") + # bootstrap second pageserver + new_pageserver_dir = env.repo_dir / "new_pageserver" + new_pageserver_dir.mkdir() + + new_pageserver_pg_port = port_distributor.get_port() + new_pageserver_http_port = port_distributor.get_port() + log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) + pageserver_bin = neon_binpath / "pageserver" + + new_pageserver_http = PageserverHttpClient( + port=new_pageserver_http_port, + auth_token=None, + is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip, + ) + + with new_pageserver_service( + new_pageserver_dir, + pageserver_bin, + remote_storage_mock_path, + new_pageserver_pg_port, + new_pageserver_http_port, + neon_env_builder.broker, + neon_env_builder.pg_distrib_dir, + ): + + # Migrate either by attaching from s3 or import/export basebackup + if method == "major": + cmd = [ + "poetry", + "run", + "python", + str(base_dir / "scripts/export_import_between_pageservers.py"), + "--tenant-id", + str(tenant_id), + "--from-host", + "localhost", + "--from-http-port", + str(pageserver_http.port), + "--from-pg-port", + str(env.pageserver.service_port.pg), + "--to-host", + "localhost", + "--to-http-port", + str(new_pageserver_http_port), + "--to-pg-port", + str(new_pageserver_pg_port), + "--pg-distrib-dir", + str(neon_env_builder.pg_distrib_dir), + "--work-dir", + str(test_output_dir), + "--tmp-pg-port", + str(port_distributor.get_port()), + ] + subprocess_capture(test_output_dir, cmd, check=True) + elif method == "minor": + # call to attach timeline to new pageserver + new_pageserver_http.tenant_attach(tenant_id) + + # check that it shows that download is in progress + tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) + assert tenant_status.get("has_in_progress_downloads"), tenant_status + + # wait until tenant is downloaded + wait_until( + number_of_iterations=10, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant( + new_pageserver_http, tenant_id + ), + ) + + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_main, + timeline_detail_main, + current_lsn_main, + ) + + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_second, + timeline_detail_second, + current_lsn_second, + ) + + # rewrite neon cli config to use new pageserver for basebackup to start new compute + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() + cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" + cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) + + old_local_path_main = switch_pg_to_new_pageserver( + env, + pg_main, + new_pageserver_pg_port, + tenant_id, + timeline_id_main, + ) + + 
old_local_path_second = switch_pg_to_new_pageserver( + env, + pg_second, + new_pageserver_pg_port, + tenant_id, + timeline_id_second, + ) + + # detach tenant from old pageserver before we check + # that all the data is there to be sure that old pageserver + # is no longer involved, and if it is, we will see the errors + pageserver_http.tenant_detach(tenant_id) + + post_migration_check(pg_main, 500500, old_local_path_main) + post_migration_check(pg_second, 1001000, old_local_path_second) + + # ensure that we can successfully read all relations on the new pageserver + with pg_cur(pg_second) as cur: + cur.execute( + """ + DO $$ + DECLARE + r RECORD; + BEGIN + FOR r IN + SELECT relname FROM pg_class WHERE relkind='r' + LOOP + RAISE NOTICE '%', r.relname; + EXECUTE 'SELECT count(*) FROM quote_ident($1)' USING r.relname; + END LOOP; + END$$; + """ + ) + + if with_load == "with_load": + assert load_ok_event.wait(3) + log.info("stopping load thread") + load_stop_event.set() + load_thread.join(timeout=10) + log.info("load thread stopped") + + # bring old pageserver back for clean shutdown via neon cli + # new pageserver will be shut down by the context manager + cli_config_lines = (env.repo_dir / "config").read_text().splitlines() + cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" + cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'" + (env.repo_dir / "config").write_text("\n".join(cli_config_lines)) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py new file mode 100644 index 0000000000..03e7129ff7 --- /dev/null +++ b/test_runner/regress/test_tenant_size.py @@ -0,0 +1,280 @@ +import time +from typing import List, Tuple + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverApiException, + wait_for_last_flush_lsn, +) +from fixtures.types import Lsn + + +def test_empty_tenant_size(neon_simple_env: NeonEnv): + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + size = http_client.tenant_size(tenant_id) + + # we should never have zero, because there should be the initdb however + # this is questionable if we should have anything in this case, as the + # gc_cutoff is negative + assert ( + size == 0 + ), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon" + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 + size = http_client.tenant_size(tenant_id) + assert size == 0, "starting idle compute should not change the tenant size" + + # the size should be the same, until we increase the size over the + # gc_horizon + size = http_client.tenant_size(tenant_id) + assert size == 0, "tenant_size should not be affected by shutdown of compute" + + +def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): + """ + Operate on single branch reading the tenants size after each transaction. + """ + + # gc and compaction is not wanted automatically + # the pitr_interval here is quite problematic, so we cannot really use it. + # it'd have to be calibrated per test executing env. + + # there was a bug which was hidden if the create table and first batch of + # inserts is larger than gc_horizon. 
for example 0x20000 here hid the fact + # that there next_gc_cutoff could be smaller than initdb_lsn, which will + # obviously lead to issues when calculating the size. + gc_horizon = 0x30000 + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + + http_client = env.pageserver.http_client() + + collected_responses: List[Tuple[Lsn, int]] = [] + + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + + batch_size = 100 + + i = 0 + while True: + with pg.cursor() as cur: + cur.execute( + f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", + (i, i), + ) + + i += 1 + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + + if len(collected_responses) > 0: + prev = collected_responses[-1][1] + if size == 0: + assert prev == 0 + else: + assert size > prev + + collected_responses.append((current_lsn, size)) + + if len(collected_responses) > 2: + break + + while True: + with pg.cursor() as cur: + cur.execute( + f"UPDATE t0 SET i = -i WHERE i IN (SELECT i FROM t0 WHERE i > 0 LIMIT {batch_size})" + ) + updated = cur.rowcount + + if updated == 0: + break + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert size > prev, "tenant_size should grow with updates" + collected_responses.append((current_lsn, size)) + + while True: + with pg.cursor() as cur: + cur.execute(f"DELETE FROM t0 WHERE i IN (SELECT i FROM t0 LIMIT {batch_size})") + deleted = cur.rowcount + + if deleted == 0: + break + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert ( + size > prev + ), "even though rows have been deleted, the tenant_size should increase" + collected_responses.append((current_lsn, size)) + + with pg.cursor() as cur: + cur.execute("DROP TABLE t0") + + current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + size = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + assert size > prev, "dropping table grows tenant_size" + collected_responses.append((current_lsn, size)) + + # this isn't too many lines to forget for a while. observed while + # developing these tests that locally the value is a bit more than what we + # get in the ci. + for lsn, size in collected_responses: + log.info(f"collected: {lsn}, {size}") + + env.pageserver.stop() + env.pageserver.start() + + size_after = http_client.tenant_size(tenant_id) + prev = collected_responses[-1][1] + + assert size_after == prev, "size after restarting pageserver should not have changed" + + +def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder): + """ + Reported size goes up while branches or rows are being added, goes down after removing branches. 
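+    Size samples are taken through http_client.tenant_size(); whenever new rows
+    are written, the test first calls wait_for_last_flush_lsn() so the pageserver
+    has ingested the WAL before the size is compared.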
+ """ + + gc_horizon = 128 * 1024 + + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='1h', gc_period='1h', pitr_interval='0sec', gc_horizon={gc_horizon}}}" + + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + + http_client = env.pageserver.http_client() + + main_pg = env.postgres.create_start(main_branch_name, tenant_id=tenant_id) + + batch_size = 10000 + + with main_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) + size_at_branch = http_client.tenant_size(tenant_id) + assert size_at_branch > 0 + + first_branch_timeline_id = env.neon_cli.create_branch( + "first-branch", main_branch_name, tenant_id + ) + + # unsure why this happens, the size difference is more than a page alignment + size_after_first_branch = http_client.tenant_size(tenant_id) + assert size_after_first_branch > size_at_branch + assert size_after_first_branch - size_at_branch == gc_horizon + + first_branch_pg = env.postgres.create_start("first-branch", tenant_id=tenant_id) + + with first_branch_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, {batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, first_branch_pg, tenant_id, first_branch_timeline_id) + size_after_growing_first_branch = http_client.tenant_size(tenant_id) + assert size_after_growing_first_branch > size_after_first_branch + + with main_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 2*{batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, main_pg, tenant_id, main_timeline_id) + size_after_continuing_on_main = http_client.tenant_size(tenant_id) + assert size_after_continuing_on_main > size_after_growing_first_branch + + second_branch_timeline_id = env.neon_cli.create_branch( + "second-branch", main_branch_name, tenant_id + ) + size_after_second_branch = http_client.tenant_size(tenant_id) + assert size_after_second_branch > size_after_continuing_on_main + + second_branch_pg = env.postgres.create_start("second-branch", tenant_id=tenant_id) + + with second_branch_pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 3*{batch_size}) s(i)" + ) + + wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + size_after_growing_second_branch = http_client.tenant_size(tenant_id) + assert size_after_growing_second_branch > size_after_second_branch + + with second_branch_pg.cursor() as cur: + cur.execute("DROP TABLE t0") + cur.execute("DROP TABLE t1") + cur.execute("VACUUM FULL") + + wait_for_last_flush_lsn(env, second_branch_pg, tenant_id, second_branch_timeline_id) + size_after_thinning_branch = http_client.tenant_size(tenant_id) + assert ( + size_after_thinning_branch > size_after_growing_second_branch + ), "tenant_size should grow with dropped tables and full vacuum" + + first_branch_pg.stop_and_destroy() + second_branch_pg.stop_and_destroy() + main_pg.stop() + env.pageserver.stop() + env.pageserver.start() + + # chance of compaction and gc on startup might have an effect on the + # tenant_size but so far this has been reliable, even though at least gc + # and tenant_size race for the same locks + size_after = http_client.tenant_size(tenant_id) + assert size_after == size_after_thinning_branch + + # 
teardown, delete branches, and the size should be going down + deleted = False + for _ in range(10): + try: + http_client.timeline_delete(tenant_id, first_branch_timeline_id) + deleted = True + break + except PageserverApiException as e: + # compaction is ok but just retry if this fails; related to #2442 + if "cannot lock compaction critical section" in str(e): + time.sleep(1) + continue + raise + + assert deleted + + size_after_deleting_first = http_client.tenant_size(tenant_id) + assert size_after_deleting_first < size_after_thinning_branch + + http_client.timeline_delete(tenant_id, second_branch_timeline_id) + size_after_deleting_second = http_client.tenant_size(tenant_id) + assert size_after_deleting_second < size_after_deleting_first + + assert size_after_deleting_second < size_after_continuing_on_main + assert size_after_deleting_second > size_after_first_branch diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py new file mode 100644 index 0000000000..a6e935035c --- /dev/null +++ b/test_runner/regress/test_tenant_tasks.py @@ -0,0 +1,77 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def get_only_element(l): # noqa: E741 + assert len(l) == 1 + return l[0] + + +# Test that gc and compaction tenant tasks start and stop correctly +def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): + # The gc and compaction loops don't bother to watch for tenant state + # changes while sleeping, so we use small periods to make this test + # run faster. With default settings we'd have to wait longer for tasks + # to notice state changes and shut down. + # TODO fix this behavior in the pageserver + tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}" + neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}" + name = "test_tenant_tasks" + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + def get_state(tenant): + all_states = client.tenant_list() + matching = [t for t in all_states if TenantId(t["id"]) == tenant] + return get_only_element(matching)["state"] + + def get_metric_value(name): + metrics = client.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + return 0 + line = get_only_element(relevant) + value = line.lstrip(name).strip() + return int(value) + + def delete_all_timelines(tenant: TenantId): + timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] + for t in timelines: + client.timeline_delete(tenant, t) + + def assert_active_without_jobs(tenant): + assert get_state(tenant) == {"Active": {"background_jobs_running": False}} + + # Create tenant, start compute + tenant, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline(name, tenant_id=tenant) + pg = env.postgres.create_start(name, tenant_id=tenant) + assert get_state(tenant) == { + "Active": {"background_jobs_running": True} + }, "Pageserver should activate a tenant and start background jobs if timelines are loaded" + + # Stop compute + pg.stop() + + # Delete all timelines on all tenants + for tenant_info in client.tenant_list(): + tenant_id = TenantId(tenant_info["id"]) + delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active_without_jobs(tenant_id)) + + # Assert that all tasks finish quickly after tenant is detached + assert 
get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + client.tenant_detach(tenant) + client.tenant_detach(env.initial_tenant) + + def assert_tasks_finish(): + tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") + assert tasks_started == tasks_ended + assert tasks_panicked == 0 + + wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py new file mode 100644 index 0000000000..4ffea60950 --- /dev/null +++ b/test_runner/regress/test_tenants.py @@ -0,0 +1,262 @@ +import os +import shutil +from contextlib import closing +from datetime import datetime +from pathlib import Path +from typing import List + +import pytest +from fixtures.log_helper import log +from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + RemoteStorageKind, + available_remote_storages, +) +from fixtures.types import Lsn, TenantId, TimelineId +from prometheus_client.samples import Sample + + +def test_tenant_creation_fails(neon_simple_env: NeonEnv): + tenants_dir = Path(neon_simple_env.repo_dir) / "tenants" + initial_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + initial_tenant_dirs = [d for d in tenants_dir.iterdir()] + + pageserver_http = neon_simple_env.pageserver.http_client() + pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) + with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + _ = neon_simple_env.neon_cli.create_tenant() + + new_tenants = sorted( + map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + ) + assert initial_tenants == new_tenants, "should not create new tenants" + + new_tenant_dirs = [d for d in tenants_dir.iterdir()] + assert ( + new_tenant_dirs == initial_tenant_dirs + ), "pageserver should clean its temp tenant dirs on tenant creation failure" + + +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + """Tests tenants with and without wal acceptors""" + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() + + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start( + "test_tenants_normal_work", + tenant_id=tenant_1, + ) + pg_tenant2 = env.postgres.create_start( + "test_tenants_normal_work", + tenant_id=tenant_2, + ) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + + +def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, 
_ = env.neon_cli.create_tenant() + + timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) + timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_normal_work", tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + + collected_metrics = { + "pageserver": env.pageserver.http_client().get_metrics(), + } + for sk in env.safekeepers: + collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str() + + for name in collected_metrics: + basepath = os.path.join(neon_env_builder.repo_dir, f"{name}.metrics") + + with open(basepath, "w") as stdout_f: + print(collected_metrics[name], file=stdout_f, flush=True) + + all_metrics = [parse_metrics(m, name) for name, m in collected_metrics.items()] + ps_metrics = all_metrics[0] + sk_metrics = all_metrics[1:] + + ttids = [ + {"tenant_id": str(tenant_1), "timeline_id": str(timeline_1)}, + {"tenant_id": str(tenant_2), "timeline_id": str(timeline_2)}, + ] + + # Test metrics per timeline + for tt in ttids: + log.info(f"Checking metrics for {tt}") + + ps_lsn = Lsn(int(ps_metrics.query_one("pageserver_last_record_lsn", filter=tt).value)) + sk_lsns = [ + Lsn(int(sk.query_one("safekeeper_commit_lsn", filter=tt).value)) for sk in sk_metrics + ] + + log.info(f"ps_lsn: {ps_lsn}") + log.info(f"sk_lsns: {sk_lsns}") + + assert ps_lsn <= max(sk_lsns) + assert ps_lsn > Lsn(0) + + # Test common metrics + for metrics in all_metrics: + log.info(f"Checking common metrics for {metrics.name}") + + log.info( + f"process_cpu_seconds_total: {metrics.query_one('process_cpu_seconds_total').value}" + ) + log.info(f"process_threads: {int(metrics.query_one('process_threads').value)}") + log.info( + f"process_resident_memory_bytes (MB): {metrics.query_one('process_resident_memory_bytes').value / 1024 / 1024}" + ) + log.info( + f"process_virtual_memory_bytes (MB): {metrics.query_one('process_virtual_memory_bytes').value / 1024 / 1024}" + ) + log.info(f"process_open_fds: {int(metrics.query_one('process_open_fds').value)}") + log.info(f"process_max_fds: {int(metrics.query_one('process_max_fds').value)}") + log.info( + f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" + ) + + +def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): + """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + + neon_env_builder.num_safekeepers = 3 + + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() + + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) + env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) + + pg_tenant1 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_1) + pg_tenant2 = env.postgres.create_start("test_metrics_removed_after_detach", tenant_id=tenant_2) + + for pg in [pg_tenant1, pg_tenant2]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + 
cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (5000050000,) + + def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + samples = [] + for metric_name in ps_metrics.metrics: + for sample in ps_metrics.query_all( + name=metric_name, filter={"tenant_id": str(tenant_id)} + ): + samples.append(sample) + return samples + + for tenant in [tenant_1, tenant_2]: + pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + + env.pageserver.http_client().tenant_detach(tenant) + + post_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) + assert post_detach_samples == set() + + +# Check that empty tenants work with or without the remote storage +@pytest.mark.parametrize( + "remote_storage_kind", available_remote_storages() + [RemoteStorageKind.NOOP] +) +def test_pageserver_with_empty_tenants( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_with_empty_tenants", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_without_timelines_dir = env.initial_tenant + log.info( + f"Tenant {tenant_without_timelines_dir} becomes broken: it abnormally looses tenants/ directory and is expected to be completely ignored when pageserver restarts" + ) + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_without_timelines_dir) / "timelines") + + tenant_with_empty_timelines_dir = client.tenant_create() + log.info( + f"Tenant {tenant_with_empty_timelines_dir} gets all of its timelines deleted: still should be functional" + ) + temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + for temp_timeline in temp_timelines: + client.timeline_delete( + tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"]) + ) + files_in_timelines_dir = sum( + 1 + for _p in Path.iterdir( + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + ) + ) + assert ( + files_in_timelines_dir == 0 + ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + + # Trigger timeline reinitialization after pageserver restart + env.postgres.stop_all() + env.pageserver.stop() + env.pageserver.start() + + client = env.pageserver.http_client() + tenants = client.tenant_list() + + assert ( + len(tenants) == 2 + ), "Pageserver should attach only tenants with empty or not existing timelines/ dir on restart" + + [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] + assert ( + broken_tenant + ), f"A broken tenant {tenant_without_timelines_dir} should exists in the tenant list" + assert ( + broken_tenant["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] + assert ( + loaded_tenant + ), f"Tenant {tenant_with_empty_timelines_dir} should be loaded as the only one with tenants/ directory" + assert loaded_tenant["state"] == { + "Active": {"background_jobs_running": False} + }, "Empty tenant should be loaded and ready for 
timeline creation" diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py new file mode 100644 index 0000000000..9a4cbe135b --- /dev/null +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -0,0 +1,423 @@ +# +# Little stress test for the checkpointing and remote storage code. +# +# The test creates several tenants, and runs a simple workload on +# each tenant, in parallel. The test uses remote storage, and a tiny +# checkpoint_distance setting so that a lot of layer files are created. +# + +import asyncio +import json +import os +import shutil +from pathlib import Path +from typing import List, Tuple + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + assert_no_in_progress_downloads_for_tenant, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar, wait_until + + +async def tenant_workload(env: NeonEnv, pg: Postgres): + await env.pageserver.connect_async() + + pg_conn = await pg.connect_async() + + await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") + for i in range(1, 100): + await pg_conn.execute( + f"INSERT INTO t SELECT {i}*1000 + g, 'payload' from generate_series(1,1000) g" + ) + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + res = await pg_conn.fetchval("SELECT count(*) FROM t") + assert res == i * 1000 + + +async def all_tenants_workload(env: NeonEnv, tenants_pgs): + workers = [] + for _, pg in tenants_pgs: + worker = tenant_workload(env, pg) + workers.append(asyncio.create_task(worker)) + + # await all workers + await asyncio.gather(*workers) + + +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenants_many", + ) + + env = neon_env_builder.init_start() + + tenants_pgs: List[Tuple[TenantId, Postgres]] = [] + + for _ in range(1, 5): + # Use a tiny checkpoint distance, to create a lot of layers quickly + tenant, _ = env.neon_cli.create_tenant( + conf={ + "checkpoint_distance": "5000000", + } + ) + env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) + + pg = env.postgres.create_start( + "test_tenants_many", + tenant_id=tenant, + ) + tenants_pgs.append((tenant, pg)) + + asyncio.run(all_tenants_workload(env, tenants_pgs)) + + # Wait for the remote storage uploads to finish + pageserver_http = env.pageserver.http_client() + for tenant, pg in tenants_pgs: + res = pg.safe_psql_many( + ["SHOW neon.tenant_id", "SHOW neon.timeline_id", "SELECT pg_current_wal_flush_lsn()"] + ) + tenant_id = TenantId(res[0][0][0]) + timeline_id = TimelineId(res[1][0][0]) + current_lsn = Lsn(res[2][0][0]) + + # wait until pageserver receives all the data + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + + # run final checkpoint manually to flush all the data to remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenants_attached_after_download( + neon_env_builder: 
NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenants_attached_after_download", + ) + + data_id = 1 + data_secret = "very secret secret" + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + for checkpoint_number in range(1, 3): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE t{checkpoint_number}(id int primary key, secret text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + log.info(f"waiting for checkpoint {checkpoint_number} upload") + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info(f"upload of checkpoint {checkpoint_number} is done") + + ##### Stop the pageserver, erase its layer file to force it to be downloaded from the remote storage + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_deleted = False + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + # Looks like a layer file. Remove it + os.remove(path) + local_layer_deleted = True + break + assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" + + ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + env.pageserver.start() + client = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + ) + + restored_timelines = client.timeline_list(tenant_id) + assert ( + len(restored_timelines) == 1 + ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + restored_timeline = restored_timelines[0] + assert restored_timeline["timeline_id"] == str( + timeline_id + ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_upgrades_index_json_from_v0( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # The "image" for the v0 index_part.json. The fields themselves are + # replaced with values read from the later version because of #2592 (initdb + # lsn not reproducible). + v0_skeleton = json.loads( + """{ + "timeline_layers":[ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" + ], + "missing_layers":[], + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[] + }""" + ) + + # An overly eager compaction during this test would not play + # well with the strict assertions. 
+ neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" + + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_tenant_upgrades_index_json_from_v0" + ) + + # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, + # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # flush, wait until in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + # remove all local data for the tenant to force redownloading and subsequent upgrade + shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id)) + + # downgrade the remote file + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r+") as timeline_file: + # keep the deserialized for later inspection + orig_index_part = json.load(timeline_file) + + v0_index_part = {key: orig_index_part[key] for key in v0_skeleton} + + timeline_file.seek(0) + json.dump(v0_index_part, timeline_file) + + env.pageserver.start() + pageserver_http = env.pageserver.http_client() + pageserver_http.tenant_attach(tenant_id) + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + ) + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t0 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # not needed anymore + env.postgres.stop_all() + env.pageserver.stop() + + # make sure the file has been upgraded back to how it started + index_part = local_fs_index_part(env, tenant_id, timeline_id) + assert index_part["version"] == orig_index_part["version"] + assert index_part["missing_layers"] == orig_index_part["missing_layers"] + + # expect one more layer because of the forced checkpoint + assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 + + # all of the same layer files are there, but they might be shuffled around + orig_layers = set(orig_index_part["timeline_layers"]) + later_layers = set(index_part["timeline_layers"]) + assert later_layers.issuperset(orig_layers) + + added_layers = later_layers - orig_layers + assert len(added_layers) == 1 + + # all of metadata has been regenerated (currently just layer file size) + all_metadata_keys = set() + for layer in orig_layers: + orig_metadata = orig_index_part["layer_metadata"][layer] + new_metadata = index_part["layer_metadata"][layer] + assert ( + orig_metadata == new_metadata + ), f"metadata for layer {layer} should not have changed {orig_metadata} vs. 
{new_metadata}" + all_metadata_keys |= set(orig_metadata.keys()) + + one_new_layer = next(iter(added_layers)) + assert one_new_layer in index_part["layer_metadata"], "new layer should have metadata" + + only_new_metadata = index_part["layer_metadata"][one_new_layer] + + assert ( + set(only_new_metadata.keys()).symmetric_difference(all_metadata_keys) == set() + ), "new layer metadata has same metadata as others" + + +# FIXME: test index_part.json getting downgraded from imaginary new version + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_redownloads_truncated_file_on_startup( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it. + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_redownloads_truncated_file_on_startup", + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + local_layer_truncated = None + for path in Path.iterdir(timeline_dir): + if path.name.startswith("00000"): + correct_size = os.stat(path).st_size + os.truncate(path, 0) + local_layer_truncated = (path, correct_size) + break + assert ( + local_layer_truncated is not None + ), f"Found no local layer files to delete in directory {timeline_dir}" + + (path, expected_size) = local_layer_truncated + + # ensure the same size is found from the index_part.json + index_part = local_fs_index_part(env, tenant_id, timeline_id) + assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + + ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + env.pageserver.start() + client = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + ) + + restored_timelines = client.timeline_list(tenant_id) + assert ( + len(restored_timelines) == 1 + ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" + retored_timeline = restored_timelines[0] + assert retored_timeline["timeline_id"] == str( + timeline_id + ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" + + # the remote side of local_layer_truncated + remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name + + # if the upload ever was ongoing, this check would be racy, but at least one + # extra http request has been made in between so assume it's enough delay + assert ( 
+ os.stat(remote_layer_path).st_size == expected_size + ), "truncated file should not have been uploaded around re-download" + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t1 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # now that the upload is complete, make sure the file hasn't been + # re-uploaded truncated. This is a rather bogus check given the current + # implementation, but it's critical that it doesn't happen, so it's worth a few + # lines of python to verify it. + assert ( + os.stat(remote_layer_path).st_size == expected_size + ), "truncated file should not have been uploaded after the next checkpoint" + + +def local_fs_index_part(env, tenant_id, timeline_id): + """ + Return the index_part.json of the tenant and timeline from LOCAL_FS, parsed with json.load + """ + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r") as timeline_file: + return json.load(timeline_file) + + +def local_fs_index_part_path(env, tenant_id, timeline_id): + """ + Return the path to the LOCAL_FS index_part.json of the tenant and timeline. + """ + assert isinstance(env.remote_storage, LocalFsStorage) + return ( + env.remote_storage.root + / "tenants" + / str(tenant_id) + / "timelines" + / str(timeline_id) + / "index_part.json" + ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py new file mode 100644 index 0000000000..450f7f2381 --- /dev/null +++ b/test_runner/regress/test_timeline_delete.py @@ -0,0 +1,78 @@ +import pytest +from fixtures.neon_fixtures import NeonEnv, PageserverApiException +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until + + +def test_timeline_delete(neon_simple_env: NeonEnv): + env = neon_simple_env + + ps_http = env.pageserver.http_client() + + # first try to delete a non-existing timeline + # for an existing tenant: + invalid_timeline_id = TimelineId.generate() + with pytest.raises(PageserverApiException, match="timeline not found"): + ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) + + # for a non-existing tenant: + invalid_tenant_id = TenantId.generate() + with pytest.raises( + PageserverApiException, + match=f"Tenant {invalid_tenant_id} not found in the local state", + ): + ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) + + # construct a pair of branches to validate that pageserver prohibits + # deletion of ancestor timelines when they have child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch( + "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent" + ) + + ps_http = env.pageserver.http_client() + with pytest.raises( + PageserverApiException, match="Cannot delete timeline which has child timelines" + ): + + timeline_path = ( + env.repo_dir + / "tenants" + / str(env.initial_tenant) + / "timelines" + / str(parent_timeline_id) + ) + assert timeline_path.exists() + + ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) + + assert not timeline_path.exists() + + timeline_path = ( + env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / 
str(leaf_timeline_id) + ) + assert timeline_path.exists() + + # retry deletes when compaction or gc is running in pageserver + wait_until( + number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id), + ) + + assert not timeline_path.exists() + + # check 404 + with pytest.raises( + PageserverApiException, + match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", + ): + ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) + + # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? + wait_until( + number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id), + ) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py new file mode 100644 index 0000000000..ec2bed7fee --- /dev/null +++ b/test_runner/regress/test_timeline_size.py @@ -0,0 +1,482 @@ +import math +import random +import re +import time +from contextlib import closing +from pathlib import Path + +import psycopg2.errors +import psycopg2.extras +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + PgBin, + PortDistributor, + Postgres, + VanillaPostgres, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import get_timeline_dir_size + + +def test_timeline_size(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") + + client = env.pageserver.http_client() + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + + pgmain = env.postgres.create_start("test_timeline_size") + log.info("postgres is running on 'test_timeline_size' branch") + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 10) g + """ + ) + + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + cur.execute("TRUNCATE foo") + + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + + +def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") + + client = env.pageserver.http_client() + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + timeline_details = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + + pgmain = env.postgres.create_start("test_timeline_size_createdropdb") + log.info("postgres is running on 'test_timeline_size_createdropdb' branch") + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + assert ( + timeline_details["current_logical_size_non_incremental"] + == res["current_logical_size_non_incremental"] + ), "no writes 
should not change the incremental logical size" + + cur.execute("CREATE DATABASE foodb") + with closing(pgmain.connect(dbname="foodb")) as conn: + with conn.cursor() as cur2: + + cur2.execute("CREATE TABLE foo (t text)") + cur2.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 10) g + """ + ) + + res = client.timeline_detail( + env.initial_tenant, + new_timeline_id, + include_non_incremental_logical_size=True, + ) + assert ( + res["current_logical_size"] == res["current_logical_size_non_incremental"] + ) + + cur.execute("DROP DATABASE foodb") + + res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + assert res["current_logical_size"] == res["current_logical_size_non_incremental"] + + +# wait until received_lsn_lag is 0 +def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60): + started_at = time.time() + + received_lsn_lag = 1 + while received_lsn_lag > 0: + elapsed = time.time() - started_at + if elapsed > timeout: + raise RuntimeError( + "timed out waiting for pageserver to reach pg_current_wal_flush_lsn()" + ) + + res = pgmain.safe_psql( + """ + SELECT + pg_size_pretty(pg_cluster_size()), + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag + FROM backpressure_lsns(); + """ + )[0] + log.info(f"pg_cluster_size = {res[0]}, received_lsn_lag = {res[1]}") + received_lsn_lag = res[1] + + time.sleep(polling_interval) + + +def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") + + wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + + pgmain = env.postgres.create_start( + "test_timeline_size_quota", + # Set small limit for the test + config_lines=["neon.max_cluster_size=30MB"], + ) + log.info("postgres is running on 'test_timeline_size_quota' branch") + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? + + cur.execute("CREATE TABLE foo (t text)") + + wait_for_pageserver_catchup(pgmain) + + # Insert many rows. This query must fail because of space limit + try: + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + wait_for_pageserver_catchup(pgmain) + + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 500000) g + """ + ) + + # If we get here, the timeline size limit failed + log.error("Query unexpectedly succeeded") + assert False + + except psycopg2.errors.DiskFull as err: + log.info(f"Query expectedly failed with: {err}") + + # drop table to free space + cur.execute("DROP TABLE foo") + + wait_for_pageserver_catchup(pgmain) + + # create it again and insert some rows. 
This query must succeed + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 10000) g + """ + ) + + wait_for_pageserver_catchup(pgmain) + + cur.execute("SELECT * from pg_size_pretty(pg_cluster_size())") + pg_cluster_size = cur.fetchone() + log.info(f"pg_cluster_size = {pg_cluster_size}") + + new_res = client.timeline_detail( + env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True + ) + assert ( + new_res["current_logical_size"] == new_res["current_logical_size_non_incremental"] + ), "after the WAL is streamed, current_logical_size is expected to be calculated and to equal its non-incremental value" + + +def test_timeline_physical_size_init(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") + pg = env.postgres.create_start("test_timeline_physical_size_init") + + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ] + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + + # restart the pageserver to force calculating the timeline's initial physical size + env.pageserver.stop() + env.pageserver.start() + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") + pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint") + + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 1000) g""", + ] + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): + # Disable background compaction as we don't want it to happen after the `get_physical_size` request + # and before checking the expected size on disk, which would make the assertion fail + neon_env_builder.pageserver_config_override = ( + "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") + pg = env.postgres.create_start("test_timeline_physical_size_post_compaction") + + # We don't want autovacuum to run on the table, while we are calculating the + # physical size, because that could cause a new layer to be created and a + # mismatch between the incremental and non-incremental size. (If that still + # happens, because of some other background activity or autovacuum on other + # tables, we could simply retry the size calculations. It's unlikely that + # that would happen more than once.) 
+ pg.safe_psql_many( + [ + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ] + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): + # Disable background compaction and GC as we don't want them to happen after the `get_physical_size` request + # and before checking the expected size on disk, which would make the assertion fail + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='10m', gc_period='10m', pitr_interval='1s'}" + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") + pg = env.postgres.create_start("test_timeline_physical_size_post_gc") + + # Like in test_timeline_physical_size_post_compaction, disable autovacuum + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text) WITH (autovacuum_enabled = off)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ] + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + + pg.safe_psql( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + + pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) + + assert_physical_size(env, env.initial_tenant, new_timeline_id) + + +# The timeline logical and physical sizes are also exposed as prometheus metrics. +# Test the metrics. 
+def test_timeline_size_metrics( + neon_simple_env: NeonEnv, + test_output_dir: Path, + port_distributor: PortDistributor, + pg_distrib_dir: Path, + pg_version: str, +): + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + + new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") + pg = env.postgres.create_start("test_timeline_size_metrics") + + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""", + ] + ) + + wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) + pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) + + # get the metrics and parse the metric for the current timeline's physical size + metrics = env.pageserver.http_client().get_metrics() + matches = re.search( + f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + metrics, + re.MULTILINE, + ) + assert matches + tl_physical_size_metric = int(matches.group(1)) + + # assert that the physical size metric matches the actual physical size on disk + timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id) + assert tl_physical_size_metric == get_timeline_dir_size(timeline_path) + + # Check that the logical size metric is sane, and matches + matches = re.search( + f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + metrics, + re.MULTILINE, + ) + assert matches + tl_logical_size_metric = int(matches.group(1)) + + pgdatadir = test_output_dir / "pgdata-vanilla" + pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: + vanilla_pg.configure([f"port={port}"]) + vanilla_pg.start() + + # Create database based on template0 because we can't connect to template0 + vanilla_pg.safe_psql("CREATE TABLE foo (t text)") + vanilla_pg.safe_psql( + """INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g""" + ) + vanilla_size_sum = vanilla_pg.safe_psql( + "select sum(pg_database_size(oid)) from pg_database" + )[0][0] + + # Compare the size with Vanilla postgres. + # Allow some slack, because the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). + assert math.isclose(tl_logical_size_metric, vanilla_size_sum, abs_tol=2 * 1024 * 1024) + + # The sum of the sizes of all databases, as seen by pg_database_size(), should also + # be close. Again allow some slack, the logical size metric includes some things like + # the SLRUs that are not included in pg_database_size(). 
+ dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0] + assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024) + + +def test_tenant_physical_size(neon_simple_env: NeonEnv): + random.seed(100) + + env = neon_simple_env + pageserver_http = env.pageserver.http_client() + client = env.pageserver.http_client() + + tenant, timeline = env.neon_cli.create_tenant() + + def get_timeline_physical_size(timeline: TimelineId): + res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) + return res["current_physical_size_non_incremental"] + + timeline_total_size = get_timeline_physical_size(timeline) + for i in range(10): + n_rows = random.randint(100, 1000) + + timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) + pg = env.postgres.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) + + pg.safe_psql_many( + [ + "CREATE TABLE foo (t text)", + f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g", + ] + ) + + wait_for_last_flush_lsn(env, pg, tenant, timeline) + pageserver_http.timeline_checkpoint(tenant, timeline) + + timeline_total_size += get_timeline_physical_size(timeline) + + pg.stop() + + tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) + assert tenant_physical_size == timeline_total_size + + +def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + """Check the current physical size returned from timeline API + matches the total physical size of the timeline on disk""" + client = env.pageserver.http_client() + res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) + timeline_path = env.timeline_dir(tenant_id, timeline_id) + assert res["current_physical_size"] == res["current_physical_size_non_incremental"] + assert res["current_physical_size"] == get_timeline_dir_size(timeline_path) + + +# Timeline logical size initialization is an asynchronous background task that runs once, +# try a few times to ensure it's activated properly +def wait_for_timeline_size_init( + client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +): + for i in range(10): + timeline_details = client.timeline_detail( + tenant, timeline, include_non_incremental_logical_size=True + ) + current_logical_size = timeline_details["current_logical_size"] + non_incremental = timeline_details["current_logical_size_non_incremental"] + if current_logical_size == non_incremental: + return + log.info( + f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}" + ) + time.sleep(1) + raise Exception( + f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" + ) diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/regress/test_twophase.py similarity index 70% rename from test_runner/batch_others/test_twophase.py rename to test_runner/regress/test_twophase.py index d6a1cd01e8..f3b0f9ca06 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -1,47 +1,46 @@ import os -from fixtures.zenith_fixtures import ZenithEnv from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn # # Test branching, when a transaction is in prepared state # -def test_twophase(zenith_simple_env: ZenithEnv): - env 
= zenith_simple_env - env.zenith_cli.create_branch("test_twophase", "empty") - - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) +def test_twophase(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_twophase", "empty") + pg = env.postgres.create_start("test_twophase", config_lines=["max_prepared_transactions=5"]) log.info("postgres is running on 'test_twophase' branch") conn = pg.connect() cur = conn.cursor() - cur.execute('CREATE TABLE foo (t text)') + cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('one')") cur.execute("PREPARE TRANSACTION 'insert_one'") # Prepare another transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('two')") cur.execute("PREPARE TRANSACTION 'insert_two'") # Prepare a transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('three')") cur.execute("PREPARE TRANSACTION 'insert_three'") # Prepare another transaction that will insert a row - cur.execute('BEGIN') + cur.execute("BEGIN") cur.execute("INSERT INTO foo VALUES ('four')") cur.execute("PREPARE TRANSACTION 'insert_four'") # On checkpoint state data copied to files in # pg_twophase directory and fsynced - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) @@ -49,19 +48,19 @@ def test_twophase(zenith_simple_env: ZenithEnv): cur.execute("COMMIT PREPARED 'insert_three'") cur.execute("ROLLBACK PREPARED 'insert_four'") - cur.execute('CHECKPOINT') + cur.execute("CHECKPOINT") twophase_files = os.listdir(pg.pg_twophase_dir_path()) log.info(twophase_files) assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.zenith_cli.create_branch("test_twophase_prepared", "test_twophase") + fork_at_current_lsn(env, pg, "test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( - 'test_twophase_prepared', - config_lines=['max_prepared_transactions=5'], + "test_twophase_prepared", + config_lines=["max_prepared_transactions=5"], ) # Check that we restored only needed twophase files @@ -77,9 +76,9 @@ def test_twophase(zenith_simple_env: ZenithEnv): cur2.execute("COMMIT PREPARED 'insert_one'") cur2.execute("ROLLBACK PREPARED 'insert_two'") - cur2.execute('SELECT * FROM foo') - assert cur2.fetchall() == [('one', ), ('three', )] + cur2.execute("SELECT * FROM foo") + assert cur2.fetchall() == [("one",), ("three",)] # Only one committed insert is visible on the original branch - cur.execute('SELECT * FROM foo') - assert cur.fetchall() == [('three', )] + cur.execute("SELECT * FROM foo") + assert cur.fetchall() == [("three",)] diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py new file mode 100644 index 0000000000..16a870471b --- /dev/null +++ b/test_runner/regress/test_vm_bits.py @@ -0,0 +1,83 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn + + +# +# Test that the VM bit is cleared correctly at a HEAP_DELETE and +# HEAP_UPDATE record. 
+# +def test_vm_bit_clear(neon_simple_env: NeonEnv): + env = neon_simple_env + + env.neon_cli.create_branch("test_vm_bit_clear", "empty") + pg = env.postgres.create_start("test_vm_bit_clear") + + log.info("postgres is running on 'test_vm_bit_clear' branch") + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table and freeze it to set the VM bit. + cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_delete VALUES (1)") + cur.execute("VACUUM FREEZE vmtest_delete") + + cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_update") + + # DELETE and UPDATE the rows. + cur.execute("DELETE FROM vmtest_delete WHERE id = 1") + cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") + + # Branch at this point, to test that later + fork_at_current_lsn(env, pg, "test_vm_bit_clear_new", "test_vm_bit_clear") + + # Clear the buffer cache, to force the VM page to be re-fetched from + # the page server + cur.execute("SELECT clear_buffer_cache()") + + # Check that an index-only scan doesn't see the deleted row. If the + # clearing of the VM bit was not replayed correctly, this would incorrectly + # return deleted row. + cur.execute( + """ + set enable_seqscan=off; + set enable_indexscan=on; + set enable_bitmapscan=off; + """ + ) + + cur.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur.fetchall() == [] + cur.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur.fetchall() == [] + + cur.close() + + # Check the same thing on the branch that we created right after the DELETE + # + # As of this writing, the code in smgrwrite() creates a full-page image whenever + # a dirty VM page is evicted. If the VM bit was not correctly cleared by the + # earlier WAL record, the full-page image hides the problem. Starting a new + # server at the right point-in-time avoids that full-page image. 
+ pg_new = env.postgres.create_start("test_vm_bit_clear_new") + + log.info("postgres is running on 'test_vm_bit_clear_new' branch") + pg_new_conn = pg_new.connect() + cur_new = pg_new_conn.cursor() + + cur_new.execute( + """ + set enable_seqscan=off; + set enable_indexscan=on; + set enable_bitmapscan=off; + """ + ) + + cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1") + assert cur_new.fetchall() == [] + cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1") + assert cur_new.fetchall() == [] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py new file mode 100644 index 0000000000..8ef7f27752 --- /dev/null +++ b/test_runner/regress/test_wal_acceptor.py @@ -0,0 +1,1189 @@ +import os +import pathlib +import random +import shutil +import signal +import subprocess +import sys +import threading +import time +from contextlib import closing +from dataclasses import dataclass, field +from functools import partial +from pathlib import Path +from typing import Any, List, Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + NeonPageserver, + PgBin, + PgProtocol, + PortDistributor, + Postgres, + RemoteStorageKind, + RemoteStorageUsers, + Safekeeper, + SafekeeperHttpClient, + SafekeeperPort, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import get_dir_size, query_scalar, start_in_background + + +def wait_lsn_force_checkpoint( + tenant_id: TenantId, + timeline_id: TimelineId, + pg: Postgres, + ps: NeonPageserver, + pageserver_conn_options={}, +): + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver") + + auth_token = None + if "password" in pageserver_conn_options: + auth_token = pageserver_conn_options["password"] + + # wait for the pageserver to catch up + wait_for_last_record_lsn( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + # force checkpoint to advance remote_consistent_lsn + ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id) + + # ensure that remote_consistent_lsn is advanced + wait_for_upload( + ps.http_client(auth_token=auth_token), + tenant_id, + timeline_id, + lsn, + ) + + +@dataclass +class TimelineMetrics: + timeline_id: TimelineId + last_record_lsn: Lsn + # One entry per each Safekeeper, order is the same + flush_lsns: List[Lsn] = field(default_factory=list) + commit_lsns: List[Lsn] = field(default_factory=list) + + +# Run page server and multiple acceptors, and multiple compute nodes running +# against different timelines. +def test_many_timelines(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + n_timelines = 3 + + branch_names = [ + "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) + ] + # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') + # that's not really human readable, so the branch names are introduced in Neon CLI. + # Neon CLI stores its branch <-> timeline mapping in its internals, + # but we need this to collect metrics from other servers, related to the timeline. 
+ branch_names_to_timeline_ids = {} + + # start postgres on each timeline + pgs = [] + for branch_name in branch_names: + new_timeline_id = env.neon_cli.create_branch(branch_name) + pgs.append(env.postgres.create_start(branch_name)) + branch_names_to_timeline_ids[branch_name] = new_timeline_id + + tenant_id = env.initial_tenant + + def collect_metrics(message: str) -> List[TimelineMetrics]: + with env.pageserver.http_client() as pageserver_http: + timeline_details = [ + pageserver_http.timeline_detail( + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name] + ) + for branch_name in branch_names + ] + # All changes visible to pageserver (last_record_lsn) should be + # confirmed by safekeepers first. As we cannot atomically get + # state of both pageserver and safekeepers, we should start with + # pageserver. Looking at outdated data from pageserver is ok. + # Asking safekeepers first is not ok because new commits may arrive + # to both safekeepers and pageserver after we've already obtained + # safekeepers' state, it will look contradictory. + sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] + + timeline_metrics = [] + for timeline_detail in timeline_details: + timeline_id = TimelineId(timeline_detail["timeline_id"]) + + m = TimelineMetrics( + timeline_id=timeline_id, + last_record_lsn=Lsn(timeline_detail["last_record_lsn"]), + ) + for sk_m in sk_metrics: + m.flush_lsns.append(Lsn(sk_m.flush_lsn_inexact[(tenant_id, timeline_id)])) + m.commit_lsns.append(Lsn(sk_m.commit_lsn_inexact[(tenant_id, timeline_id)])) + + for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns): + # Invariant. May be < when transaction is in progress. + assert ( + commit_lsn <= flush_lsn + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + # We only call collect_metrics() after a transaction is confirmed by + # the compute node, which only happens after a consensus of safekeepers + # has confirmed the transaction. We assume majority consensus here. + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + assert ( + 2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) + > neon_env_builder.num_safekeepers + ), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + timeline_metrics.append(m) + log.info(f"{message}: {timeline_metrics}") + return timeline_metrics + + # TODO: https://github.com/neondatabase/neon/issues/809 + # collect_metrics("before CREATE TABLE") + + # Do everything in different loops to have actions on different timelines + # interleaved. 
+ # create schema + for pg in pgs: + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + init_m = collect_metrics("after CREATE TABLE") + + # Populate data for 2/3 timelines + class MetricsChecker(threading.Thread): + def __init__(self) -> None: + super().__init__(daemon=True) + self.should_stop = threading.Event() + self.exception: Optional[BaseException] = None + + def run(self) -> None: + try: + while not self.should_stop.is_set(): + collect_metrics("during INSERT INTO") + time.sleep(1) + except: # noqa: E722 + log.error( + "MetricsChecker's thread failed, the test will be failed on .stop() call", + exc_info=True, + ) + # We want to preserve traceback as well as the exception + exc_type, exc_value, exc_tb = sys.exc_info() + assert exc_type + e = exc_type(exc_value) + e.__traceback__ = exc_tb + self.exception = e + + def stop(self) -> None: + self.should_stop.set() + self.join() + if self.exception: + raise self.exception + + metrics_checker = MetricsChecker() + metrics_checker.start() + + for pg in pgs[:-1]: + pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + + metrics_checker.stop() + + collect_metrics("after INSERT INTO") + + # Check data for 2/3 timelines + for pg in pgs[:-1]: + res = pg.safe_psql("SELECT sum(key) FROM t") + assert res[0] == (5000050000,) + + final_m = collect_metrics("after SELECT") + # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. + # Also assume that safekeepers will not be significantly out of sync in this test. + middle_lsn = Lsn((int(init_m[0].last_record_lsn) + int(final_m[0].last_record_lsn)) // 2) + assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) + assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) + assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) + assert max(init_m[1].commit_lsns) < middle_lsn < min(final_m[1].commit_lsns) + assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn + assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn + + +# Check that dead minority doesn't prevent the commits: execute insert n_inserts +# times, with fault_probability chance of getting a wal acceptor down or up +# along the way. 2 of 3 are always alive, so the work keeps going. 
+def test_restarts(neon_env_builder: NeonEnvBuilder): + fault_probability = 0.01 + n_inserts = 1000 + n_acceptors = 3 + + neon_env_builder.num_safekeepers = n_acceptors + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_safekeepers_restarts") + pg = env.postgres.create_start("test_safekeepers_restarts") + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + pg_conn = pg.connect() + cur = pg_conn.cursor() + + failed_node = None + cur.execute("CREATE TABLE t(key int primary key, value text)") + for i in range(n_inserts): + cur.execute("INSERT INTO t values (%s, 'payload');", (i + 1,)) + + if random.random() <= fault_probability: + if failed_node is None: + failed_node = env.safekeepers[random.randrange(0, n_acceptors)] + failed_node.stop() + else: + failed_node.start() + failed_node = None + assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500 + + +# Test that safekeepers push their info to the broker and learn peer status from it +def test_broker(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker", "main") + pg = env.postgres.create_start("test_broker") + pg.safe_psql("CREATE TABLE t(key int primary key, value text)") + + # learn neon timeline from compute + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + # wait until remote_consistent_lsn gets advanced on all safekeepers + clients = [sk.http_client() for sk in env.safekeepers] + stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] + log.info(f"statuses is {stat_before}") + + pg.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'") + + # force checkpoint in pageserver to advance remote_consistent_lsn + wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver) + + # and wait till remote_consistent_lsn propagates to all safekeepers + started_at = time.time() + while True: + stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] + if all( + s_after.remote_consistent_lsn > s_before.remote_consistent_lsn + for s_after, s_before in zip(stat_after, stat_before) + ): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}" + ) + time.sleep(0.5) + + +# Test that old WAL consumed by peers and pageserver is removed from safekeepers. +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.num_safekeepers = 2 + # to advance remote_consistent_lsn + neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_safekeepers_wal_removal") + pg = env.postgres.create_start("test_safekeepers_wal_removal") + + # Note: it is important to insert at least two segments, as currently + # control file is synced roughly once in segment range and WAL is not + # removed until all horizons are persisted. 
+    pg.safe_psql_many(
+        [
+            "CREATE TABLE t(key int primary key, value text)",
+            "INSERT INTO t SELECT generate_series(1,200000), 'payload'",
+        ]
+    )
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    # force checkpoint to advance remote_consistent_lsn
+    pageserver_conn_options = {}
+    if auth_enabled:
+        pageserver_conn_options["password"] = env.auth_keys.generate_tenant_token(tenant_id)
+    wait_lsn_force_checkpoint(tenant_id, timeline_id, pg, env.pageserver, pageserver_conn_options)
+
+    # We will wait for the first segment's removal. Make sure the segments exist to begin with.
+    first_segments = [
+        os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001")
+        for sk in env.safekeepers
+    ]
+    assert all(os.path.exists(p) for p in first_segments)
+
+    if not auth_enabled:
+        http_cli = env.safekeepers[0].http_client()
+    else:
+        http_cli = env.safekeepers[0].http_client(
+            auth_token=env.auth_keys.generate_tenant_token(tenant_id)
+        )
+        http_cli_other = env.safekeepers[0].http_client(
+            auth_token=env.auth_keys.generate_tenant_token(TenantId.generate())
+        )
+        http_cli_noauth = env.safekeepers[0].http_client()
+
+    # Pretend WAL is offloaded to S3.
+    if auth_enabled:
+        old_backup_lsn = http_cli.timeline_status(
+            tenant_id=tenant_id, timeline_id=timeline_id
+        ).backup_lsn
+        assert "FFFFFFFF/FEFFFFFF" != old_backup_lsn
+        for cli in [http_cli_other, http_cli_noauth]:
+            with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"):
+                cli.record_safekeeper_info(
+                    tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"}
+                )
+        assert (
+            old_backup_lsn
+            == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn
+        )
+    http_cli.record_safekeeper_info(tenant_id, timeline_id, {"backup_lsn": "FFFFFFFF/FEFFFFFF"})
+    assert (
+        Lsn("FFFFFFFF/FEFFFFFF")
+        == http_cli.timeline_status(tenant_id=tenant_id, timeline_id=timeline_id).backup_lsn
+    )
+
+    # wait till the first segment is removed on all safekeepers
+    wait(
+        lambda first_segments=first_segments: all(not os.path.exists(p) for p in first_segments),
+        "first segment to be removed",
+    )
+
+
+# Wait for something, defined as f() returning True, raising an error if this
+# doesn't happen within timeout seconds.
+def wait(f, desc, timeout=30): + started_at = time.time() + while True: + if f(): + break + elapsed = time.time() - started_at + if elapsed > timeout: + raise RuntimeError(f"timed out waiting {elapsed:.0f}s for {desc}") + time.sleep(0.5) + + +def is_segment_offloaded( + sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn +): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.backup_lsn >= seg_end + + +def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"sk status is {tli_status}") + return tli_status.flush_lsn >= lsn + + +def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): + http_cli = sk.http_client() + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size_mb = sk_wal_size / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") + return sk_wal_size_mb <= target_size_mb + + +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + neon_env_builder.num_safekeepers = 3 + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_safekeepers_wal_backup", + ) + + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_safekeepers_wal_backup") + pg = env.postgres.create_start("test_safekeepers_wal_backup") + + # learn neon timeline from compute + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute("create table t(key int, value text)") + + # Shut down subsequently each of safekeepers and fill a segment while sk is + # down; ensure segment gets offloaded by others. 
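+    # Each iteration below is expected to fill roughly one WAL segment, so (assuming
+    # the default 16 MB, i.e. 0x1000000-byte, segment size) the backup horizon should
+    # successively pass 0/2000000, 0/3000000 and 0/4000000.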
+ offloaded_seg_end = [Lsn("0/2000000"), Lsn("0/3000000"), Lsn("0/4000000")] + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + victim.stop() + # roughly fills one segment + cur.execute("insert into t select generate_series(1,250000), 'payload'") + live_sk = [sk for sk in env.safekeepers if sk != victim][0] + + wait( + partial(is_segment_offloaded, live_sk, tenant_id, timeline_id, seg_end), + f"segment ending at {seg_end} get offloaded", + ) + + victim.start() + + # put one of safekeepers down again + env.safekeepers[0].stop() + # restart postgres + pg.stop_and_destroy().create_start("test_safekeepers_wal_backup") + # and ensure offloading still works + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("insert into t select generate_series(1,250000), 'payload'") + seg_end = Lsn("0/5000000") + wait( + partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, seg_end), + f"segment ending at {seg_end} get offloaded", + ) + + +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + neon_env_builder.num_safekeepers = 3 + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_s3_wal_replay", + ) + + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_s3_wal_replay") + + pg = env.postgres.create_start("test_s3_wal_replay") + + # learn neon timeline from compute + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + expected_sum = 0 + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t(key int, value text)") + cur.execute("insert into t values (1, 'payload')") + expected_sum += 1 + + offloaded_seg_end = Lsn("0/3000000") + # roughly fills two segments + cur.execute("insert into t select generate_series(1,500000), 'payload'") + expected_sum += 500000 * 500001 // 2 + + assert query_scalar(cur, "select sum(key) from t") == expected_sum + + for sk in env.safekeepers: + wait( + partial(is_segment_offloaded, sk, tenant_id, timeline_id, offloaded_seg_end), + f"segment ending at {offloaded_seg_end} get offloaded", + ) + + # advance remote_consistent_lsn to trigger WAL trimming + # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates + env.safekeepers[0].http_client().record_safekeeper_info( + tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)} + ) + + last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + for sk in env.safekeepers: + # require WAL to be trimmed, so no more than one segment is left on disk + target_size_mb = 16 * 1.5 + wait( + partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb), + f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB", + ) + # wait till everyone puts data up to last_lsn on disk, we are + # going to recreate state on safekeepers claiming they have data till last_lsn. 
+ wait( + partial(is_flush_lsn_caught_up, sk, tenant_id, timeline_id, last_lsn), + f"sk_id={sk.id} to flush {last_lsn}", + ) + + ps_cli = env.pageserver.http_client() + pageserver_lsn = Lsn(ps_cli.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + lag = last_lsn - pageserver_lsn + log.info( + f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" + ) + + pg.stop_and_destroy() + + # Also delete and manually create timeline on safekeepers -- this tests + # scenario of manual recovery on different set of safekeepers. + + # save the last (partial) file to put it back after recreation; others will be fetched from s3 + sk = env.safekeepers[0] + tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) + f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0]) + f_partial_path = tli_dir / f_partial + f_partial_saved = Path(sk.data_dir()) / f_partial.name + f_partial_path.rename(f_partial_saved) + + pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version + + for sk in env.safekeepers: + cli = sk.http_client() + cli.timeline_delete_force(tenant_id, timeline_id) + cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) + f_partial_path = ( + Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name + ) + shutil.copy(f_partial_saved, f_partial_path) + + # recreate timeline on pageserver from scratch + ps_cli.timeline_delete(tenant_id, timeline_id) + ps_cli.timeline_create(tenant_id, timeline_id) + + wait_lsn_timeout = 60 * 3 + started_at = time.time() + last_debug_print = 0.0 + + while True: + elapsed = time.time() - started_at + if elapsed > wait_lsn_timeout: + raise RuntimeError("Timed out waiting for WAL redo") + + pageserver_lsn = Lsn( + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"] + ) + lag = last_lsn - pageserver_lsn + + if time.time() > last_debug_print + 10 or lag <= 0: + last_debug_print = time.time() + log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb") + + if lag <= 0: + break + + time.sleep(1) + + log.info(f"WAL redo took {elapsed} s") + + # verify data + pg.create_start("test_s3_wal_replay") + + assert pg.safe_psql("select sum(key) from t")[0][0] == expected_sum + + +class ProposerPostgres(PgProtocol): + """Object for running postgres without NeonEnv""" + + def __init__( + self, + pgdata_dir: str, + pg_bin, + tenant_id: TenantId, + timeline_id: TimelineId, + listen_addr: str, + port: int, + ): + super().__init__(host=listen_addr, port=port, user="cloud_admin", dbname="postgres") + + self.pgdata_dir: str = pgdata_dir + self.pg_bin: PgBin = pg_bin + self.tenant_id: TenantId = tenant_id + self.timeline_id: TimelineId = timeline_id + self.listen_addr: str = listen_addr + self.port: int = port + + def pg_data_dir_path(self) -> str: + """Path to data directory""" + return self.pgdata_dir + + def config_file_path(self) -> str: + """Path to postgresql.conf""" + return os.path.join(self.pgdata_dir, "postgresql.conf") + + def create_dir_config(self, safekeepers: str): + """Create dir and config for running --sync-safekeepers""" + + pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) + with open(self.config_file_path(), "w") as f: + cfg = [ + "synchronous_standby_names = 'walproposer'\n", + "shared_preload_libraries = 'neon'\n", + f"neon.timeline_id = '{self.timeline_id}'\n", + f"neon.tenant_id = '{self.tenant_id}'\n", + "neon.pageserver_connstring = ''\n", + 
f"neon.safekeepers = '{safekeepers}'\n", + f"listen_addresses = '{self.listen_addr}'\n", + f"port = '{self.port}'\n", + ] + + f.writelines(cfg) + + def sync_safekeepers(self) -> Lsn: + """ + Run 'postgres --sync-safekeepers'. + Returns execution result, which is commit_lsn after sync. + """ + + command = ["postgres", "--sync-safekeepers"] + env = { + "PGDATA": self.pg_data_dir_path(), + } + + basepath = self.pg_bin.run_capture(command, env) + + log.info(f"postgres --sync-safekeepers output: {basepath}") + + stdout_filename = basepath + ".stdout" + + with open(stdout_filename, "r") as stdout_f: + stdout = stdout_f.read() + return Lsn(stdout.strip("\n ")) + + def initdb(self): + """Run initdb""" + + args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] + self.pg_bin.run(args) + + def start(self): + """Start postgres with pg_ctl""" + + log_path = os.path.join(self.pg_data_dir_path(), "pg.log") + args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-l", log_path, "-w", "start"] + self.pg_bin.run(args) + + def stop(self): + """Stop postgres with pg_ctl""" + + args = ["pg_ctl", "-D", self.pg_data_dir_path(), "-m", "immediate", "-w", "stop"] + self.pg_bin.run(args) + + +# insert wal in all safekeepers and run sync on proposer +def test_sync_safekeepers( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor, +): + + # We don't really need the full environment for this test, just the + # safekeepers would be enough. + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + # write config for proposer + pgdata_dir = os.path.join(env.repo_dir, "proposer_pgdata") + pg = ProposerPostgres( + pgdata_dir, pg_bin, tenant_id, timeline_id, "127.0.0.1", port_distributor.get_port() + ) + pg.create_dir_config(env.get_safekeeper_connstrs()) + + # valid lsn, which is not in the segment start, nor in zero segment + epoch_start_lsn = Lsn("0/16B9188") + begin_lsn = epoch_start_lsn + + # append and commit WAL + lsn_after_append = [] + for i in range(3): + res = env.safekeepers[i].append_logical_message( + tenant_id, + timeline_id, + { + "lm_prefix": "prefix", + "lm_message": "message", + "set_commit_lsn": True, + "send_proposer_elected": True, + "term": 2, + "begin_lsn": int(begin_lsn), + "epoch_start_lsn": int(epoch_start_lsn), + "truncate_lsn": int(epoch_start_lsn), + "pg_version": int(env.pg_version) * 10000, + }, + ) + lsn = Lsn(res["inserted_wal"]["end_lsn"]) + lsn_after_append.append(lsn) + log.info(f"safekeeper[{i}] lsn after append: {lsn}") + + # run sync safekeepers + lsn_after_sync = pg.sync_safekeepers() + log.info(f"lsn after sync = {lsn_after_sync}") + + assert all(lsn_after_sync == lsn for lsn in lsn_after_append) + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_timeline_status") + pg = env.postgres.create_start("test_timeline_status") + + wa = env.safekeepers[0] + + # learn neon timeline from compute + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + if not auth_enabled: + wa_http_cli = wa.http_client() + wa_http_cli.check_status() + else: + wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + wa_http_cli.check_status() + 
wa_http_cli_bad = wa.http_client( + auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) + ) + wa_http_cli_bad.check_status() + wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth.check_status() + + # fetch something sensible from status + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch = tli_status.acceptor_epoch + timeline_start_lsn = tli_status.timeline_start_lsn + + if auth_enabled: + for cli in [wa_http_cli_bad, wa_http_cli_noauth]: + with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"): + cli.timeline_status(tenant_id, timeline_id) + + pg.safe_psql("create table t(i int)") + + # ensure epoch goes up after reboot + pg.stop().start() + pg.safe_psql("insert into t values(10)") + + tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) + epoch_after_reboot = tli_status.acceptor_epoch + assert epoch_after_reboot > epoch + + # and timeline_start_lsn stays the same + assert tli_status.timeline_start_lsn == timeline_start_lsn + + +class SafekeeperEnv: + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + pg_bin: PgBin, + neon_binpath: Path, + num_safekeepers: int = 1, + ): + self.repo_dir = repo_dir + self.port_distributor = port_distributor + self.broker = Etcd( + datadir=os.path.join(self.repo_dir, "etcd"), + port=self.port_distributor.get_port(), + peer_port=self.port_distributor.get_port(), + ) + self.pg_bin = pg_bin + self.num_safekeepers = num_safekeepers + self.bin_safekeeper = str(neon_binpath / "safekeeper") + self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None + self.postgres: Optional[ProposerPostgres] = None + self.tenant_id: Optional[TenantId] = None + self.timeline_id: Optional[TimelineId] = None + + def init(self) -> "SafekeeperEnv": + assert self.postgres is None, "postgres is already initialized" + assert self.safekeepers is None, "safekeepers are already initialized" + + self.tenant_id = TenantId.generate() + self.timeline_id = TimelineId.generate() + self.repo_dir.mkdir(exist_ok=True) + + # Create config and a Safekeeper object for each safekeeper + self.safekeepers = [] + for i in range(1, self.num_safekeepers + 1): + self.safekeepers.append(self.start_safekeeper(i)) + + # Create and start postgres + self.postgres = self.create_postgres() + self.postgres.start() + + return self + + def start_safekeeper(self, i): + port = SafekeeperPort( + pg=self.port_distributor.get_port(), + http=self.port_distributor.get_port(), + ) + + safekeeper_dir = self.repo_dir / f"sk{i}" + safekeeper_dir.mkdir(exist_ok=True) + + cmd = [ + self.bin_safekeeper, + "-l", + f"127.0.0.1:{port.pg}", + "--listen-http", + f"127.0.0.1:{port.http}", + "-D", + str(safekeeper_dir), + "--id", + str(i), + "--broker-endpoints", + self.broker.client_url(), + ] + log.info(f'Running command "{" ".join(cmd)}"') + + safekeeper_client = SafekeeperHttpClient( + port=port.http, + auth_token=None, + ) + try: + safekeeper_process = start_in_background( + cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status + ) + return safekeeper_process + except Exception as e: + log.error(e) + safekeeper_process.kill() + raise Exception(f"Failed to start safekepeer as {cmd}, reason: {e}") + + def get_safekeeper_connstrs(self): + return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) + + def create_postgres(self): + pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") + pg = ProposerPostgres( + pgdata_dir, + self.pg_bin, + self.tenant_id, + self.timeline_id, + "127.0.0.1", + 
self.port_distributor.get_port(), + ) + pg.initdb() + pg.create_dir_config(self.get_safekeeper_connstrs()) + return pg + + def kill_safekeeper(self, sk_dir): + """Read pid file and kill process""" + pid_file = os.path.join(sk_dir, "safekeeper.pid") + with open(pid_file, "r") as f: + pid = int(f.read()) + log.info(f"Killing safekeeper with pid {pid}") + os.kill(pid, signal.SIGKILL) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + log.info("Cleaning up all safekeeper and compute nodes") + + # Stop all the nodes + if self.postgres is not None: + self.postgres.stop() + if self.safekeepers is not None: + for sk_proc in self.safekeepers: + self.kill_safekeeper(sk_proc.args[6]) + + +def test_safekeeper_without_pageserver( + test_output_dir: str, + port_distributor: PortDistributor, + pg_bin: PgBin, + neon_binpath: Path, +): + # Create the environment in the test-specific output dir + repo_dir = Path(os.path.join(test_output_dir, "repo")) + + env = SafekeeperEnv( + repo_dir, + port_distributor, + pg_bin, + neon_binpath, + ) + + with env: + env.init() + assert env.postgres is not None + + env.postgres.safe_psql("create table t(i int)") + env.postgres.safe_psql("insert into t select generate_series(1, 100)") + res = env.postgres.safe_psql("select sum(i) from t")[0][0] + assert res == 5050 + + +def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): + def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: + return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) + + def execute_payload(pg: Postgres): + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)") + cur.execute("INSERT INTO t VALUES (0, 'something')") + sum_before = query_scalar(cur, "SELECT SUM(key) FROM t") + + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + sum_after = query_scalar(cur, "SELECT SUM(key) FROM t") + assert sum_after == sum_before + 5000050000 + + def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + for sk in safekeepers: + http_cli = sk.http_client() + try: + status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"Safekeeper {sk.id} status: {status}") + except Exception as e: + log.info(f"Safekeeper {sk.id} status error: {e}") + + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_replace_safekeeper") + + log.info("Use only first 3 safekeepers") + env.safekeepers[3].stop() + active_safekeepers = [1, 2, 3] + pg = env.postgres.create("test_replace_safekeeper") + pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + pg.start() + + # learn neon timeline from compute + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + execute_payload(pg) + show_statuses(env.safekeepers, tenant_id, timeline_id) + + log.info("Restart all safekeepers to flush everything") + env.safekeepers[0].stop(immediate=True) + execute_payload(pg) + env.safekeepers[0].start() + env.safekeepers[1].stop(immediate=True) + execute_payload(pg) + env.safekeepers[1].start() + env.safekeepers[2].stop(immediate=True) + execute_payload(pg) + env.safekeepers[2].start() + + env.safekeepers[0].stop(immediate=True) + env.safekeepers[1].stop(immediate=True) + 
env.safekeepers[2].stop(immediate=True) + env.safekeepers[0].start() + env.safekeepers[1].start() + env.safekeepers[2].start() + + execute_payload(pg) + show_statuses(env.safekeepers, tenant_id, timeline_id) + + log.info("Stop sk1 (simulate failure) and use only quorum of sk2 and sk3") + env.safekeepers[0].stop(immediate=True) + execute_payload(pg) + show_statuses(env.safekeepers, tenant_id, timeline_id) + + log.info("Recreate postgres to replace failed sk1 with new sk4") + pg.stop_and_destroy().create("test_replace_safekeeper") + active_safekeepers = [2, 3, 4] + env.safekeepers[3].start() + pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + pg.start() + + execute_payload(pg) + show_statuses(env.safekeepers, tenant_id, timeline_id) + + log.info("Stop sk2 to require quorum of sk3 and sk4 for normal work") + env.safekeepers[1].stop(immediate=True) + execute_payload(pg) + show_statuses(env.safekeepers, tenant_id, timeline_id) + + +# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted +# to all safekeepers. This test checks that compute WAL can fit into small number +# of WAL segments. +def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): + # used to calculate delta in collect_stats + last_lsn = Lsn(0) + + # returns pg_wal size in MB + def collect_stats(pg: Postgres, cur, enable_logs=True): + nonlocal last_lsn + assert pg.pgdata_dir is not None + + log.info("executing INSERT to generate WAL") + current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + pg_wal_size_mb = get_dir_size(os.path.join(pg.pgdata_dir, "pg_wal")) / 1024 / 1024 + if enable_logs: + lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 + log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") + last_lsn = current_lsn + return pg_wal_size_mb + + # generates about ~20MB of WAL, to create at least one new segment + def generate_wal(cur): + cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_wal_deleted_after_broadcast") + # Adjust checkpoint config to prevent keeping old WAL segments + pg = env.postgres.create_start( + "test_wal_deleted_after_broadcast", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute("CREATE TABLE t(key int, value text)") + + collect_stats(pg, cur) + + # generate WAL to simulate normal workload + for i in range(5): + generate_wal(cur) + collect_stats(pg, cur) + + log.info("executing checkpoint") + cur.execute("CHECKPOINT") + wal_size_after_checkpoint = collect_stats(pg, cur) + + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert wal_size_after_checkpoint < 16 * 2.5 + + +@pytest.mark.parametrize("auth_enabled", [False, True]) +def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() + + # Create two tenants: one will be deleted, other should be preserved. 
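+    # The initial tenant gets four branches: br1/br3 stay active, while br2/br4 are
+    # stopped later so their timelines become inactive on the safekeeper. The second
+    # tenant has a single timeline that must survive every deletion below.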
+ tenant_id = env.initial_tenant + timeline_id_1 = env.neon_cli.create_branch("br1") # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch("br2") # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch("br3") # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch("br4") # Inactive, delete with the tenant + + tenant_id_other, timeline_id_other = env.neon_cli.create_tenant() + + # Populate branches + pg_1 = env.postgres.create_start("br1") + pg_2 = env.postgres.create_start("br2") + pg_3 = env.postgres.create_start("br3") + pg_4 = env.postgres.create_start("br4") + pg_other = env.postgres.create_start("main", tenant_id=tenant_id_other) + for pg in [pg_1, pg_2, pg_3, pg_4, pg_other]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key)") + sk = env.safekeepers[0] + sk_data_dir = Path(sk.data_dir()) + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) + ) + sk_http_noauth = sk.http_client() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state. + pg_2.stop_and_destroy() + pg_4.stop_and_destroy() + sk.stop() + sk.start() + + # Ensure connections to Safekeeper are established + for pg in [pg_1, pg_3, pg_other]: + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (1)") + + # Remove initial tenant's br1 (active) + assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure repeated deletion succeeds + assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant's br2 (inactive) + assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"] + assert not 
(sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove non-existing branch, should succeed + assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() + assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() + assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant fully (two branches are active) + response = sk_http.tenant_delete_force(tenant_id) + assert response[str(timeline_id_3)]["dir_existed"] + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Remove initial tenant again. + response = sk_http.tenant_delete_force(tenant_id) + assert response == {} + assert not (sk_data_dir / str(tenant_id)).exists() + assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() + + # Ensure the other tenant still works + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) + with closing(pg_other.connect()) as conn: + with conn.cursor() as cur: + cur.execute("INSERT INTO t (key) VALUES (123)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py new file mode 100644 index 0000000000..70ae6bae18 --- /dev/null +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -0,0 +1,598 @@ +import asyncio +import random +import time +from dataclasses import dataclass +from typing import List, Optional + +import asyncpg +from fixtures.log_helper import getLogger +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper +from fixtures.types import Lsn, TenantId, TimelineId + +log = getLogger("root.safekeeper_async") + + +class BankClient(object): + def __init__(self, conn: asyncpg.Connection, n_accounts, init_amount): + self.conn: asyncpg.Connection = conn + self.n_accounts = n_accounts + self.init_amount = init_amount + + async def initdb(self): + await self.conn.execute("DROP TABLE IF EXISTS bank_accs") + await self.conn.execute("CREATE TABLE bank_accs(uid int primary key, amount int)") + await self.conn.execute( + """ + INSERT INTO bank_accs + SELECT *, $1 FROM generate_series(0, $2) + """, + self.init_amount, + self.n_accounts - 1, + ) + await self.conn.execute("DROP TABLE IF EXISTS bank_log") + await self.conn.execute("CREATE TABLE bank_log(from_uid int, to_uid int, amount int)") + + async def check_invariant(self): + row = await self.conn.fetchrow("SELECT sum(amount) AS sum FROM bank_accs") + assert row["sum"] == self.n_accounts * self.init_amount + + +async def bank_transfer(conn: asyncpg.Connection, from_uid, to_uid, amount): + # avoid deadlocks by sorting uids + if from_uid > to_uid: + from_uid, to_uid, amount = to_uid, from_uid, -amount + + async with conn.transaction(): + await conn.execute( + "UPDATE bank_accs SET amount = amount + ($1) WHERE uid = $2", + amount, + to_uid, + ) + await conn.execute( + "UPDATE bank_accs SET amount = amount - ($1) WHERE uid = $2", + amount, + from_uid, + ) + await 
conn.execute( + "INSERT INTO bank_log VALUES ($1, $2, $3)", + from_uid, + to_uid, + amount, + ) + + +class WorkerStats(object): + def __init__(self, n_workers): + self.counters = [0] * n_workers + self.running = True + + def reset(self): + self.counters = [0] * len(self.counters) + + def inc_progress(self, worker_id): + self.counters[worker_id] += 1 + + def check_progress(self): + log.debug("Workers progress: {}".format(self.counters)) + + # every worker should finish at least one tx + assert all(cnt > 0 for cnt in self.counters) + + progress = sum(self.counters) + log.info("All workers made {} transactions".format(progress)) + + +async def run_random_worker(stats: WorkerStats, pg: Postgres, worker_id, n_accounts, max_transfer): + pg_conn = await pg.connect_async() + log.debug("Started worker {}".format(worker_id)) + + while stats.running: + from_uid = random.randint(0, n_accounts - 1) + to_uid = (from_uid + random.randint(1, n_accounts - 1)) % n_accounts + amount = random.randint(1, max_transfer) + + await bank_transfer(pg_conn, from_uid, to_uid, amount) + stats.inc_progress(worker_id) + + log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + + log.debug("Finished worker {}".format(worker_id)) + + await pg_conn.close() + + +async def wait_for_lsn( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + wait_lsn: Lsn, + polling_interval=1, + timeout=60, +): + """ + Poll flush_lsn from safekeeper until it's greater or equal than + provided wait_lsn. To do that, timeline_status is fetched from + safekeeper every polling_interval seconds. + """ + + started_at = time.time() + client = safekeeper.http_client() + + flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn + log.info( + f"Safekeeper at port {safekeeper.port.pg} has flush_lsn {flush_lsn}, waiting for lsn {wait_lsn}" + ) + + while wait_lsn > flush_lsn: + elapsed = time.time() - started_at + if elapsed > timeout: + raise RuntimeError( + f"timed out waiting for safekeeper at port {safekeeper.port.pg} to reach {wait_lsn}, current lsn is {flush_lsn}" + ) + + await asyncio.sleep(polling_interval) + flush_lsn = client.timeline_status(tenant_id, timeline_id).flush_lsn + log.debug(f"safekeeper port={safekeeper.port.pg} flush_lsn={flush_lsn} wait_lsn={wait_lsn}") + + +# This test will run several iterations and check progress in each of them. +# On each iteration 1 acceptor is stopped, and 2 others should allow +# background workers execute transactions. In the end, state should remain +# consistent. +async def run_restarts_under_load( + env: NeonEnv, + pg: Postgres, + acceptors: List[Safekeeper], + n_workers=10, + n_accounts=100, + init_amount=100000, + max_transfer=100, + period_time=4, + iterations=10, +): + # Set timeout for this test at 5 minutes. It should be enough for test to complete, + # taking into account that this timeout is checked only at the beginning of every iteration. 
+    test_timeout_at = time.monotonic() + 5 * 60
+
+    pg_conn = await pg.connect_async()
+    tenant_id = TenantId(await pg_conn.fetchval("show neon.tenant_id"))
+    timeline_id = TimelineId(await pg_conn.fetchval("show neon.timeline_id"))
+
+    bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount)
+    # create tables and initial balances
+    await bank.initdb()
+
+    stats = WorkerStats(n_workers)
+    workers = []
+    for worker_id in range(n_workers):
+        worker = run_random_worker(stats, pg, worker_id, bank.n_accounts, max_transfer)
+        workers.append(asyncio.create_task(worker))
+
+    for it in range(iterations):
+        assert time.monotonic() < test_timeout_at, "test timed out"
+
+        victim_idx = it % len(acceptors)
+        victim = acceptors[victim_idx]
+        victim.stop()
+
+        flush_lsn = Lsn(await pg_conn.fetchval("SELECT pg_current_wal_flush_lsn()"))
+        log.info(f"Postgres flush_lsn {flush_lsn}")
+
+        pageserver_lsn = Lsn(
+            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
+        )
+        sk_ps_lag = flush_lsn - pageserver_lsn
+        log.info(f"Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb")
+
+        # Wait until alive safekeepers catch up with postgres
+        for idx, safekeeper in enumerate(acceptors):
+            if idx != victim_idx:
+                await wait_for_lsn(safekeeper, tenant_id, timeline_id, flush_lsn)
+
+        stats.reset()
+        await asyncio.sleep(period_time)
+        # assert that at least one transaction has completed in every worker
+        stats.check_progress()
+
+        victim.start()
+
+    log.info("Iterations are finished, exiting coroutines...")
+    stats.running = False
+    # await all workers
+    await asyncio.gather(*workers)
+    # assert that the sum of the balances hasn't changed
+    await bank.check_invariant()
+    await pg_conn.close()
+
+
+# Restart acceptors one by one, while executing and validating bank transactions
+def test_restarts_under_load(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_safekeepers_restarts_under_load")
+    # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
+    pg = env.postgres.create_start(
+        "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"]
+    )
+
+    asyncio.run(run_restarts_under_load(env, pg, env.safekeepers))
+
+
+# Restart acceptors one by one and test that everything is working as expected
+# when checkpoints are triggered frequently by max_wal_size=32MB. Because we have
+# wal_keep_size=0, there will be aggressive recycling of WAL segments.
+def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_restarts_frequent_checkpoints") + # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long + pg = env.postgres.create_start( + "test_restarts_frequent_checkpoints", + config_lines=[ + "max_replication_write_lag=1MB", + "min_wal_size=32MB", + "max_wal_size=32MB", + "log_checkpoints=on", + ], + ) + + # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments + # are not removed before broadcasted to all safekeepers, with the help of replication slot + asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5)) + + +def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): + pg = Postgres( + env, + tenant_id=env.initial_tenant, + port=env.port_distributor.get_port(), + # In these tests compute has high probability of terminating on its own + # before our stop() due to lost consensus leadership. + check_stop_result=False, + ) + + # embed current time in node name + node_name = pgdir_name or f"pg_node_{time.time()}" + return pg.create_start( + branch_name=branch, node_name=node_name, config_lines=["log_statement=all"] + ) + + +async def exec_compute_query( + env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None +): + with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: + before_conn = time.time() + conn = await pg.connect_async() + res = await conn.fetch(query) + await conn.close() + after_conn = time.time() + log.info(f"{query} took {after_conn - before_conn}s") + return res + + +async def run_compute_restarts( + env: NeonEnv, queries=16, batch_insert=10000, branch="test_compute_restarts" +): + cnt = 0 + sum = 0 + + await exec_compute_query(env, branch, "CREATE TABLE t (i int)") + + for i in range(queries): + if i % 4 == 0: + await exec_compute_query( + env, branch, f"INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})" + ) + sum += batch_insert + cnt += batch_insert + elif (i % 4 == 1) or (i % 4 == 3): + # Note that select causes lots of FPI's and increases probability of safekeepers + # standing at different LSNs after compute termination. + actual_sum = (await exec_compute_query(env, branch, "SELECT SUM(i) FROM t"))[0][0] + assert actual_sum == sum, f"Expected sum={sum}, actual={actual_sum}" + elif i % 4 == 2: + await exec_compute_query(env, branch, "UPDATE t SET i = i + 1") + sum += cnt + + +# Add a test which creates compute for every query, and then destroys it right after. 
+def test_compute_restarts(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_compute_restarts") + asyncio.run(run_compute_restarts(env)) + + +class BackgroundCompute(object): + MAX_QUERY_GAP_SECONDS = 2 + + def __init__(self, index: int, env: NeonEnv, branch: str): + self.index = index + self.env = env + self.branch = branch + self.running = False + self.stopped = False + self.total_tries = 0 + self.successful_queries: List[int] = [] + + async def run(self): + if self.running: + raise Exception("BackgroundCompute is already running") + + self.running = True + i = 0 + while not self.stopped: + try: + verify_key = (self.index << 16) + i + i += 1 + self.total_tries += 1 + res = await exec_compute_query( + self.env, + self.branch, + f"INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key", + pgdir_name=f"bgcompute{self.index}_key{verify_key}", + ) + log.info(f"result: {res}") + if len(res) != 1: + raise Exception("No result returned") + if res[0][0] != verify_key: + raise Exception("Wrong result returned") + self.successful_queries.append(verify_key) + except Exception as e: + log.info(f"BackgroundCompute {self.index} query failed: {e}") + + # With less sleep, there is a very big chance of not committing + # anything or only 1 xact during test run. + await asyncio.sleep(random.uniform(0, self.MAX_QUERY_GAP_SECONDS)) + self.running = False + + +async def run_concurrent_computes( + env: NeonEnv, num_computes=10, run_seconds=20, branch="test_concurrent_computes" +): + await exec_compute_query( + env, branch, "CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)" + ) + + computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)] + background_tasks = [asyncio.create_task(compute.run()) for compute in computes] + + await asyncio.sleep(run_seconds) + log.info("stopping all tasks but one") + for compute in computes[1:]: + compute.stopped = True + await asyncio.gather(*background_tasks[1:]) + log.info("stopped all tasks but one") + + # work for some time with only one compute -- it should be able to make some xacts + TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 + initial_queries_by_0 = len(computes[0].successful_queries) + log.info( + f"Waiting for another query by computes[0], " + f"it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s" + ) + for _ in range(10 * TIMEOUT_SECONDS): + current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 + if current_queries_by_0 >= 1: + log.info( + f"Found {current_queries_by_0} successful queries " + f"by computes[0], completing the test" + ) + break + await asyncio.sleep(0.1) + else: + assert False, "Timed out while waiting for another query by computes[0]" + computes[0].stopped = True + + await asyncio.gather(background_tasks[0]) + + result = await exec_compute_query(env, branch, "SELECT * FROM query_log") + # we should have inserted something while single compute was running + log.info( + f"Executed {len(result)} queries, {current_queries_by_0} of them " + f"by computes[0] after we started stopping the others" + ) + for row in result: + log.info(f"{row[0]} {row[1]} {row[2]}") + + # ensure everything reported as committed wasn't lost + for compute in computes: + for verify_key in compute.successful_queries: + assert verify_key in [row[2] for row in result] + + +# Run multiple computes concurrently, creating-destroying them after 
single +# query. Ensure we don't lose any xacts reported as committed and be able to +# progress once only one compute remains. +def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_concurrent_computes") + asyncio.run(run_concurrent_computes(env)) + + +# Stop safekeeper and check that query cannot be executed while safekeeper is down. +# Query will insert a single row into a table. +async def check_unavailability( + sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 +): + # shutdown one of two acceptors, that is, majority + sk.stop() + + bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) + + await asyncio.sleep(start_delay_sec) + # ensure that the query has not been executed yet + assert not bg_query.done() + + # start safekeeper and await the query + sk.start() + await bg_query + assert bg_query.done() + + +async def run_unavailability(env: NeonEnv, pg: Postgres): + conn = await pg.connect_async() + + # check basic work with table + await conn.execute("CREATE TABLE t(key int primary key, value text)") + await conn.execute("INSERT INTO t values (1, 'payload')") + + # stop safekeeper and check that query cannot be executed while safekeeper is down + await check_unavailability(env.safekeepers[0], conn, 2) + + # for the world's balance, do the same with second safekeeper + await check_unavailability(env.safekeepers[1], conn, 3) + + # check that we can execute queries after restart + await conn.execute("INSERT INTO t values (4, 'payload')") + + result_sum = await conn.fetchval("SELECT sum(key) FROM t") + assert result_sum == 10 + + +# When majority of acceptors is offline, commits are expected to be frozen +def test_unavailability(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_safekeepers_unavailability") + pg = env.postgres.create_start("test_safekeepers_unavailability") + + asyncio.run(run_unavailability(env, pg)) + + +@dataclass +class RaceConditionTest: + iteration: int + is_stopped: bool + + +# shut down random subset of safekeeper, sleep, wake them up, rinse, repeat +async def xmas_garland(safekeepers: List[Safekeeper], data: RaceConditionTest): + while not data.is_stopped: + data.iteration += 1 + victims = [] + for sk in safekeepers: + if random.random() >= 0.5: + victims.append(sk) + log.info( + f"Iteration {data.iteration}: stopping {list(map(lambda sk: sk.id, victims))} safekeepers" + ) + for v in victims: + v.stop() + await asyncio.sleep(1) + for v in victims: + v.start() + log.info(f"Iteration {data.iteration} finished") + await asyncio.sleep(1) + + +async def run_race_conditions(env: NeonEnv, pg: Postgres): + conn = await pg.connect_async() + await conn.execute("CREATE TABLE t(key int primary key, value text)") + + data = RaceConditionTest(0, False) + bg_xmas = asyncio.create_task(xmas_garland(env.safekeepers, data)) + + n_iterations = 5 + expected_sum = 0 + i = 1 + + while data.iteration <= n_iterations: + await asyncio.sleep(0.005) + await conn.execute(f"INSERT INTO t values ({i}, 'payload')") + expected_sum += i + i += 1 + + log.info(f"Executed {i-1} queries") + + res = await conn.fetchval("SELECT sum(key) FROM t") + assert res == expected_sum + + data.is_stopped = True + await bg_xmas + + +# do inserts while concurrently getting up/down subsets of acceptors +def 
test_race_conditions(neon_env_builder: NeonEnvBuilder): + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_safekeepers_race_conditions") + pg = env.postgres.create_start("test_safekeepers_race_conditions") + + asyncio.run(run_race_conditions(env, pg)) + + +# Check that pageserver can select safekeeper with largest commit_lsn +# and switch if LSN is not updated for some time (NoWalTimeout). +async def run_wal_lagging(env: NeonEnv, pg: Postgres): + def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: + # use ports 10, 11 and 12 to simulate unavailable safekeepers + return ",".join( + [ + f"localhost:{sk.port.pg if active else 10 + i}" + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) + ] + ) + + conn = await pg.connect_async() + await conn.execute("CREATE TABLE t(key int primary key, value text)") + await conn.close() + pg.stop() + + n_iterations = 20 + n_txes = 10000 + expected_sum = 0 + i = 1 + quorum = len(env.safekeepers) // 2 + 1 + + for it in range(n_iterations): + active_sk = list(map(lambda _: random.random() >= 0.5, env.safekeepers)) + active_count = sum(active_sk) + + if active_count < quorum: + it -= 1 + continue + + pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + log.info(f"Iteration {it}: {active_sk}") + + pg.start() + conn = await pg.connect_async() + + for _ in range(n_txes): + await conn.execute(f"INSERT INTO t values ({i}, 'payload')") + expected_sum += i + i += 1 + + await conn.close() + pg.stop() + + pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + pg.start() + conn = await pg.connect_async() + + log.info(f"Executed {i-1} queries") + + res = await conn.fetchval("SELECT sum(key) FROM t") + assert res == expected_sum + + +# do inserts while restarting postgres and messing with safekeeper addresses +def test_wal_lagging(neon_env_builder: NeonEnvBuilder): + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_wal_lagging") + pg = env.postgres.create_start("test_wal_lagging") + + asyncio.run(run_wal_lagging(env, pg)) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py new file mode 100644 index 0000000000..e1b1e03515 --- /dev/null +++ b/test_runner/regress/test_wal_restore.py @@ -0,0 +1,36 @@ +from pathlib import Path + +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.types import TenantId + + +def test_wal_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + test_output_dir: Path, + port_distributor: PortDistributor, + base_dir: Path, + pg_distrib_dir: Path, +): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_wal_restore") + pg = env.postgres.create_start("test_wal_restore") + pg.safe_psql("create table t as select generate_series(1,300000)") + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + env.neon_cli.pageserver_stop() + port = port_distributor.get_port() + data_dir = test_output_dir / "pgsql.restored" + with VanillaPostgres( + data_dir, PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version), port + ) as restored: + pg_bin.run_capture( + [ + str(base_dir / "libs/utils/scripts/restore_from_wal.sh"), + str(pg_distrib_dir / f"v{env.pg_version}/bin"), + str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"), + str(data_dir), + str(port), + ] + ) + restored.start() + assert restored.safe_psql("select 
count(*) from t", user="cloud_admin") == [(300000,)] diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py new file mode 100644 index 0000000000..c79aea35da --- /dev/null +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -0,0 +1,104 @@ +import time + +import psutil +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException +from fixtures.types import TenantId + + +def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): + children = psutil.Process(pageserver_pid).children() + for child in children: + if not wal_redo_present: + assert "--wal-redo" not in child.cmdline() + if not defunct_present: + assert child.status() != psutil.STATUS_ZOMBIE + + +# Check that the pageserver doesn't leave behind WAL redo processes +# when a tenant is detached. We had an issue previously where we failed +# to wait and consume the exit code of the WAL redo process, leaving it behind +# as a zombie process. +def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text()) + + assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False) + + # first check for non existing tenant + tenant_id = TenantId.generate() + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Tenant not found for id {tenant_id}", + ): + pageserver_http.tenant_detach(tenant_id) + + # create new nenant + tenant_id, _ = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / str(tenant_id)).exists() + + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + # Create table, and insert some rows. Make it big enough that it doesn't fit in + # shared_buffers, otherwise the SELECT after restart will just return answer + # from shared_buffers without hitting the page server, which defeats the point + # of this test. + cur.execute("CREATE TABLE foo (t text)") + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Verify that the table is larger than shared_buffers + cur.execute( + """ + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + from pg_settings where name = 'shared_buffers' + """ + ) + row = cur.fetchone() + assert row is not None + log.info(f"shared_buffers is {row[0]}, table size {row[1]}") + assert int(row[0]) < int(row[1]) + + cur.execute("SELECT count(*) FROM foo") + assert cur.fetchone() == (100000,) + + # After filling the table and doing the SELECT, it is guaranteed that we did some WAL redo. + # So, assert that the WAL redo process is present. 
+ + # XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail + assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False) + + last_error = None + for i in range(3): + try: + pageserver_http.tenant_detach(tenant_id) + except Exception as e: + last_error = e + log.error(f"try {i} error detaching tenant: {e}") + continue + else: + break + # else is called if the loop finished without reaching "break" + else: + pytest.fail(f"could not detach tenant: {last_error}") + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / str(tenant_id)).exists() + + # Pageserver schedules kill+wait of the WAL redo process to the background runtime, + # asynchronously to tenant detach. Cut it some slack to complete kill+wait before + # checking. + time.sleep(1.0) + assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False) diff --git a/test_runner/zenith_regress/.gitignore b/test_runner/sql_regress/.gitignore similarity index 100% rename from test_runner/zenith_regress/.gitignore rename to test_runner/sql_regress/.gitignore diff --git a/test_runner/sql_regress/README.md b/test_runner/sql_regress/README.md new file mode 100644 index 0000000000..1ae8aaf61a --- /dev/null +++ b/test_runner/sql_regress/README.md @@ -0,0 +1,13 @@ +Simple tests that only need a PostgreSQL connection to run. +These are run by the regress/test_pg_regress.py test, which uses +the PostgreSQL pg_regress utility. + +To add a new SQL test: + +- add sql script to run to sql_regress/sql/testname.sql +- add expected output to sql_regress/expected/testname.out +- add testname to parallel_schedule + +That's it. +For more complex tests see PostgreSQL regression tests in src/test/regress. +These work basically the same.
diff --git a/test_runner/zenith_regress/expected/.gitignore b/test_runner/sql_regress/expected/.gitignore similarity index 100% rename from test_runner/zenith_regress/expected/.gitignore rename to test_runner/sql_regress/expected/.gitignore diff --git a/test_runner/zenith_regress/expected/zenith-cid.out b/test_runner/sql_regress/expected/neon-cid.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-cid.out rename to test_runner/sql_regress/expected/neon-cid.out diff --git a/test_runner/zenith_regress/expected/zenith-clog.out b/test_runner/sql_regress/expected/neon-clog.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-clog.out rename to test_runner/sql_regress/expected/neon-clog.out diff --git a/test_runner/zenith_regress/expected/zenith-rel-truncate.out b/test_runner/sql_regress/expected/neon-rel-truncate.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-rel-truncate.out rename to test_runner/sql_regress/expected/neon-rel-truncate.out diff --git a/test_runner/zenith_regress/expected/zenith-vacuum-full.out b/test_runner/sql_regress/expected/neon-vacuum-full.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-vacuum-full.out rename to test_runner/sql_regress/expected/neon-vacuum-full.out diff --git a/test_runner/zenith_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule similarity index 71% rename from test_runner/zenith_regress/parallel_schedule rename to test_runner/sql_regress/parallel_schedule index f64bf8a034..569c7b5066 100644 --- a/test_runner/zenith_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -4,7 +4,7 @@ # number of connections needed to run the tests. # ---------- -test: zenith-cid -test: zenith-rel-truncate -test: zenith-clog -test: zenith-vacuum-full +test: neon-cid +test: neon-rel-truncate +test: neon-clog +test: neon-vacuum-full diff --git a/test_runner/zenith_regress/sql/.gitignore b/test_runner/sql_regress/sql/.gitignore similarity index 100% rename from test_runner/zenith_regress/sql/.gitignore rename to test_runner/sql_regress/sql/.gitignore diff --git a/test_runner/zenith_regress/sql/zenith-cid.sql b/test_runner/sql_regress/sql/neon-cid.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-cid.sql rename to test_runner/sql_regress/sql/neon-cid.sql diff --git a/test_runner/zenith_regress/sql/zenith-clog.sql b/test_runner/sql_regress/sql/neon-clog.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-clog.sql rename to test_runner/sql_regress/sql/neon-clog.sql diff --git a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql b/test_runner/sql_regress/sql/neon-rel-truncate.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-rel-truncate.sql rename to test_runner/sql_regress/sql/neon-rel-truncate.sql diff --git a/test_runner/zenith_regress/sql/zenith-vacuum-full.sql b/test_runner/sql_regress/sql/neon-vacuum-full.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-vacuum-full.sql rename to test_runner/sql_regress/sql/neon-vacuum-full.sql diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 56c735e87c..0281f4f48b 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,8 +1,9 @@ -import pytest import os -from fixtures.zenith_fixtures import ZenithEnv +import pytest from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + """ Use this test to see what 
happens when tests fail. @@ -13,17 +14,18 @@ Set the environment variable RUN_BROKEN to see this test run (and fail, and hopefully not leave any server processes behind). """ -run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, - reason="only used for testing the fixtures") +run_broken = pytest.mark.skipif( + os.environ.get("RUN_BROKEN") is None, reason="only used for testing the fixtures" +) @run_broken -def test_broken(zenith_simple_env: ZenithEnv, pg_bin): - env = zenith_simple_env +def test_broken(neon_simple_env: NeonEnv, pg_bin): + env = neon_simple_env - env.zenith_cli.create_branch("test_broken", "empty") + env.neon_cli.create_branch("test_broken", "empty") env.postgres.create_start("test_broken") - log.info('postgres is running') + log.info("postgres is running") - log.info('THIS NEXT COMMAND WILL FAIL:') - pg_bin.run('pgbench -i_am_a_broken_test'.split()) + log.info("THIS NEXT COMMAND WILL FAIL:") + pg_bin.run("pgbench -i_am_a_broken_test".split()) diff --git a/test_runner/zenith_regress/README.md b/test_runner/zenith_regress/README.md deleted file mode 100644 index 61e3aad04e..0000000000 --- a/test_runner/zenith_regress/README.md +++ /dev/null @@ -1,8 +0,0 @@ -To add a new SQL test - -- add sql script to run to zenith_regress/sql/testname.sql -- add expected output to zenith_regress/expected/testname.out -- add testname to parallel_schedule - -That's it. -For more complex tests see PostgreSQL regression tests. These works basically the same. diff --git a/vendor/postgres b/vendor/postgres deleted file mode 160000 index 31dc24ab29..0000000000 --- a/vendor/postgres +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 new file mode 160000 index 0000000000..cd0693e2be --- /dev/null +++ b/vendor/postgres-v14 @@ -0,0 +1 @@ +Subproject commit cd0693e2be224bedfa0b61f9c5e2ff4cd88eec2c diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 new file mode 160000 index 0000000000..e9e0fd5947 --- /dev/null +++ b/vendor/postgres-v15 @@ -0,0 +1 @@ +Subproject commit e9e0fd59477587ff571189f731e0f39bdfae57e3 diff --git a/walkeeper/Cargo.toml b/walkeeper/Cargo.toml deleted file mode 100644 index 193fc4acf6..0000000000 --- a/walkeeper/Cargo.toml +++ /dev/null @@ -1,37 +0,0 @@ -[package] -name = "walkeeper" -version = "0.1.0" -edition = "2021" - -[dependencies] -regex = "1.4.5" -bytes = "1.0.1" -byteorder = "1.4.3" -hyper = "0.14" -fs2 = "0.4.3" -lazy_static = "1.4.0" -serde_json = "1" -tracing = "0.1.27" -clap = "3.0" -daemonize = "0.4.1" -rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] } -tokio = { version = "1.11", features = ["macros"] } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -anyhow = "1.0" -crc32c = "0.6.0" -humantime = "2.1.0" -walkdir = "2" -signal-hook = "0.3.10" -serde = { version = "1.0", features = ["derive"] } -hex = "0.4.3" -const_format = "0.2.21" -tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } - -postgres_ffi = { path = "../postgres_ffi" } -workspace_hack = { path = "../workspace_hack" } -zenith_metrics = { path = "../zenith_metrics" } -zenith_utils = { path = "../zenith_utils" } - -[dev-dependencies] -tempfile = "3.2" diff --git 
a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs deleted file mode 100644 index ea5d0cba14..0000000000 --- a/walkeeper/src/bin/safekeeper.rs +++ /dev/null @@ -1,262 +0,0 @@ -// -// Main entry point for the safekeeper executable -// -use anyhow::{Context, Result}; -use clap::{App, Arg}; -use const_format::formatcp; -use daemonize::Daemonize; -use fs2::FileExt; -use std::fs::File; -use std::path::{Path, PathBuf}; -use std::thread; -use tracing::*; -use walkeeper::control_file::{self, CreateControlFile}; -use zenith_utils::http::endpoint; -use zenith_utils::{logging, tcp_listener, GIT_VERSION}; - -use tokio::sync::mpsc; -use walkeeper::callmemaybe; -use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; -use walkeeper::http; -use walkeeper::s3_offload; -use walkeeper::wal_service; -use walkeeper::SafeKeeperConf; -use zenith_utils::shutdown::exit_now; -use zenith_utils::signals; - -const LOCK_FILE_NAME: &str = "safekeeper.lock"; - -fn main() -> Result<()> { - zenith_metrics::set_common_metrics_prefix("safekeeper"); - let arg_matches = App::new("Zenith safekeeper") - .about("Store WAL stream to local file system and push it to WAL receivers") - .version(GIT_VERSION) - .arg( - Arg::new("datadir") - .short('D') - .long("dir") - .takes_value(true) - .help("Path to the safekeeper data directory"), - ) - .arg( - Arg::new("listen-pg") - .short('l') - .long("listen-pg") - .alias("listen") // for compatibility - .takes_value(true) - .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), - ) - .arg( - Arg::new("listen-http") - .long("listen-http") - .takes_value(true) - .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), - ) - // FIXME this argument is no longer needed since pageserver address is forwarded from compute. - // However because this argument is in use by console's e2e tests lets keep it for now and remove separately. - // So currently it is a noop. - .arg( - Arg::new("pageserver") - .short('p') - .long("pageserver") - .takes_value(true), - ) - .arg( - Arg::new("ttl") - .long("ttl") - .takes_value(true) - .help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"), - ) - .arg( - Arg::new("recall") - .long("recall") - .takes_value(true) - .help("Period for requestion pageserver to call for replication"), - ) - .arg( - Arg::new("daemonize") - .short('d') - .long("daemonize") - .takes_value(false) - .help("Run in the background"), - ) - .arg( - Arg::new("no-sync") - .short('n') - .long("no-sync") - .takes_value(false) - .help("Do not wait for changes to be written safely to disk"), - ) - .arg( - Arg::new("dump-control-file") - .long("dump-control-file") - .takes_value(true) - .help("Dump control file at path specifed by this argument and exit"), - ) - .get_matches(); - - if let Some(addr) = arg_matches.value_of("dump-control-file") { - let state = control_file::FileStorage::load_control_file( - Path::new(addr), - CreateControlFile::False, - )?; - let json = serde_json::to_string(&state)?; - print!("{}", json); - return Ok(()); - } - - let mut conf: SafeKeeperConf = Default::default(); - - if let Some(dir) = arg_matches.value_of("datadir") { - // change into the data directory. 
- std::env::set_current_dir(PathBuf::from(dir))?; - } - - if arg_matches.is_present("no-sync") { - conf.no_sync = true; - } - - if arg_matches.is_present("daemonize") { - conf.daemonize = true; - } - - if let Some(addr) = arg_matches.value_of("listen-pg") { - conf.listen_pg_addr = addr.to_owned(); - } - - if let Some(addr) = arg_matches.value_of("listen-http") { - conf.listen_http_addr = addr.to_owned(); - } - - if let Some(ttl) = arg_matches.value_of("ttl") { - conf.ttl = Some(humantime::parse_duration(ttl)?); - } - - if let Some(recall) = arg_matches.value_of("recall") { - conf.recall_period = humantime::parse_duration(recall)?; - } - - start_safekeeper(conf) -} - -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { - let log_file = logging::init("safekeeper.log", conf.daemonize)?; - - info!("version: {}", GIT_VERSION); - - // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(LOCK_FILE_NAME); - let lock_file = File::create(&lock_file_path).context("failed to open lockfile")?; - lock_file.try_lock_exclusive().with_context(|| { - format!( - "control file {} is locked by some other process", - lock_file_path.display() - ) - })?; - - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_http_addr, e); - e - })?; - - info!("Starting safekeeper on {}", conf.listen_pg_addr); - let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); - e - })?; - - // XXX: Don't spawn any threads before daemonizing! - if conf.daemonize { - info!("daemonizing..."); - - // There should'n be any logging to stdin/stdout. Redirect it to the main log so - // that we will see any accidental manual fprintf's or backtraces. - let stdout = log_file.try_clone().unwrap(); - let stderr = log_file; - - let daemonize = Daemonize::new() - .pid_file("safekeeper.pid") - .working_directory(Path::new(".")) - .stdout(stdout) - .stderr(stderr); - - // XXX: The parent process should exit abruptly right after - // it has spawned a child to prevent coverage machinery from - // dumping stats into a `profraw` file now owned by the child. - // Otherwise, the coverage data will be damaged. 
- match daemonize.exit_action(|| exit_now(0)).start() { - Ok(_) => info!("Success, daemonized"), - Err(e) => error!("Error, {}", e), - } - } - - let signals = signals::install_shutdown_handlers()?; - let mut threads = vec![]; - - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("http_endpoint_thread".into()) - .spawn(|| { - // TODO authentication - let router = http::make_router(conf_); - endpoint::serve_thread_main( - router, - http_listener, - std::future::pending(), // never shut down - ) - .unwrap(); - })?, - ); - - if conf.ttl.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("S3 offload thread".into()) - .spawn(|| { - s3_offload::thread_main(conf_); - })?, - ); - } - - let (tx, rx) = mpsc::unbounded_channel(); - let conf_cloned = conf.clone(); - let wal_acceptor_thread = thread::Builder::new() - .name("WAL acceptor thread".into()) - .spawn(|| { - // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener, tx); - if let Err(e) = thread_result { - info!("wal_service thread terminated: {}", e); - } - }) - .unwrap(); - - threads.push(wal_acceptor_thread); - - let callmemaybe_thread = thread::Builder::new() - .name("callmemaybe thread".into()) - .spawn(|| { - // thread code - let thread_result = callmemaybe::thread_main(conf, rx); - if let Err(e) = thread_result { - error!("callmemaybe thread terminated: {}", e); - } - }) - .unwrap(); - threads.push(callmemaybe_thread); - - // TODO: put more thoughts into handling of failed threads - // We probably should restart them. - - // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "Got {}. Terminating in immediate shutdown mode", - signal.name() - ); - std::process::exit(111); - }) -} diff --git a/walkeeper/src/callmemaybe.rs b/walkeeper/src/callmemaybe.rs deleted file mode 100644 index 1e52ec927b..0000000000 --- a/walkeeper/src/callmemaybe.rs +++ /dev/null @@ -1,303 +0,0 @@ -//! -//! Callmemaybe module is responsible for periodically requesting -//! pageserver to initiate wal streaming. -//! -//! Other threads can use CallmeEvent messages to subscribe or unsubscribe -//! from the call list. -//! 
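The module comment above, from the walkeeper crate being removed in this diff, captures the contract the rest of the safekeeper relied on: producers push `CallmeEvent` messages (Subscribe, Unsubscribe, Pause, Resume) into an unbounded channel, and a single consumer loop owns the subscription map and periodically asks the pageserver to start streaming. Below is a minimal, self-contained sketch of that producer/consumer shape, assuming only that tokio (with the `rt`, `macros`, and `sync` features) is available; `SubscriptionKey` and `Event` are simplified stand-ins for the crate's `SubscriptionStateKey` and `CallmeEvent`, not the real types.

```rust
use tokio::sync::mpsc;

// Illustrative stand-in for SubscriptionStateKey: identifies one
// (tenant, timeline, pageserver) subscription.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SubscriptionKey {
    tenant_id: String,
    timeline_id: String,
    pageserver_connstr: String,
}

// Illustrative stand-in for CallmeEvent (only two variants shown).
#[derive(Debug)]
enum Event {
    Subscribe(SubscriptionKey),
    Unsubscribe(SubscriptionKey),
}

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let (tx, mut rx) = mpsc::unbounded_channel::<Event>();

    // Producer side: a WAL-receiving connection registers its timeline when
    // the compute connects, and unregisters it when the connection drops.
    let key = SubscriptionKey {
        tenant_id: "some-tenant".into(),
        timeline_id: "some-timeline".into(),
        pageserver_connstr: "host=pageserver port=6400".into(),
    };
    tx.send(Event::Subscribe(key.clone())).unwrap();
    tx.send(Event::Unsubscribe(key)).unwrap();
    drop(tx); // closing the channel lets the consumer loop finish

    // Consumer side: one loop owns all subscription state and reacts to events.
    while let Some(event) = rx.recv().await {
        println!("callmemaybe event: {event:?}");
    }
}
```

The real `main_loop` further down additionally wakes on a `tokio::time::interval` ticker and re-issues the callback for any subscription that has not been called within `recall_period`.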
-use crate::SafeKeeperConf; -use anyhow::{Context, Result}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::sync::Mutex; -use std::time::{Duration, Instant}; -use tokio::runtime; -use tokio::sync::mpsc::UnboundedReceiver; -use tokio::task; -use tokio_postgres::NoTls; -use tracing::*; -use zenith_utils::connstring::connection_host_port; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -async fn request_callback( - pageserver_connstr: String, - listen_pg_addr_str: String, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> Result<()> { - info!( - "callmemaybe request_callback Connecting to pageserver {}", - &pageserver_connstr - ); - let (client, connection) = tokio_postgres::connect(&pageserver_connstr, NoTls).await?; - - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses - let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); - let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); - let (host, port) = connection_host_port(&me_conf); - - // pageserver connstr is needed to be able to distinguish between different pageservers - // it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved - // TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 - let callme = format!( - "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={} pageserver_connstr={}'", - tenantid, timelineid, host, port, timelineid, tenantid, pageserver_connstr, - ); - - let _ = client.simple_query(&callme).await?; - - Ok(()) -} - -pub fn thread_main(conf: SafeKeeperConf, rx: UnboundedReceiver) -> Result<()> { - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - runtime.block_on(main_loop(conf, rx)) -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub struct SubscriptionStateKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl SubscriptionStateKey { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, pageserver_connstr: String) -> Self { - Self { - tenant_id, - timeline_id, - pageserver_connstr, - } - } -} - -/// Messages to the callmemaybe thread -#[derive(Debug)] -pub enum CallmeEvent { - // add new subscription to the list - Subscribe(SubscriptionStateKey), - // remove the subscription from the list - Unsubscribe(SubscriptionStateKey), - // don't serve this subscription, but keep it in the list - Pause(SubscriptionStateKey), - // resume this subscription, if it exists, - // but don't create a new one if it is gone - Resume(SubscriptionStateKey), - // TODO how do we delete from subscriptions? 
-} - -#[derive(Debug)] -struct SubscriptionState { - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - handle: Option>, - last_call_time: Instant, - paused: bool, -} - -impl SubscriptionState { - fn new( - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - ) -> SubscriptionState { - SubscriptionState { - tenantid, - timelineid, - pageserver_connstr, - handle: None, - last_call_time: Instant::now(), - paused: false, - } - } - - fn pause(&mut self) { - self.paused = true; - self.abort_handle(); - } - - fn resume(&mut self) { - self.paused = false; - } - - // Most likely, the task have already successfully completed - // and abort() won't have any effect. - fn abort_handle(&mut self) { - if let Some(handle) = self.handle.take() { - handle.abort(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - tokio::spawn(async move { - if let Err(err) = handle.await { - if err.is_cancelled() { - warn!("callback task for timelineid={} tenantid={} was cancelled before spawning a new one", - timelineid, tenantid); - } else { - error!( - "callback task for timelineid={} tenantid={} pageserver_connstr={} failed: {}", - timelineid, tenantid, pageserver_connstr, err - ); - } - } - }); - } - } - - fn call(&mut self, recall_period: Duration, listen_pg_addr: String) { - // Ignore call request if this subscription is paused - if self.paused { - debug!( - "ignore call request for paused subscription \ - tenantid: {}, timelineid: {}", - self.tenantid, self.timelineid - ); - return; - } - - // Check if it too early to recall - if self.handle.is_some() && self.last_call_time.elapsed() < recall_period { - debug!( - "too early to recall. self.last_call_time.elapsed: {:?}, recall_period: {:?} \ - tenantid: {}, timelineid: {}", - self.last_call_time, recall_period, self.tenantid, self.timelineid - ); - return; - } - - // If previous task didn't complete in recall_period, it must be hanging, - // so don't wait for it forever, just abort it and try again. - self.abort_handle(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - self.handle = Some(tokio::spawn(async move { - request_callback(pageserver_connstr, listen_pg_addr, timelineid, tenantid) - .await - .unwrap_or_else(|e| { - error!( - "callback task for timelineid={} tenantid={} failed: {}", - timelineid, tenantid, e - ) - }); - })); - - // Update last_call_time - self.last_call_time = Instant::now(); - info!( - "new call spawned. last call time {:?} tenantid: {}, timelineid: {}", - self.last_call_time, self.tenantid, self.timelineid - ); - } -} - -impl Drop for SubscriptionState { - fn drop(&mut self) { - self.abort_handle(); - } -} - -pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver) -> Result<()> { - let subscriptions: Mutex> = - Mutex::new(HashMap::new()); - - let mut ticker = tokio::time::interval(conf.recall_period); - loop { - tokio::select! { - request = rx.recv() => - { - match request.context("done")? - { - CallmeEvent::Subscribe(key) => - { - let _enter = info_span!("callmemaybe: subscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // XXX this clone is ugly, is there a way to use the trick with Borrow trait with entry API? 
- // when we switch to node id instead of the connection string key will be Copy and there will be no need to clone - match subscriptions.entry(key.clone()) { - Entry::Occupied(_) => { - // Do nothing if subscription already exists - // If it is paused it means that there is already established replication connection. - // If it is not paused it will be polled with other subscriptions when timeout expires. - // This can occur when replication channel is established before subscription is added. - info!( - "subscription already exists", - ); - } - Entry::Vacant(entry) => { - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - } - }, - CallmeEvent::Unsubscribe(key) => { - let _enter = debug_span!("callmemaybe: unsubscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - debug!("unsubscribe"); - let mut subscriptions = subscriptions.lock().unwrap(); - subscriptions.remove(&key); - - }, - CallmeEvent::Pause(key) => { - let _enter = debug_span!("callmemaybe: pause", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // If pause received when no corresponding subscription exists it means that someone started replication - // without using callmemaybe. So we create subscription and pause it. - // In tenant relocation scenario subscribe call will be executed after pause when compute is restarted. - // In that case there is no need to create new/unpause existing subscription. - match subscriptions.entry(key.clone()) { - Entry::Occupied(mut sub) => { - debug!("pause existing"); - sub.get_mut().pause(); - } - Entry::Vacant(entry) => { - debug!("create paused"); - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.pause(); - } - } - }, - CallmeEvent::Resume(key) => { - debug!( - "callmemaybe. thread_main. resume callback request for timelineid={} tenantid={} pageserver_connstr={}", - key.timeline_id, key.tenant_id, key.pageserver_connstr, - ); - let mut subscriptions = subscriptions.lock().unwrap(); - if let Some(sub) = subscriptions.get_mut(&key) - { - sub.resume(); - }; - }, - } - }, - _ = ticker.tick() => { - let _enter = debug_span!("callmemaybe: tick").entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - - for (_, state) in subscriptions.iter_mut() { - state.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - }, - }; - } -} diff --git a/walkeeper/src/control_file_upgrade.rs b/walkeeper/src/control_file_upgrade.rs deleted file mode 100644 index 913bd02c1e..0000000000 --- a/walkeeper/src/control_file_upgrade.rs +++ /dev/null @@ -1,114 +0,0 @@ -//! Code to deal with safekeeper control file upgrades -use crate::safekeeper::{ - AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, -}; -use anyhow::{bail, Result}; -use serde::{Deserialize, Serialize}; -use tracing::*; -use zenith_utils::{ - bin_ser::LeSer, - lsn::Lsn, - pq_proto::SystemId, - zid::{ZTenantId, ZTimelineId}, -}; - -/// Persistent consensus state of the acceptor. 
-#[derive(Debug, Clone, Serialize, Deserialize)] -struct AcceptorStateV1 { - /// acceptor's last term it voted for (advanced in 1 phase) - term: Term, - /// acceptor's epoch (advanced, i.e. bumped to 'term' when VCL is reached). - epoch: Term, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -struct SafeKeeperStateV1 { - /// persistent acceptor state - acceptor_state: AcceptorStateV1, - /// information about server - server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed - /// for correctness, exists for monitoring purposes. - proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally - commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. - wal_start_lsn: Lsn, -} - -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ServerInfoV2 { - /// Postgres server version - pub pg_version: u32, - pub system_id: SystemId, - pub tenant_id: ZTenantId, - /// Zenith timelineid - pub ztli: ZTimelineId, - pub wal_seg_size: u32, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SafeKeeperStateV2 { - /// persistent acceptor state - pub acceptor_state: AcceptorState, - /// information about server - pub server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed - /// for correctness, exists for monitoring purposes. - pub proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally - pub commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - pub truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. 
- pub wal_start_lsn: Lsn, -} - -pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { - // migrate to storing full term history - if version == 1 { - info!("reading safekeeper control file version {}", version); - let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?; - let ac = AcceptorState { - term: oldstate.acceptor_state.term, - term_history: TermHistory(vec![TermSwitchEntry { - term: oldstate.acceptor_state.epoch, - lsn: Lsn(0), - }]), - }; - return Ok(SafeKeeperState { - acceptor_state: ac, - server: oldstate.server.clone(), - proposer_uuid: oldstate.proposer_uuid, - commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, - }); - // migrate to hexing some zids - } else if version == 2 { - info!("reading safekeeper control file version {}", version); - let oldstate = SafeKeeperStateV2::des(&buf[..buf.len()])?; - let server = ServerInfo { - pg_version: oldstate.server.pg_version, - system_id: oldstate.server.system_id, - tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, - wal_seg_size: oldstate.server.wal_seg_size, - }; - return Ok(SafeKeeperState { - acceptor_state: oldstate.acceptor_state, - server, - proposer_uuid: oldstate.proposer_uuid, - commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, - }); - } - bail!("unsupported safekeeper control file version {}", version) -} diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs deleted file mode 100644 index 5367954842..0000000000 --- a/walkeeper/src/handler.rs +++ /dev/null @@ -1,210 +0,0 @@ -//! Part of Safekeeper pretending to be Postgres, i.e. handling Postgres -//! protocol commands. - -use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; -use crate::receive_wal::ReceiveWalConn; -use crate::send_wal::ReplicationConn; -use crate::timeline::{Timeline, TimelineTools}; -use crate::SafeKeeperConf; -use anyhow::{bail, Context, Result}; - -use postgres_ffi::xlog_utils::PG_TLI; -use regex::Regex; -use std::str::FromStr; -use std::sync::Arc; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; -use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; - -use crate::callmemaybe::CallmeEvent; -use crate::control_file::CreateControlFile; -use tokio::sync::mpsc::UnboundedSender; - -/// Safekeeper handler of postgres commands -pub struct SafekeeperPostgresHandler { - pub conf: SafeKeeperConf, - /// assigned application name - pub appname: Option, - pub ztenantid: Option, - pub ztimelineid: Option, - pub timeline: Option>, - pageserver_connstr: Option, - //sender to communicate with callmemaybe thread - pub tx: UnboundedSender, -} - -/// Parsed Postgres command. -enum SafekeeperPostgresCommand { - StartWalPush { pageserver_connstr: Option }, - StartReplication { start_lsn: Lsn }, - IdentifySystem, - JSONCtrl { cmd: AppendLogicalMessage }, -} - -fn parse_cmd(cmd: &str) -> Result { - if cmd.starts_with("START_WAL_PUSH") { - let re = Regex::new(r"START_WAL_PUSH(?: (.+))?").unwrap(); - - let caps = re.captures(cmd).unwrap(); - let pageserver_connstr = caps.get(1).map(|m| m.as_str().to_owned()); - Ok(SafekeeperPostgresCommand::StartWalPush { pageserver_connstr }) - } else if cmd.starts_with("START_REPLICATION") { - let re = - Regex::new(r"START_REPLICATION(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); - let mut caps = re.captures_iter(cmd); - let start_lsn = caps - .next() - .map(|cap| cap[1].parse::()) - .context("failed to parse start LSN from START_REPLICATION command")??; - Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) - } else if cmd.starts_with("IDENTIFY_SYSTEM") { - Ok(SafekeeperPostgresCommand::IdentifySystem) - } else if cmd.starts_with("JSON_CTRL") { - let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; - Ok(SafekeeperPostgresCommand::JSONCtrl { - cmd: serde_json::from_str(cmd)?, - }) - } else { - bail!("unsupported command {}", cmd); - } -} - -impl postgres_backend::Handler for SafekeeperPostgresHandler { - // ztenant id and ztimeline id are passed in connection string params - fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> { - if let FeStartupPacket::StartupMessage { params, .. } = sm { - self.ztenantid = match params.get("ztenantid") { - Some(z) => Some(ZTenantId::from_str(z)?), // just curious, can I do that from .map? - _ => None, - }; - - self.ztimelineid = match params.get("ztimelineid") { - Some(z) => Some(ZTimelineId::from_str(z)?), - _ => None, - }; - - if let Some(app_name) = params.get("application_name") { - self.appname = Some(app_name.clone()); - } - - self.pageserver_connstr = params.get("pageserver_connstr").cloned(); - - Ok(()) - } else { - bail!("Walkeeper received unexpected initial message: {:?}", sm); - } - } - - fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { - let cmd = parse_cmd(query_string)?; - - // Is this command is ztimeline scoped? - match cmd { - SafekeeperPostgresCommand::StartWalPush { .. } - | SafekeeperPostgresCommand::StartReplication { .. } - | SafekeeperPostgresCommand::IdentifySystem - | SafekeeperPostgresCommand::JSONCtrl { .. } => { - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; - if self.timeline.is_none() { - // START_WAL_PUSH is the only command that initializes the timeline in production. - // There is also JSON_CTRL command, which should initialize the timeline for testing. - let create_control_file = match cmd { - SafekeeperPostgresCommand::StartWalPush { .. } - | SafekeeperPostgresCommand::JSONCtrl { .. 
} => CreateControlFile::True, - _ => CreateControlFile::False, - }; - self.timeline.set( - &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), - create_control_file, - )?; - } - } - } - - match cmd { - SafekeeperPostgresCommand::StartWalPush { pageserver_connstr } => { - ReceiveWalConn::new(pgb, pageserver_connstr) - .run(self) - .context("failed to run ReceiveWalConn")?; - } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - ReplicationConn::new(pgb) - .run(self, pgb, start_lsn, self.pageserver_connstr.clone()) - .context("failed to run ReplicationConn")?; - } - SafekeeperPostgresCommand::IdentifySystem => { - self.handle_identify_system(pgb)?; - } - SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { - handle_json_ctrl(self, pgb, cmd)?; - } - } - Ok(()) - } -} - -impl SafekeeperPostgresHandler { - pub fn new(conf: SafeKeeperConf, tx: UnboundedSender) -> Self { - SafekeeperPostgresHandler { - conf, - appname: None, - ztenantid: None, - ztimelineid: None, - timeline: None, - pageserver_connstr: None, - tx, - } - } - - /// - /// Handle IDENTIFY_SYSTEM replication command - /// - fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> { - let start_pos = self.timeline.get().get_end_of_wal(); - let lsn = start_pos.to_string(); - let sysid = self.timeline.get().get_info().server.system_id.to_string(); - let lsn_bytes = lsn.as_bytes(); - let tli = PG_TLI.to_string(); - let tli_bytes = tli.as_bytes(); - let sysid_bytes = sysid.as_bytes(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor { - name: b"systemid", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }, - RowDescriptor { - name: b"timeline", - typoid: INT4_OID, - typlen: 4, - ..Default::default() - }, - RowDescriptor { - name: b"xlogpos", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }, - RowDescriptor { - name: b"dbname", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }, - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(sysid_bytes), - Some(tli_bytes), - Some(lsn_bytes), - None, - ]))? - .write_message(&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"))?; - Ok(()) - } -} diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs deleted file mode 100644 index 11a29ac6d3..0000000000 --- a/walkeeper/src/http/routes.rs +++ /dev/null @@ -1,105 +0,0 @@ -use hyper::{Body, Request, Response, StatusCode}; -use serde::Serialize; -use serde::Serializer; -use std::fmt::Display; -use std::sync::Arc; -use zenith_utils::http::{RequestExt, RouterBuilder}; -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; - -use crate::control_file::CreateControlFile; -use crate::safekeeper::Term; -use crate::safekeeper::TermHistory; -use crate::timeline::GlobalTimelines; -use crate::SafeKeeperConf; -use zenith_utils::http::endpoint; -use zenith_utils::http::error::ApiError; -use zenith_utils::http::json::json_response; -use zenith_utils::http::request::parse_request_param; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -/// Healthcheck handler. -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) -} - -fn get_conf(request: &Request) -> &SafeKeeperConf { - request - .data::>() - .expect("unknown state type") - .as_ref() -} - -/// Serialize through Display trait. 
-fn display_serialize(z: &F, s: S) -> Result -where - S: Serializer, - F: Display, -{ - s.serialize_str(&format!("{}", z)) -} - -/// Augment AcceptorState with epoch for convenience -#[derive(Debug, Serialize)] -struct AcceptorStateStatus { - term: Term, - epoch: Term, - term_history: TermHistory, -} - -/// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize)] -struct TimelineStatus { - #[serde(serialize_with = "display_serialize")] - tenant_id: ZTenantId, - #[serde(serialize_with = "display_serialize")] - timeline_id: ZTimelineId, - acceptor_state: AcceptorStateStatus, - #[serde(serialize_with = "display_serialize")] - commit_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - truncate_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, -} - -/// Report info about timeline. -async fn timeline_status_handler(request: Request) -> Result, ApiError> { - let zttid = ZTenantTimelineId::new( - parse_request_param(&request, "tenant_id")?, - parse_request_param(&request, "timeline_id")?, - ); - - let tli = GlobalTimelines::get(get_conf(&request), zttid, CreateControlFile::False) - .map_err(ApiError::from_err)?; - let sk_state = tli.get_info(); - let flush_lsn = tli.get_end_of_wal(); - - let acc_state = AcceptorStateStatus { - term: sk_state.acceptor_state.term, - epoch: sk_state.acceptor_state.get_epoch(flush_lsn), - term_history: sk_state.acceptor_state.term_history, - }; - - let status = TimelineStatus { - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, - acceptor_state: acc_state, - commit_lsn: sk_state.commit_lsn, - truncate_lsn: sk_state.truncate_lsn, - flush_lsn, - }; - Ok(json_response(StatusCode::OK, status)?) -} - -/// Safekeeper http router. -pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { - let router = endpoint::make_router(); - router - .data(Arc::new(conf)) - .get("/v1/status", status_handler) - .get( - "/v1/timeline/:tenant_id/:timeline_id", - timeline_status_handler, - ) -} diff --git a/walkeeper/src/json_ctrl.rs b/walkeeper/src/json_ctrl.rs deleted file mode 100644 index 715ed559a9..0000000000 --- a/walkeeper/src/json_ctrl.rs +++ /dev/null @@ -1,268 +0,0 @@ -//! -//! This module implements JSON_CTRL protocol, which allows exchange -//! JSON messages over psql for testing purposes. -//! -//! Currently supports AppendLogicalMessage, which is used for WAL -//! modifications in tests. -//! 
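As the module comment above says, JSON_CTRL lets a test drive the safekeeper by sending a JSON document over an ordinary psql connection; `parse_cmd` in handler.rs strips the `JSON_CTRL` prefix and deserializes the rest into the `AppendLogicalMessage` struct defined below. The following is a minimal sketch of building such a payload, assuming only that serde_json is available; the field names mirror `AppendLogicalMessage`, but plain integers stand in for `Lsn` values, so the exact LSN encoding the safekeeper expects may differ.

```rust
fn main() {
    // Hypothetical payload; field names follow AppendLogicalMessage,
    // values are made up for illustration.
    let cmd = serde_json::json!({
        "lm_prefix": "prefix",
        "lm_message": "message",
        "set_commit_lsn": true,
        "send_proposer_elected": true,
        "term": 2,
        "epoch_start_lsn": 0x0169_0000u64,
        "begin_lsn": 0x0169_0000u64,
        "truncate_lsn": 0u64
    });
    // A test client would issue this as a simple query string; the safekeeper
    // answers with a JSON row describing its state and the inserted WAL.
    println!("JSON_CTRL {cmd}");
}
```

On the receiving side, `handle_json_ctrl` then crafts a LogicalMessage WAL record from `lm_prefix`/`lm_message` and feeds it through the normal `ProposerAcceptorMessage` path, which is what makes it convenient for constructing specific WAL histories in tests.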
- -use anyhow::Result; -use bytes::{BufMut, Bytes, BytesMut}; -use crc32c::crc32c_append; -use serde::{Deserialize, Serialize}; -use tracing::*; - -use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; -use crate::safekeeper::{ - AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting, -}; -use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry}; -use crate::timeline::TimelineTools; -use postgres_ffi::pg_constants; -use postgres_ffi::xlog_utils; -use postgres_ffi::{uint32, uint64, Oid, XLogRecord}; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, RowDescriptor, TEXT_OID}; - -#[derive(Serialize, Deserialize, Debug)] -pub struct AppendLogicalMessage { - // prefix and message to build LogicalMessage - lm_prefix: String, - lm_message: String, - - // if true, commit_lsn will match flush_lsn after append - set_commit_lsn: bool, - - // if true, ProposerElected will be sent before append - send_proposer_elected: bool, - - // fields from AppendRequestHeader - term: Term, - epoch_start_lsn: Lsn, - begin_lsn: Lsn, - truncate_lsn: Lsn, -} - -#[derive(Serialize, Deserialize)] -struct AppendResult { - // safekeeper state after append - state: SafeKeeperState, - // info about new record in the WAL - inserted_wal: InsertedWAL, -} - -/// Handles command to craft logical message WAL record with given -/// content, and then append it with specified term and lsn. This -/// function is used to test safekeepers in different scenarios. -pub fn handle_json_ctrl( - spg: &mut SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - append_request: &AppendLogicalMessage, -) -> Result<()> { - info!("JSON_CTRL request: {:?}", append_request); - - // need to init safekeeper state before AppendRequest - prepare_safekeeper(spg)?; - - // if send_proposer_elected is true, we need to update local history - if append_request.send_proposer_elected { - send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?; - } - - let inserted_wal = append_logical_message(spg, append_request)?; - let response = AppendResult { - state: spg.timeline.get().get_info(), - inserted_wal, - }; - let response_data = serde_json::to_vec(&response)?; - - pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor { - name: b"json", - typoid: TEXT_OID, - typlen: -1, - ..Default::default() - }]))? - .write_message_noflush(&BeMessage::DataRow(&[Some(&response_data)]))? - .write_message(&BeMessage::CommandComplete(b"JSON_CTRL"))?; - Ok(()) -} - -/// Prepare safekeeper to process append requests without crashes, -/// by sending ProposerGreeting with default server.wal_seg_size. 
-fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting { - protocol_version: 1, // current protocol - pg_version: 0, // unknown - proposer_id: [0u8; 16], - system_id: 0, - ztli: spg.ztimelineid.unwrap(), - tenant_id: spg.ztenantid.unwrap(), - tli: 0, - wal_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, // 16MB, default for tests - }); - - let response = spg.timeline.get().process_msg(&greeting_request)?; - match response { - Some(AcceptorProposerMessage::Greeting(_)) => Ok(()), - _ => anyhow::bail!("not GreetingResponse"), - } -} - -fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> { - // add new term to existing history - let history = spg.timeline.get().get_info().acceptor_state.term_history; - let history = history.up_to(lsn.checked_sub(1u64).unwrap()); - let mut history_entries = history.0; - history_entries.push(TermSwitchEntry { term, lsn }); - let history = TermHistory(history_entries); - - let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected { - term, - start_streaming_at: lsn, - term_history: history, - }); - - spg.timeline.get().process_msg(&proposer_elected_request)?; - Ok(()) -} - -#[derive(Serialize, Deserialize)] -struct InsertedWAL { - begin_lsn: Lsn, - end_lsn: Lsn, - append_response: AppendResponse, -} - -/// Extend local WAL with new LogicalMessage record. To do that, -/// create AppendRequest with new WAL and pass it to safekeeper. -fn append_logical_message( - spg: &mut SafekeeperPostgresHandler, - msg: &AppendLogicalMessage, -) -> Result { - let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); - let sk_state = spg.timeline.get().get_info(); - - let begin_lsn = msg.begin_lsn; - let end_lsn = begin_lsn + wal_data.len() as u64; - - let commit_lsn = if msg.set_commit_lsn { - end_lsn - } else { - sk_state.commit_lsn - }; - - let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { - h: AppendRequestHeader { - term: msg.term, - epoch_start_lsn: begin_lsn, - begin_lsn, - end_lsn, - commit_lsn, - truncate_lsn: msg.truncate_lsn, - proposer_uuid: [0u8; 16], - }, - wal_data: Bytes::from(wal_data), - }); - - let response = spg.timeline.get().process_msg(&append_request)?; - - let append_response = match response { - Some(AcceptorProposerMessage::AppendResponse(resp)) => resp, - _ => anyhow::bail!("not AppendResponse"), - }; - - Ok(InsertedWAL { - begin_lsn, - end_lsn, - append_response, - }) -} - -#[repr(C)] -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -struct XlLogicalMessage { - db_id: Oid, - transactional: uint32, // bool, takes 4 bytes due to alignment in C structures - prefix_size: uint64, - message_size: uint64, -} - -impl XlLogicalMessage { - pub fn encode(&self) -> Bytes { - use zenith_utils::bin_ser::LeSer; - self.ser().unwrap().into() - } -} - -/// Create new WAL record for non-transactional logical message. -/// Used for creating artificial WAL for tests, as LogicalMessage -/// record is basically no-op. 
-fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes = BytesMut::with_capacity(prefix.len() + 1); - prefix_bytes.put(prefix.as_bytes()); - prefix_bytes.put_u8(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = xlog_utils::XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode(); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..xlog_utils::XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode()); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal -} - -#[test] -fn test_encode_logical_message() { - let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, - 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, - 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, - ]; - let actual = encode_logical_message("prefix", "message"); - assert_eq!(expected, actual[..]); -} diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs deleted file mode 100644 index 6c3e0b264e..0000000000 --- a/walkeeper/src/lib.rs +++ /dev/null @@ -1,74 +0,0 @@ -// -use std::path::PathBuf; -use std::time::Duration; - -use zenith_utils::zid::ZTenantTimelineId; - -pub mod callmemaybe; -pub mod control_file; -pub mod control_file_upgrade; -pub mod handler; -pub mod http; -pub mod json_ctrl; -pub mod receive_wal; -pub mod s3_offload; -pub mod safekeeper; -pub mod send_wal; -pub mod timeline; -pub mod wal_service; -pub mod wal_storage; - -pub mod defaults { - use const_format::formatcp; - use std::time::Duration; - - pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; - pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); - - pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; - pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); - pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(1); -} - -#[derive(Debug, Clone)] -pub struct SafeKeeperConf { - // Repository directory, relative to current working directory. - // Normally, the safekeeper changes the current working directory - // to the repository, and 'workdir' is always '.'. 
But we don't do - // that during unit testing, because the current directory is global - // to the process but different unit tests work on different - // data directories to avoid clashing with each other. - pub workdir: PathBuf, - - pub daemonize: bool, - pub no_sync: bool, - pub listen_pg_addr: String, - pub listen_http_addr: String, - pub ttl: Option, - pub recall_period: Duration, -} - -impl SafeKeeperConf { - pub fn timeline_dir(&self, zttid: &ZTenantTimelineId) -> PathBuf { - self.workdir - .join(zttid.tenant_id.to_string()) - .join(zttid.timeline_id.to_string()) - } -} - -impl Default for SafeKeeperConf { - fn default() -> Self { - SafeKeeperConf { - // Always set to './'. We will chdir into the directory specified on the - // command line, so that when the server is running, all paths are relative - // to that. - workdir: PathBuf::from("./"), - daemonize: false, - no_sync: false, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - ttl: None, - recall_period: defaults::DEFAULT_RECALL_PERIOD, - } - } -} diff --git a/walkeeper/src/receive_wal.rs b/walkeeper/src/receive_wal.rs deleted file mode 100644 index b9420714fc..0000000000 --- a/walkeeper/src/receive_wal.rs +++ /dev/null @@ -1,128 +0,0 @@ -//! Safekeeper communication endpoint to WAL proposer (compute node). -//! Gets messages from the network, passes them down to consensus module and -//! sends replies back. - -use anyhow::{bail, Context, Result}; -use bytes::Bytes; -use bytes::BytesMut; -use tokio::sync::mpsc::UnboundedSender; -use tracing::*; - -use crate::timeline::Timeline; -use std::net::SocketAddr; -use std::sync::Arc; - -use crate::safekeeper::AcceptorProposerMessage; -use crate::safekeeper::ProposerAcceptorMessage; - -use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::TimelineTools; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeMessage}; - -use crate::callmemaybe::CallmeEvent; - -pub struct ReceiveWalConn<'pg> { - /// Postgres connection - pg_backend: &'pg mut PostgresBackend, - /// The cached result of `pg_backend.socket().peer_addr()` (roughly) - peer_addr: SocketAddr, - /// Pageserver connection string forwarded from compute - /// NOTE that it is allowed to operate without a pageserver. - /// So if compute has no pageserver configured do not use it. - pageserver_connstr: Option, -} - -impl<'pg> ReceiveWalConn<'pg> { - pub fn new( - pg: &'pg mut PostgresBackend, - pageserver_connstr: Option, - ) -> ReceiveWalConn<'pg> { - let peer_addr = *pg.get_peer_addr(); - ReceiveWalConn { - pg_backend: pg, - peer_addr, - pageserver_connstr, - } - } - - // Read and extract the bytes of a `CopyData` message from the postgres instance - fn read_msg_bytes(&mut self) -> Result { - match self.pg_backend.read_message()? 
{ - Some(FeMessage::CopyData(bytes)) => Ok(bytes), - Some(msg) => bail!("expected `CopyData` message, found {:?}", msg), - None => bail!("connection closed unexpectedly"), - } - } - - // Read and parse message sent from the postgres instance - fn read_msg(&mut self) -> Result { - let data = self.read_msg_bytes()?; - ProposerAcceptorMessage::parse(data) - } - - // Send message to the postgres - fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> { - let mut buf = BytesMut::with_capacity(128); - msg.serialize(&mut buf)?; - self.pg_backend.write_message(&BeMessage::CopyData(&buf))?; - Ok(()) - } - - /// Receive WAL from wal_proposer - pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.ztimelineid.unwrap()).entered(); - - // Notify the libpq client that it's allowed to send `CopyData` messages - self.pg_backend - .write_message(&BeMessage::CopyBothResponse)?; - - // Receive information about server - let mut msg = self - .read_msg() - .context("failed to receive proposer greeting")?; - match msg { - ProposerAcceptorMessage::Greeting(ref greeting) => { - info!( - "start handshake with wal proposer {} sysid {} timeline {}", - self.peer_addr, greeting.system_id, greeting.tli, - ); - } - _ => bail!("unexpected message {:?} instead of greeting", msg), - } - - // Register the connection and defer unregister. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref(), &spg.tx)?; - let _guard = ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), - callmemaybe_tx: spg.tx.clone(), - }; - - loop { - let reply = spg - .timeline - .get() - .process_msg(&msg) - .context("failed to process ProposerAcceptorMessage")?; - if let Some(reply) = reply { - self.write_msg(&reply)?; - } - msg = self.read_msg()?; - } - } -} - -struct ComputeConnectionGuard { - timeline: Arc, - callmemaybe_tx: UnboundedSender, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - self.timeline - .on_compute_disconnect(&self.callmemaybe_tx) - .unwrap(); - } -} diff --git a/walkeeper/src/s3_offload.rs b/walkeeper/src/s3_offload.rs deleted file mode 100644 index 2b3285e6c6..0000000000 --- a/walkeeper/src/s3_offload.rs +++ /dev/null @@ -1,104 +0,0 @@ -// -// Offload old WAL segments to S3 and remove them locally -// - -use anyhow::Result; -use postgres_ffi::xlog_utils::*; -use s3::bucket::Bucket; -use s3::creds::Credentials; -use s3::region::Region; -use std::collections::HashSet; -use std::env; -use std::fs::{self, File}; -use std::io::prelude::*; -use std::path::Path; -use std::time::SystemTime; -use tokio::runtime; -use tokio::time::sleep; -use tracing::*; -use walkdir::WalkDir; - -use crate::SafeKeeperConf; - -pub fn thread_main(conf: SafeKeeperConf) { - // Create a new thread pool - // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. 
- //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - info!("Starting S3 offload task"); - - runtime.block_on(async { - main_loop(&conf).await.unwrap(); - }); -} - -async fn offload_files( - bucket: &Bucket, - listing: &HashSet, - dir_path: &Path, - conf: &SafeKeeperConf, -) -> Result { - let horizon = SystemTime::now() - conf.ttl.unwrap(); - let mut n: u64 = 0; - for entry in WalkDir::new(dir_path) { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && IsXLogFileName(entry.file_name().to_str().unwrap()) - && entry.metadata().unwrap().created().unwrap() <= horizon - { - let relpath = path.strip_prefix(&conf.workdir).unwrap(); - let s3path = String::from("walarchive/") + relpath.to_str().unwrap(); - if !listing.contains(&s3path) { - let mut file = File::open(&path)?; - let mut content = Vec::new(); - file.read_to_end(&mut content)?; - bucket.put_object(s3path, &content).await?; - - fs::remove_file(&path)?; - n += 1; - } - } - } - Ok(n) -} - -async fn main_loop(conf: &SafeKeeperConf) -> Result<()> { - let region = Region::Custom { - region: env::var("S3_REGION").unwrap(), - endpoint: env::var("S3_ENDPOINT").unwrap(), - }; - let credentials = Credentials::new( - Some(&env::var("S3_ACCESSKEY").unwrap()), - Some(&env::var("S3_SECRET").unwrap()), - None, - None, - None, - ) - .unwrap(); - - // Create Bucket in REGION for BUCKET - let bucket = Bucket::new_with_path_style("zenith-testbucket", region, credentials)?; - - loop { - // List out contents of directory - let results = bucket - .list("walarchive/".to_string(), Some("".to_string())) - .await?; - let listing = results - .iter() - .flat_map(|b| b.contents.iter().map(|o| o.key.clone())) - .collect(); - - let n = offload_files(&bucket, &listing, &conf.workdir, conf).await?; - info!("Offload {} files to S3", n); - sleep(conf.ttl.unwrap()).await; - } -} diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs deleted file mode 100644 index 981a0f4d57..0000000000 --- a/walkeeper/src/safekeeper.rs +++ /dev/null @@ -1,826 +0,0 @@ -//! Acceptor part of proposer-acceptor consensus algorithm. - -use anyhow::{bail, Context, Result}; -use byteorder::{LittleEndian, ReadBytesExt}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; - -use postgres_ffi::xlog_utils::TimeLineID; -use serde::{Deserialize, Serialize}; -use std::cmp::min; -use std::fmt; -use std::io::Read; -use tracing::*; - -use lazy_static::lazy_static; - -use crate::control_file; -use crate::send_wal::HotStandbyFeedback; -use crate::wal_storage; -use postgres_ffi::xlog_utils::MAX_SEND_SIZE; -use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec}; -use zenith_utils::bin_ser::LeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::pq_proto::SystemId; -use zenith_utils::pq_proto::ZenithFeedback; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 3; -const SK_PROTOCOL_VERSION: u32 = 1; -const UNKNOWN_SERVER_VERSION: u32 = 0; - -/// Consensus logical timestamp. 
-pub type Term = u64; - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct TermSwitchEntry { - pub term: Term, - pub lsn: Lsn, -} -#[derive(Clone, Serialize, Deserialize)] -pub struct TermHistory(pub Vec); - -impl TermHistory { - pub fn empty() -> TermHistory { - TermHistory(Vec::new()) - } - - // Parse TermHistory as n_entries followed by TermSwitchEntry pairs - pub fn from_bytes(mut bytes: Bytes) -> Result { - if bytes.remaining() < 4 { - bail!("TermHistory misses len"); - } - let n_entries = bytes.get_u32_le(); - let mut res = Vec::with_capacity(n_entries as usize); - for _ in 0..n_entries { - if bytes.remaining() < 16 { - bail!("TermHistory is incomplete"); - } - res.push(TermSwitchEntry { - term: bytes.get_u64_le(), - lsn: bytes.get_u64_le().into(), - }) - } - Ok(TermHistory(res)) - } - - /// Return copy of self with switches happening strictly after up_to - /// truncated. - pub fn up_to(&self, up_to: Lsn) -> TermHistory { - let mut res = Vec::with_capacity(self.0.len()); - for e in &self.0 { - if e.lsn > up_to { - break; - } - res.push(*e); - } - TermHistory(res) - } -} - -/// Display only latest entries for Debug. -impl fmt::Debug for TermHistory { - fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { - let n_printed = 20; - write!( - fmt, - "{}{:?}", - if self.0.len() > n_printed { "... " } else { "" }, - self.0 - .iter() - .rev() - .take(n_printed) - .map(|&e| (e.term, e.lsn)) // omit TermSwitchEntry - .collect::>() - ) - } -} - -/// Unique id of proposer. Not needed for correctness, used for monitoring. -pub type PgUuid = [u8; 16]; - -/// Persistent consensus state of the acceptor. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AcceptorState { - /// acceptor's last term it voted for (advanced in 1 phase) - pub term: Term, - /// History of term switches for safekeeper's WAL. - /// Actually it often goes *beyond* WAL contents as we adopt term history - /// from the proposer before recovery. - pub term_history: TermHistory, -} - -impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { - let th = self.term_history.up_to(flush_lsn); - match th.0.last() { - Some(e) => e.term, - None => 0, - } - } -} - -/// Information about Postgres. Safekeeper gets it once and then verifies -/// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ServerInfo { - /// Postgres server version - pub pg_version: u32, - pub system_id: SystemId, - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, - pub wal_seg_size: u32, -} - -/// Persistent information stored on safekeeper node -/// On disk data is prefixed by magic and format version and followed by checksum. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SafeKeeperState { - /// persistent acceptor state - pub acceptor_state: AcceptorState, - /// information about server - pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed - /// for correctness, exists for monitoring purposes. 
- #[serde(with = "hex")] - pub proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally - pub commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - pub truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. - pub wal_start_lsn: Lsn, -} - -impl SafeKeeperState { - pub fn new() -> SafeKeeperState { - SafeKeeperState { - acceptor_state: AcceptorState { - term: 0, - term_history: TermHistory::empty(), - }, - server: ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - tenant_id: ZTenantId::from([0u8; 16]), - timeline_id: ZTimelineId::from([0u8; 16]), - wal_seg_size: 0, - }, - proposer_uuid: [0; 16], - commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */ - truncate_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */ - wal_start_lsn: Lsn(0), - } - } -} - -impl Default for SafeKeeperState { - fn default() -> Self { - Self::new() - } -} - -// protocol messages - -/// Initial Proposer -> Acceptor message -#[derive(Debug, Deserialize)] -pub struct ProposerGreeting { - /// proposer-acceptor protocol version - pub protocol_version: u32, - /// Postgres server version - pub pg_version: u32, - pub proposer_id: PgUuid, - pub system_id: SystemId, - /// Zenith timelineid - pub ztli: ZTimelineId, - pub tenant_id: ZTenantId, - pub tli: TimeLineID, - pub wal_seg_size: u32, -} - -/// Acceptor -> Proposer initial response: the highest term known to me -/// (acceptor voted for). -#[derive(Debug, Serialize)] -pub struct AcceptorGreeting { - term: u64, -} - -/// Vote request sent from proposer to safekeepers -#[derive(Debug, Deserialize)] -pub struct VoteRequest { - term: Term, -} - -/// Vote itself, sent from safekeeper to proposer -#[derive(Debug, Serialize)] -pub struct VoteResponse { - term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - vote_given: u64, // fixme u64 due to padding - // Safekeeper flush_lsn (end of WAL) + history of term switches allow - // proposer to choose the most advanced one. - flush_lsn: Lsn, - truncate_lsn: Lsn, - term_history: TermHistory, -} - -/* - * Proposer -> Acceptor message announcing proposer is elected and communicating - * term history to it. - */ -#[derive(Debug)] -pub struct ProposerElected { - pub term: Term, - pub start_streaming_at: Lsn, - pub term_history: TermHistory, -} - -/// Request with WAL message sent from proposer to safekeeper. Along the way it -/// communicates commit_lsn. -#[derive(Debug)] -pub struct AppendRequest { - pub h: AppendRequestHeader, - pub wal_data: Bytes, -} -#[derive(Debug, Clone, Deserialize)] -pub struct AppendRequestHeader { - // safekeeper's current term; if it is higher than proposer's, the compute is out of date. - pub term: Term, - // LSN since the proposer appends WAL; determines epoch switch point. 
- pub epoch_start_lsn: Lsn, - /// start position of message in WAL - pub begin_lsn: Lsn, - /// end position of message in WAL - pub end_lsn: Lsn, - /// LSN committed by quorum of safekeepers - pub commit_lsn: Lsn, - /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper - pub truncate_lsn: Lsn, - // only for logging/debugging - pub proposer_uuid: PgUuid, -} - -/// Report safekeeper state to proposer -#[derive(Debug, PartialEq, Serialize, Deserialize)] -pub struct AppendResponse { - // Current term of the safekeeper; if it is higher than proposer's, the - // compute is out of date. - pub term: Term, - // NOTE: this is physical end of wal on safekeeper; currently it doesn't - // make much sense without taking epoch into account, as history can be - // diverged. - pub flush_lsn: Lsn, - // We report back our awareness about which WAL is committed, as this is - // a criterion for walproposer --sync mode exit - pub commit_lsn: Lsn, - pub hs_feedback: HotStandbyFeedback, - pub zenith_feedback: ZenithFeedback, -} - -impl AppendResponse { - fn term_only(term: Term) -> AppendResponse { - AppendResponse { - term, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), - } - } -} - -/// Proposer -> Acceptor messages -#[derive(Debug)] -pub enum ProposerAcceptorMessage { - Greeting(ProposerGreeting), - VoteRequest(VoteRequest), - Elected(ProposerElected), - AppendRequest(AppendRequest), -} - -impl ProposerAcceptorMessage { - /// Parse proposer message. - pub fn parse(msg_bytes: Bytes) -> Result { - // xxx using Reader is inefficient but easy to work with bincode - let mut stream = msg_bytes.reader(); - // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is - let tag = stream.read_u64::()? as u8 as char; - match tag { - 'g' => { - let msg = ProposerGreeting::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::Greeting(msg)) - } - 'v' => { - let msg = VoteRequest::des_from(&mut stream)?; - Ok(ProposerAcceptorMessage::VoteRequest(msg)) - } - 'e' => { - let mut msg_bytes = stream.into_inner(); - if msg_bytes.remaining() < 16 { - bail!("ProposerElected message is not complete"); - } - let term = msg_bytes.get_u64_le(); - let start_streaming_at = msg_bytes.get_u64_le().into(); - let term_history = TermHistory::from_bytes(msg_bytes)?; - let msg = ProposerElected { - term, - start_streaming_at, - term_history, - }; - Ok(ProposerAcceptorMessage::Elected(msg)) - } - 'a' => { - // read header followed by wal data - let hdr = AppendRequestHeader::des_from(&mut stream)?; - let rec_size = hdr - .end_lsn - .checked_sub(hdr.begin_lsn) - .context("begin_lsn > end_lsn in AppendRequest")? - .0 as usize; - if rec_size > MAX_SEND_SIZE { - bail!( - "AppendRequest is longer than MAX_SEND_SIZE ({})", - MAX_SEND_SIZE - ); - } - - let mut wal_data_vec: Vec = vec![0; rec_size]; - stream.read_exact(&mut wal_data_vec)?; - let wal_data = Bytes::from(wal_data_vec); - let msg = AppendRequest { h: hdr, wal_data }; - - Ok(ProposerAcceptorMessage::AppendRequest(msg)) - } - _ => bail!("unknown proposer-acceptor message tag: {}", tag,), - } - } -} - -/// Acceptor -> Proposer messages -#[derive(Debug)] -pub enum AcceptorProposerMessage { - Greeting(AcceptorGreeting), - VoteResponse(VoteResponse), - AppendResponse(AppendResponse), -} - -impl AcceptorProposerMessage { - /// Serialize acceptor -> proposer message. 
- pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - match self { - AcceptorProposerMessage::Greeting(msg) => { - buf.put_u64_le('g' as u64); - buf.put_u64_le(msg.term); - } - AcceptorProposerMessage::VoteResponse(msg) => { - buf.put_u64_le('v' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.vote_given); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.truncate_lsn.into()); - buf.put_u32_le(msg.term_history.0.len() as u32); - for e in &msg.term_history.0 { - buf.put_u64_le(e.term); - buf.put_u64_le(e.lsn.into()); - } - } - AcceptorProposerMessage::AppendResponse(msg) => { - buf.put_u64_le('a' as u64); - buf.put_u64_le(msg.term); - buf.put_u64_le(msg.flush_lsn.into()); - buf.put_u64_le(msg.commit_lsn.into()); - buf.put_i64_le(msg.hs_feedback.ts); - buf.put_u64_le(msg.hs_feedback.xmin); - buf.put_u64_le(msg.hs_feedback.catalog_xmin); - - msg.zenith_feedback.serialize(buf)? - } - } - - Ok(()) - } -} - -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. - static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_commit_lsn", - "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_commit_lsn gauge vec"); -} - -struct SafeKeeperMetrics { - commit_lsn: Gauge, -} - -impl SafeKeeperMetrics { - fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, commit_lsn: Lsn) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - let m = Self { - commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - }; - m.commit_lsn.set(u64::from(commit_lsn) as f64); - m - } -} - -/// SafeKeeper which consumes events (messages from compute) and provides -/// replies. -pub struct SafeKeeper { - // Cached metrics so we don't have to recompute labels on each update. - metrics: SafeKeeperMetrics, - - /// not-yet-flushed pairs of same named fields in s.* - pub commit_lsn: Lsn, - pub truncate_lsn: Lsn, - pub s: SafeKeeperState, // persistent part - - pub control_store: CTRL, - pub wal_store: WAL, -} - -impl SafeKeeper -where - CTRL: control_file::Storage, - WAL: wal_storage::Storage, -{ - // constructor - pub fn new( - ztli: ZTimelineId, - control_store: CTRL, - wal_store: WAL, - state: SafeKeeperState, - ) -> SafeKeeper { - if state.server.timeline_id != ZTimelineId::from([0u8; 16]) - && ztli != state.server.timeline_id - { - panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.server.timeline_id); - } - - SafeKeeper { - metrics: SafeKeeperMetrics::new(state.server.tenant_id, ztli, state.commit_lsn), - commit_lsn: state.commit_lsn, - truncate_lsn: state.truncate_lsn, - s: state, - control_store, - wal_store, - } - } - - /// Get history of term switches for the available WAL - fn get_term_history(&self) -> TermHistory { - self.s - .acceptor_state - .term_history - .up_to(self.wal_store.flush_lsn()) - } - - #[cfg(test)] - fn get_epoch(&self) -> Term { - self.s.acceptor_state.get_epoch(self.wal_store.flush_lsn()) - } - - /// Process message from proposer and possibly form reply. Concurrent - /// callers must exclude each other. 
- pub fn process_msg( - &mut self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - match msg { - ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg), - ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg), - ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg), - ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg), - } - } - - /// Handle initial message from proposer: check its sanity and send my - /// current term. - fn handle_greeting( - &mut self, - msg: &ProposerGreeting, - ) -> Result> { - /* Check protocol compatibility */ - if msg.protocol_version != SK_PROTOCOL_VERSION { - bail!( - "incompatible protocol version {}, expected {}", - msg.protocol_version, - SK_PROTOCOL_VERSION - ); - } - /* Postgres upgrade is not treated as fatal error */ - if msg.pg_version != self.s.server.pg_version - && self.s.server.pg_version != UNKNOWN_SERVER_VERSION - { - info!( - "incompatible server version {}, expected {}", - msg.pg_version, self.s.server.pg_version - ); - } - - // set basic info about server, if not yet - // TODO: verify that is doesn't change after - self.s.server.system_id = msg.system_id; - self.s.server.tenant_id = msg.tenant_id; - self.s.server.timeline_id = msg.ztli; - self.s.server.wal_seg_size = msg.wal_seg_size; - self.control_store - .persist(&self.s) - .context("failed to persist shared state")?; - - // pass wal_seg_size to read WAL and find flush_lsn - self.wal_store.init_storage(&self.s)?; - - // update tenant_id/timeline_id in metrics - self.metrics = SafeKeeperMetrics::new(msg.tenant_id, msg.ztli, self.commit_lsn); - - info!( - "processed greeting from proposer {:?}, sending term {:?}", - msg.proposer_id, self.s.acceptor_state.term - ); - Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { - term: self.s.acceptor_state.term, - }))) - } - - /// Give vote for the given term, if we haven't done that previously. - fn handle_vote_request( - &mut self, - msg: &VoteRequest, - ) -> Result> { - // initialize with refusal - let mut resp = VoteResponse { - term: self.s.acceptor_state.term, - vote_given: false as u64, - flush_lsn: self.wal_store.flush_lsn(), - truncate_lsn: self.s.truncate_lsn, - term_history: self.get_term_history(), - }; - if self.s.acceptor_state.term < msg.term { - self.s.acceptor_state.term = msg.term; - // persist vote before sending it out - self.control_store.persist(&self.s)?; - resp.term = self.s.acceptor_state.term; - resp.vote_given = true as u64; - } - info!("processed VoteRequest for term {}: {:?}", msg.term, &resp); - Ok(Some(AcceptorProposerMessage::VoteResponse(resp))) - } - - /// Bump our term if received a note from elected proposer with higher one - fn bump_if_higher(&mut self, term: Term) -> Result<()> { - if self.s.acceptor_state.term < term { - self.s.acceptor_state.term = term; - self.control_store.persist(&self.s)?; - } - Ok(()) - } - - /// Form AppendResponse from current state. 
- fn append_response(&self) -> AppendResponse { - AppendResponse { - term: self.s.acceptor_state.term, - flush_lsn: self.wal_store.flush_lsn(), - commit_lsn: self.s.commit_lsn, - // will be filled by the upper code to avoid bothering safekeeper - hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), - } - } - - fn handle_elected(&mut self, msg: &ProposerElected) -> Result> { - info!("received ProposerElected {:?}", msg); - self.bump_if_higher(msg.term)?; - // If our term is higher, ignore the message (next feedback will inform the compute) - if self.s.acceptor_state.term > msg.term { - return Ok(None); - } - - // truncate wal, update the lsns - self.wal_store.truncate_wal(msg.start_streaming_at)?; - - // and now adopt term history from proposer - self.s.acceptor_state.term_history = msg.term_history.clone(); - self.control_store.persist(&self.s)?; - - info!("start receiving WAL since {:?}", msg.start_streaming_at); - - Ok(None) - } - - /// Handle request to append WAL. - #[allow(clippy::comparison_chain)] - fn handle_append_request( - &mut self, - msg: &AppendRequest, - ) -> Result> { - if self.s.acceptor_state.term < msg.h.term { - bail!("got AppendRequest before ProposerElected"); - } - - // If our term is higher, immediately refuse the message. - if self.s.acceptor_state.term > msg.h.term { - let resp = AppendResponse::term_only(self.s.acceptor_state.term); - return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); - } - - // After ProposerElected, which performs truncation, we should get only - // indeed append requests (but flush_lsn is advanced only on record - // boundary, so might be less). - assert!(self.wal_store.flush_lsn() <= msg.h.begin_lsn); - - self.s.proposer_uuid = msg.h.proposer_uuid; - let mut sync_control_file = false; - - // do the job - if !msg.wal_data.is_empty() { - self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - - // If this was the first record we ever receieved, remember LSN to help - // find_end_of_wal skip the hole in the beginning. - if self.s.wal_start_lsn == Lsn(0) { - self.s.wal_start_lsn = msg.h.begin_lsn; - sync_control_file = true; - } - } - - // Advance commit_lsn taking into account what we have locally. - // commit_lsn can be 0, being unknown to new walproposer while he hasn't - // collected majority of its epoch acks yet, ignore it in this case. - if msg.h.commit_lsn != Lsn(0) { - let commit_lsn = min(msg.h.commit_lsn, self.wal_store.flush_lsn()); - // If new commit_lsn reached epoch switch, force sync of control - // file: walproposer in sync mode is very interested when this - // happens. Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - sync_control_file |= commit_lsn == msg.h.epoch_start_lsn; - self.commit_lsn = commit_lsn; - self.metrics - .commit_lsn - .set(u64::from(self.commit_lsn) as f64); - } - - self.truncate_lsn = msg.h.truncate_lsn; - /* - * Update truncate and commit LSN in control file. - * To avoid negative impact on performance of extra fsync, do it only - * when truncate_lsn delta exceeds WAL segment size. 
- */ - sync_control_file |= - self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn; - if sync_control_file { - self.s.commit_lsn = self.commit_lsn; - self.s.truncate_lsn = self.truncate_lsn; - } - - if sync_control_file { - self.control_store.persist(&self.s)?; - } - - let resp = self.append_response(); - trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}", - msg.wal_data.len(), - msg.h.end_lsn, - msg.h.commit_lsn, - msg.h.truncate_lsn, - &resp, - ); - Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::wal_storage::Storage; - - // fake storage for tests - struct InMemoryState { - persisted_state: SafeKeeperState, - } - - impl control_file::Storage for InMemoryState { - fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { - self.persisted_state = s.clone(); - Ok(()) - } - } - - struct DummyWalStore { - lsn: Lsn, - } - - impl wal_storage::Storage for DummyWalStore { - fn flush_lsn(&self) -> Lsn { - self.lsn - } - - fn init_storage(&mut self, _state: &SafeKeeperState) -> Result<()> { - Ok(()) - } - - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { - self.lsn = startpos + buf.len() as u64; - Ok(()) - } - - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - self.lsn = end_pos; - Ok(()) - } - } - - #[test] - fn test_voting() { - let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), - }; - let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); - - // check voting for 1 is ok - let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); - let mut vote_resp = sk.process_msg(&vote_request); - match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0), - r => panic!("unexpected response: {:?}", r), - } - - // reboot... 
- let state = sk.control_store.persisted_state.clone(); - let storage = InMemoryState { - persisted_state: state.clone(), - }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, state); - - // and ensure voting second time for 1 is not ok - vote_resp = sk.process_msg(&vote_request); - match vote_resp.unwrap() { - Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0), - r => panic!("unexpected response: {:?}", r), - } - } - - #[test] - fn test_epoch_switch() { - let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), - }; - let wal_store = DummyWalStore { lsn: Lsn(0) }; - let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); - - let mut ar_hdr = AppendRequestHeader { - term: 1, - epoch_start_lsn: Lsn(3), - begin_lsn: Lsn(1), - end_lsn: Lsn(2), - commit_lsn: Lsn(0), - truncate_lsn: Lsn(0), - proposer_uuid: [0; 16], - }; - let mut append_request = AppendRequest { - h: ar_hdr.clone(), - wal_data: Bytes::from_static(b"b"), - }; - - let pem = ProposerElected { - term: 1, - start_streaming_at: Lsn(1), - term_history: TermHistory(vec![TermSwitchEntry { - term: 1, - lsn: Lsn(3), - }]), - }; - sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) - .unwrap(); - - // check that AppendRequest before epochStartLsn doesn't switch epoch - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); - assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); - - // but record at epochStartLsn does the switch - ar_hdr.begin_lsn = Lsn(2); - ar_hdr.end_lsn = Lsn(3); - append_request = AppendRequest { - h: ar_hdr, - wal_data: Bytes::from_static(b"b"), - }; - let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)); - assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_epoch(), 1); - } -} diff --git a/walkeeper/src/send_wal.rs b/walkeeper/src/send_wal.rs deleted file mode 100644 index 1febd71842..0000000000 --- a/walkeeper/src/send_wal.rs +++ /dev/null @@ -1,357 +0,0 @@ -//! This module implements the streaming side of replication protocol, starting -//! with the "START_REPLICATION" message. 
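The epoch-switch test above hinges on one rule from the removed safekeeper.rs: an acceptor's epoch is the term of the last term-history entry whose switch LSN has already been flushed. A minimal standalone sketch of that rule, assuming only the TermSwitchEntry shape shown above and using plain u64 values in place of the Lsn newtype, might look like this:

// Sketch only: restates AcceptorState::get_epoch / TermHistory::up_to in isolation.
struct TermSwitchEntry {
    term: u64,
    lsn: u64,
}

fn epoch(term_history: &[TermSwitchEntry], flush_lsn: u64) -> u64 {
    term_history
        .iter()
        .take_while(|e| e.lsn <= flush_lsn) // keep only switches at or before flush_lsn
        .last()
        .map(|e| e.term)
        .unwrap_or(0) // empty history => epoch 0
}

fn main() {
    // Term 1 starts at LSN 3; with WAL flushed only to LSN 2 we are still in epoch 0.
    let history = [TermSwitchEntry { term: 1, lsn: 3 }];
    assert_eq!(epoch(&history, 2), 0);
    // Once the record ending at LSN 3 is flushed, the epoch switches to 1.
    assert_eq!(epoch(&history, 3), 1);
}

This mirrors the assertions in test_epoch_switch: a flush_lsn below epoch_start_lsn keeps the old epoch, and reaching it performs the switch.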
- -use crate::handler::SafekeeperPostgresHandler; -use crate::timeline::{ReplicaState, Timeline, TimelineTools}; -use crate::wal_storage::WalReader; -use anyhow::{bail, Context, Result}; - -use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; - -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; -use bytes::Bytes; -use serde::{Deserialize, Serialize}; -use std::cmp::min; -use std::net::Shutdown; -use std::sync::Arc; -use std::thread::sleep; -use std::time::Duration; -use std::{str, thread}; -use tokio::sync::mpsc::UnboundedSender; -use tracing::*; -use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::Lsn; -use zenith_utils::postgres_backend::PostgresBackend; -use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}; -use zenith_utils::sock_split::ReadStream; - -use zenith_utils::zid::{ZTenantId, ZTimelineId}; - -// See: https://www.postgresql.org/docs/13/protocol-replication.html -const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; -const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; -// zenith extension of replication protocol -const ZENITH_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; - -type FullTransactionId = u64; - -/// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -pub struct HotStandbyFeedback { - pub ts: TimestampTz, - pub xmin: FullTransactionId, - pub catalog_xmin: FullTransactionId, -} - -impl HotStandbyFeedback { - pub fn empty() -> HotStandbyFeedback { - HotStandbyFeedback { - ts: 0, - xmin: 0, - catalog_xmin: 0, - } - } -} - -/// Standby status update -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StandbyReply { - pub write_lsn: Lsn, // last lsn received by pageserver - pub flush_lsn: Lsn, // pageserver's disk consistent lSN - pub apply_lsn: Lsn, // pageserver's remote consistent lSN - pub reply_ts: TimestampTz, - pub reply_requested: bool, -} - -/// A network connection that's speaking the replication protocol. -pub struct ReplicationConn { - /// This is an `Option` because we will spawn a background thread that will - /// `take` it from us. - stream_in: Option, -} - -/// Scope guard to unregister replication connection from timeline -struct ReplicationConnGuard { - replica: usize, // replica internal ID assigned by timeline - timeline: Arc, -} - -impl Drop for ReplicationConnGuard { - fn drop(&mut self) { - self.timeline.remove_replica(self.replica); - } -} - -// XXX: Naming is a bit messy here. -// This ReplicationStreamGuard lives as long as ReplicationConn -// and current ReplicationConnGuard is tied to the background thread -// that receives feedback. -struct ReplicationStreamGuard { - tx: UnboundedSender, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl Drop for ReplicationStreamGuard { - fn drop(&mut self) { - // the connection with pageserver is lost, - // resume callback subscription - debug!( - "Connection to pageserver is gone. Resume callmemaybe subsciption if necessary. 
tenantid {} timelineid {}", - self.tenant_id, self.timeline_id, - ); - - let subscription_key = SubscriptionStateKey::new( - self.tenant_id, - self.timeline_id, - self.pageserver_connstr.to_owned(), - ); - - self.tx - .send(CallmeEvent::Resume(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Resume request to callmemaybe thread {}", e); - }); - } -} - -impl ReplicationConn { - /// Create a new `ReplicationConn` - pub fn new(pgb: &mut PostgresBackend) -> Self { - Self { - stream_in: pgb.take_stream_in(), - } - } - - /// Handle incoming messages from the network. - /// This is spawned into the background by `handle_start_replication`. - fn background_thread( - mut stream_in: ReadStream, - replica_guard: Arc, - ) -> Result<()> { - let replica_id = replica_guard.replica; - let timeline = &replica_guard.timeline; - - let mut state = ReplicaState::new(); - // Wait for replica's feedback. - while let Some(msg) = FeMessage::read(&mut stream_in)? { - match &msg { - FeMessage::CopyData(m) => { - // There's three possible data messages that the client is supposed to send here: - // `HotStandbyFeedback` and `StandbyStatusUpdate` and `ZenithStandbyFeedback`. - - match m.first().cloned() { - Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { - // Note: deserializing is on m[1..] because we skip the tag byte. - state.hs_feedback = HotStandbyFeedback::des(&m[1..]) - .context("failed to deserialize HotStandbyFeedback")?; - timeline.update_replica_state(replica_id, state); - } - Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => { - let _reply = StandbyReply::des(&m[1..]) - .context("failed to deserialize StandbyReply")?; - // This must be a regular postgres replica, - // because pageserver doesn't send this type of messages to safekeeper. - // Currently this is not implemented, so this message is ignored. - - warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet."); - // timeline.update_replica_state(replica_id, Some(state)); - } - Some(ZENITH_STATUS_UPDATE_TAG_BYTE) => { - // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. - let buf = Bytes::copy_from_slice(&m[9..]); - let reply = ZenithFeedback::parse(buf); - - trace!("ZenithFeedback is {:?}", reply); - // Only pageserver sends ZenithFeedback, so set the flag. - // This replica is the source of information to resend to compute. - state.zenith_feedback = Some(reply); - - timeline.update_replica_state(replica_id, state); - } - _ => warn!("unexpected message {:?}", msg), - } - } - FeMessage::Sync => {} - FeMessage::CopyFail => { - // Shutdown the connection, because rust-postgres client cannot be dropped - // when connection is alive. - let _ = stream_in.shutdown(Shutdown::Both); - bail!("Copy failed"); - } - _ => { - // We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored. - info!("unexpected message {:?}", msg); - } - } - } - - Ok(()) - } - - /// - /// Handle START_REPLICATION replication command - /// - pub fn run( - &mut self, - spg: &mut SafekeeperPostgresHandler, - pgb: &mut PostgresBackend, - mut start_pos: Lsn, - pageserver_connstr: Option, - ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap(), pageserver_connstr = %pageserver_connstr.as_deref().unwrap_or_default()).entered(); - - // spawn the background thread which receives HotStandbyFeedback messages. 
- let bg_timeline = Arc::clone(spg.timeline.get()); - let bg_stream_in = self.stream_in.take().unwrap(); - - let state = ReplicaState::new(); - // This replica_id is used below to check if it's time to stop replication. - let replica_id = bg_timeline.add_replica(state); - - // Use a guard object to remove our entry from the timeline, when the background - // thread and us have both finished using it. - let replica_guard = Arc::new(ReplicationConnGuard { - replica: replica_id, - timeline: bg_timeline, - }); - let bg_replica_guard = Arc::clone(&replica_guard); - - // TODO: here we got two threads, one for writing WAL and one for receiving - // feedback. If one of them fails, we should shutdown the other one too. - let _ = thread::Builder::new() - .name("HotStandbyFeedback thread".into()) - .spawn(move || { - if let Err(err) = Self::background_thread(bg_stream_in, bg_replica_guard) { - error!("Replication background thread failed: {}", err); - } - })?; - - let mut wal_seg_size: usize; - loop { - wal_seg_size = spg.timeline.get().get_info().server.wal_seg_size as usize; - if wal_seg_size == 0 { - error!("Cannot start replication before connecting to wal_proposer"); - sleep(Duration::from_secs(1)); - } else { - break; - } - } - let wal_end = spg.timeline.get().get_end_of_wal(); - // Walproposer gets special handling: safekeeper must give proposer all - // local WAL till the end, whether committed or not (walproposer will - // hang otherwise). That's because walproposer runs the consensus and - // synchronizes safekeepers on the most advanced one. - // - // There is a small risk of this WAL getting concurrently garbaged if - // another compute rises which collects majority and starts fixing log - // on this safekeeper itself. That's ok as (old) proposer will never be - // able to commit such WAL. - let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) { - Some(wal_end) - } else { - None - }; - info!("Start replication from {:?} till {:?}", start_pos, stop_pos); - - // Don't spam pageserver with callmemaybe queries - // when replication connection with pageserver is already established. 
- let _guard = { - if spg.appname == Some("wal_proposer_recovery".to_string()) { - None - } else { - let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); - let zttid = spg.timeline.get().zttid; - let tx_clone = spg.tx.clone(); - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.clone(), - ); - spg.tx - .send(CallmeEvent::Pause(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); - - // create a guard to subscribe callback again, when this connection will exit - Some(ReplicationStreamGuard { - tx: tx_clone, - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, - pageserver_connstr, - }) - } - }; - - // switch to copy - pgb.write_message(&BeMessage::CopyBothResponse)?; - - let mut end_pos = Lsn(0); - - let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), - wal_seg_size, - start_pos, - ); - - // buffer for wal sending, limited by MAX_SEND_SIZE - let mut send_buf = vec![0u8; MAX_SEND_SIZE]; - - loop { - if let Some(stop_pos) = stop_pos { - if start_pos >= stop_pos { - break; /* recovery finished */ - } - end_pos = stop_pos; - } else { - /* Wait until we have some data to stream */ - let lsn = spg.timeline.get().wait_for_lsn(start_pos); - - if let Some(lsn) = lsn { - end_pos = lsn; - } else { - // TODO: also check once in a while whether we are walsender - // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id, &spg.tx)? { - // Shut down, timeline is suspended. - // TODO create proper error type for this - bail!("end streaming to {:?}", spg.appname); - } - - // timeout expired: request pageserver status - pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - })) - .context("Failed to send KeepAlive message")?; - continue; - } - } - - let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; - let send_size = min(send_size, send_buf.len()); - - let send_buf = &mut send_buf[..send_size]; - - // read wal into buffer - let send_size = wal_reader.read(send_buf)?; - let send_buf = &send_buf[..send_size]; - - // Write some data to the network socket. - pgb.write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: start_pos.0, - wal_end: end_pos.0, - timestamp: get_current_timestamp(), - data: send_buf, - })) - .context("Failed to send XLogData")?; - - start_pos += send_size as u64; - trace!("sent WAL up to {}", start_pos); - } - Ok(()) - } -} diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs deleted file mode 100644 index c639e81b79..0000000000 --- a/walkeeper/src/timeline.rs +++ /dev/null @@ -1,490 +0,0 @@ -//! This module contains timeline id -> safekeeper state map with file-backed -//! persistence and support for interaction between sending and receiving wal. 
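The sending/receiving interaction this module comment refers to is built on a mutex-protected shared state plus a condition variable: the WAL-receiving side advances notified_commit_lsn and notifies, while WAL senders wait for it with a timeout (see wait_for_lsn and notify_wal_senders below). A stripped-down sketch of just that pattern, with a plain u64 standing in for Lsn and none of the replica or callmemaybe bookkeeping, could look like:

// Sketch only: the wait/notify pattern behind Timeline::wait_for_lsn.
use std::sync::{Condvar, Mutex};
use std::time::Duration;

struct CommitNotifier {
    commit_lsn: Mutex<u64>,
    cond: Condvar,
}

impl CommitNotifier {
    // Receiver side: record a higher commit LSN and wake all waiting senders.
    fn notify(&self, lsn: u64) {
        let mut committed = self.commit_lsn.lock().unwrap();
        if *committed < lsn {
            *committed = lsn;
            self.cond.notify_all();
        }
    }

    // Sender side: wait until something strictly past `lsn` is committed,
    // or return None when a single wait times out.
    fn wait_past(&self, lsn: u64, timeout: Duration) -> Option<u64> {
        let mut committed = self.commit_lsn.lock().unwrap();
        loop {
            if *committed > lsn {
                return Some(*committed);
            }
            let (guard, result) = self.cond.wait_timeout(committed, timeout).unwrap();
            if result.timed_out() {
                return None;
            }
            committed = guard;
        }
    }
}

fn main() {
    let n = CommitNotifier { commit_lsn: Mutex::new(0), cond: Condvar::new() };
    n.notify(10);
    assert_eq!(n.wait_past(5, Duration::from_millis(10)), Some(10));
    assert_eq!(n.wait_past(10, Duration::from_millis(10)), None);
}

As in the removed code, the comparison is strictly greater-than, so a sender already at the committed LSN keeps polling until new WAL is committed or the timeout fires.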
- -use anyhow::{Context, Result}; - -use lazy_static::lazy_static; - -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::fs::{self}; - -use std::sync::{Arc, Condvar, Mutex}; -use std::time::Duration; -use tokio::sync::mpsc::UnboundedSender; -use tracing::*; - -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; - -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; -use crate::control_file::{self, CreateControlFile}; - -use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, -}; -use crate::send_wal::HotStandbyFeedback; -use crate::wal_storage::{self, Storage}; -use crate::SafeKeeperConf; - -use zenith_utils::pq_proto::ZenithFeedback; - -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Replica status update + hot standby feedback -#[derive(Debug, Clone, Copy)] -pub struct ReplicaState { - /// last known lsn received by replica - pub last_received_lsn: Lsn, // None means we don't know - /// combined remote consistent lsn of pageservers - pub remote_consistent_lsn: Lsn, - /// combined hot standby feedback from all replicas - pub hs_feedback: HotStandbyFeedback, - /// Zenith specific feedback received from pageserver, if any - pub zenith_feedback: Option, -} - -impl Default for ReplicaState { - fn default() -> Self { - Self::new() - } -} - -impl ReplicaState { - pub fn new() -> ReplicaState { - ReplicaState { - last_received_lsn: Lsn::MAX, - remote_consistent_lsn: Lsn(0), - hs_feedback: HotStandbyFeedback { - ts: 0, - xmin: u64::MAX, - catalog_xmin: u64::MAX, - }, - zenith_feedback: None, - } - } -} - -/// Shared state associated with database instance -struct SharedState { - /// Safekeeper object - sk: SafeKeeper, - /// For receiving-sending wal cooperation - /// quorum commit LSN we've notified walsenders about - notified_commit_lsn: Lsn, - /// State of replicas - replicas: Vec>, - /// Inactive clusters shouldn't occupy any resources, so timeline is - /// activated whenever there is a compute connection or pageserver is not - /// caughtup (it must have latest WAL for new compute start) and suspended - /// otherwise. - /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. - active: bool, - num_computes: u32, - pageserver_connstr: Option, -} - -impl SharedState { - /// Restore SharedState from control file. - /// If create=false and file doesn't exist, bails out. - fn create_restore( - conf: &SafeKeeperConf, - zttid: &ZTenantTimelineId, - create: CreateControlFile, - ) -> Result { - let state = control_file::FileStorage::load_control_file_conf(conf, zttid, create) - .context("failed to load from control file")?; - - let control_store = control_file::FileStorage::new(zttid, conf); - - let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - - info!("timeline {} created or restored", zttid.timeline_id); - - Ok(Self { - notified_commit_lsn: Lsn(0), - sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state), - replicas: Vec::new(), - active: false, - num_computes: 0, - pageserver_connstr: None, - }) - } - - /// Activate the timeline: start/change walsender (via callmemaybe). - fn activate( - &mut self, - zttid: &ZTenantTimelineId, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - // unsub old sub. 
xxx: callmemaybe is going out - let old_subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(old_subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); - } - if let Some(pageserver_connstr) = pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - // xx: sending to channel under lock is not very cool, but - // shouldn't be a problem here. If it is, we can grab a counter - // here and later augment channel messages with it. - callmemaybe_tx - .send(CallmeEvent::Subscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Subscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is subscribed to callmemaybe to {}", - zttid.timeline_id, pageserver_connstr - ); - } - self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - self.active = true; - Ok(()) - } - - /// Deactivate the timeline: stop callmemaybe. - fn deactivate( - &mut self, - zttid: &ZTenantTimelineId, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if self.active { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is unsubscribed from callmemaybe to {}", - zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); - } - self.active = false; - } - Ok(()) - } - - /// Get combined state of all alive replicas - pub fn get_replicas_state(&self) -> ReplicaState { - let mut acc = ReplicaState::new(); - for state in self.replicas.iter().flatten() { - acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts); - acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin); - acc.hs_feedback.catalog_xmin = - min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin); - - // FIXME - // If multiple pageservers are streaming WAL and send feedback for the same timeline simultaneously, - // this code is not correct. - // Now the most advanced feedback is used. - // If one pageserver lags when another doesn't, the backpressure won't be activated on compute and lagging - // pageserver is prone to timeout errors. - // - // To choose what feedback to use and resend to compute node, - // we need to know which pageserver compute node considers to be main. - // See https://github.com/zenithdb/zenith/issues/1171 - // - if let Some(zenith_feedback) = state.zenith_feedback { - if let Some(acc_feedback) = acc.zenith_feedback { - if acc_feedback.ps_writelsn < zenith_feedback.ps_writelsn { - warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); - acc.zenith_feedback = Some(zenith_feedback); - } - } else { - acc.zenith_feedback = Some(zenith_feedback); - } - - // last lsn received by pageserver - // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. 
- // See https://github.com/zenithdb/zenith/issues/1171 - acc.last_received_lsn = Lsn::from(zenith_feedback.ps_writelsn); - - // When at least one pageserver has preserved data up to remote_consistent_lsn, - // safekeeper is free to delete it, so choose max of all pageservers. - acc.remote_consistent_lsn = max( - Lsn::from(zenith_feedback.ps_applylsn), - acc.remote_consistent_lsn, - ); - } - } - acc - } - - /// Assign new replica ID. We choose first empty cell in the replicas vector - /// or extend the vector if there are no free slots. - pub fn add_replica(&mut self, state: ReplicaState) -> usize { - if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) { - self.replicas[pos] = Some(state); - return pos; - } - let pos = self.replicas.len(); - self.replicas.push(Some(state)); - pos - } -} - -/// Database instance (tenant) -pub struct Timeline { - pub zttid: ZTenantTimelineId, - mutex: Mutex, - /// conditional variable used to notify wal senders - cond: Condvar, -} - -impl Timeline { - fn new(zttid: ZTenantTimelineId, shared_state: SharedState) -> Timeline { - Timeline { - zttid, - mutex: Mutex::new(shared_state), - cond: Condvar::new(), - } - } - - /// Register compute connection, starting timeline-related activity if it is - /// not running yet. - /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_connect( - &self, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes += 1; - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, callmemaybe_tx)?; - Ok(()) - } - - /// De-register compute connection, shutting down timeline activity if - /// pageserver doesn't need catchup. - /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_disconnect( - &self, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes -= 1; - // If there is no pageserver, can suspend right away; otherwise let - // walsender do that. - if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; - } - Ok(()) - } - - /// Deactivate tenant if there is no computes and pageserver is caughtup, - /// assuming the pageserver status is in replica_id. - /// Returns true if deactivated. - pub fn check_deactivate( - &self, - replica_id: usize, - callmemaybe_tx: &UnboundedSender, - ) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { - // already suspended - return Ok(true); - } - if shared_state.num_computes == 0 { - let replica_state = shared_state.replicas[replica_id].unwrap(); - let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet - (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.commit_lsn); - if deactivate { - shared_state.deactivate(&self.zttid, callmemaybe_tx)?; - return Ok(true); - } - } - Ok(false) - } - - /// Timed wait for an LSN to be committed. 
- /// - /// Returns the last committed LSN, which will be at least - /// as high as the LSN waited for, or None if timeout expired. - /// - pub fn wait_for_lsn(&self, lsn: Lsn) -> Option { - let mut shared_state = self.mutex.lock().unwrap(); - loop { - let commit_lsn = shared_state.notified_commit_lsn; - // This must be `>`, not `>=`. - if commit_lsn > lsn { - return Some(commit_lsn); - } - let result = self - .cond - .wait_timeout(shared_state, POLL_STATE_TIMEOUT) - .unwrap(); - if result.1.timed_out() { - return None; - } - shared_state = result.0 - } - } - - // Notify caught-up WAL senders about new WAL data received - pub fn notify_wal_senders(&self, commit_lsn: Lsn) { - let mut shared_state = self.mutex.lock().unwrap(); - if shared_state.notified_commit_lsn < commit_lsn { - shared_state.notified_commit_lsn = commit_lsn; - self.cond.notify_all(); - } - } - - /// Pass arrived message to the safekeeper. - pub fn process_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - let mut rmsg: Option; - let commit_lsn: Lsn; - { - let mut shared_state = self.mutex.lock().unwrap(); - rmsg = shared_state.sk.process_msg(msg)?; - // locally available commit lsn. flush_lsn can be smaller than - // commit_lsn if we are catching up safekeeper. - commit_lsn = shared_state.sk.commit_lsn; - - // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn - if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - let state = shared_state.get_replicas_state(); - resp.hs_feedback = state.hs_feedback; - if let Some(zenith_feedback) = state.zenith_feedback { - resp.zenith_feedback = zenith_feedback; - } - } - } - // Ping wal sender that new data might be available. - self.notify_wal_senders(commit_lsn); - Ok(rmsg) - } - - pub fn get_info(&self) -> SafeKeeperState { - self.mutex.lock().unwrap().sk.s.clone() - } - - pub fn add_replica(&self, state: ReplicaState) -> usize { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.add_replica(state) - } - - pub fn update_replica_state(&self, id: usize, state: ReplicaState) { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.replicas[id] = Some(state); - } - - pub fn remove_replica(&self, id: usize) { - let mut shared_state = self.mutex.lock().unwrap(); - assert!(shared_state.replicas[id].is_some()); - shared_state.replicas[id] = None; - } - - pub fn get_end_of_wal(&self) -> Lsn { - let shared_state = self.mutex.lock().unwrap(); - shared_state.sk.wal_store.flush_lsn() - } -} - -// Utilities needed by various Connection-like objects -pub trait TimelineTools { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()>; - - fn get(&self) -> &Arc; -} - -impl TimelineTools for Option> { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()> { - // We will only set the timeline once. If it were to ever change, - // anyone who cloned the Arc would be out of date. - assert!(self.is_none()); - *self = Some(GlobalTimelines::get(conf, zttid, create)?); - Ok(()) - } - - fn get(&self) -> &Arc { - self.as_ref().unwrap() - } -} - -lazy_static! { - pub static ref TIMELINES: Mutex>> = - Mutex::new(HashMap::new()); -} - -/// A zero-sized struct used to manage access to the global timelines map. -pub struct GlobalTimelines; - -impl GlobalTimelines { - /// Get a timeline with control file loaded from the global TIMELINES map. 
- /// If control file doesn't exist and create=false, bails out. - pub fn get( - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result> { - let mut timelines = TIMELINES.lock().unwrap(); - - match timelines.get(&zttid) { - Some(result) => Ok(Arc::clone(result)), - None => { - if let CreateControlFile::True = create { - let dir = conf.timeline_dir(&zttid); - info!( - "creating timeline dir {}, create is {:?}", - dir.display(), - create - ); - fs::create_dir_all(dir)?; - } - - let shared_state = SharedState::create_restore(conf, &zttid, create) - .context("failed to restore shared state")?; - - let new_tli = Arc::new(Timeline::new(zttid, shared_state)); - timelines.insert(zttid, Arc::clone(&new_tli)); - Ok(new_tli) - } - } - } -} diff --git a/walkeeper/src/wal_storage.rs b/walkeeper/src/wal_storage.rs deleted file mode 100644 index f8abc26af9..0000000000 --- a/walkeeper/src/wal_storage.rs +++ /dev/null @@ -1,493 +0,0 @@ -//! This module has everything to deal with WAL -- reading and writing to disk. -//! -//! Safekeeper WAL is stored in the timeline directory, in format similar to pg_wal. -//! PG timeline is always 1, so WAL segments are usually have names like this: -//! - 000000010000000000000001 -//! - 000000010000000000000002.partial -//! -//! Note that last file has `.partial` suffix, that's different from postgres. - -use anyhow::{anyhow, Context, Result}; -use std::io::{Read, Seek, SeekFrom}; - -use lazy_static::lazy_static; -use postgres_ffi::xlog_utils::{find_end_of_wal, XLogSegNo, PG_TLI}; -use std::cmp::min; - -use std::fs::{self, File, OpenOptions}; -use std::io::Write; -use std::path::{Path, PathBuf}; - -use tracing::*; - -use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; - -use crate::safekeeper::SafeKeeperState; - -use crate::SafeKeeperConf; -use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; - -use postgres_ffi::waldecoder::WalStreamDecoder; - -use zenith_metrics::{ - register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec, - DISK_WRITE_SECONDS_BUCKETS, -}; - -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. 
- static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_flush_lsn", - "Current flush_lsn, grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_flush_lsn gauge vec"); - static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!( - "safekeeper_write_wal_bytes", - "Bytes written to WAL in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - vec![1.0, 10.0, 100.0, 1024.0, 8192.0, 128.0 * 1024.0, 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0] - ) - .expect("Failed to register safekeeper_write_wal_bytes histogram vec"); - static ref WRITE_WAL_SECONDS: HistogramVec = register_histogram_vec!( - "safekeeper_write_wal_seconds", - "Seconds spent writing and syncing WAL to a disk in a single request, grouped by timeline", - &["tenant_id", "timeline_id"], - DISK_WRITE_SECONDS_BUCKETS.to_vec() - ) - .expect("Failed to register safekeeper_write_wal_seconds histogram vec"); -} - -struct WalStorageMetrics { - flush_lsn: Gauge, - write_wal_bytes: Histogram, - write_wal_seconds: Histogram, -} - -impl WalStorageMetrics { - fn new(zttid: &ZTenantTimelineId) -> Self { - let tenant_id = zttid.tenant_id.to_string(); - let timeline_id = zttid.timeline_id.to_string(); - Self { - flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), - write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - -pub trait Storage { - /// lsn of last durably stored WAL record. - fn flush_lsn(&self) -> Lsn; - - /// Init storage with wal_seg_size and read WAL from disk to get latest lsn. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()>; - - /// Write piece of wal in buf to disk and sync it. - fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; - - // Truncate WAL at specified LSN. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; -} - -pub struct PhysicalStorage { - metrics: WalStorageMetrics, - zttid: ZTenantTimelineId, - timeline_dir: PathBuf, - conf: SafeKeeperConf, - - // fields below are filled upon initialization - - // None if unitialized, Some(lsn) if storage is initialized - wal_seg_size: Option, - - // Relationship of lsns: - // `write_lsn` >= `write_record_lsn` >= `flush_record_lsn` - // - // All lsns are zeroes, if storage is just created, and there are no segments on disk. - - // Written to disk, but possibly still in the cache and not fully persisted. - // Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. - write_lsn: Lsn, - - // The LSN of the last WAL record written to disk. Still can be not fully flushed. - write_record_lsn: Lsn, - - // The LSN of the last WAL record flushed to disk. - flush_record_lsn: Lsn, - - // Decoder is required for detecting boundaries of WAL records. 
- decoder: WalStreamDecoder, -} - -impl PhysicalStorage { - pub fn new(zttid: &ZTenantTimelineId, conf: &SafeKeeperConf) -> PhysicalStorage { - let timeline_dir = conf.timeline_dir(zttid); - PhysicalStorage { - metrics: WalStorageMetrics::new(zttid), - zttid: *zttid, - timeline_dir, - conf: conf.clone(), - wal_seg_size: None, - write_lsn: Lsn(0), - write_record_lsn: Lsn(0), - flush_record_lsn: Lsn(0), - decoder: WalStreamDecoder::new(Lsn(0)), - } - } - - // wrapper for flush_lsn updates that also updates metrics - fn update_flush_lsn(&mut self) { - self.flush_record_lsn = self.write_record_lsn; - self.metrics.flush_lsn.set(self.flush_record_lsn.0 as f64); - } - - /// Helper returning full path to WAL segment file and its .partial brother. - fn wal_file_paths(&self, segno: XLogSegNo) -> Result<(PathBuf, PathBuf)> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - - let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name.clone()); - let wal_file_partial_path = self.timeline_dir.join(wal_file_name + ".partial"); - Ok((wal_file_path, wal_file_partial_path)) - } - - // TODO: this function is going to be refactored soon, what will change: - // - flush will be called separately from write_wal, this function - // will only write bytes to disk - // - File will be cached in PhysicalStorage, to remove extra syscalls, - // such as open(), seek(), close() - fn write_and_flush(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - - let mut bytes_left: usize = buf.len(); - let mut bytes_written: usize = 0; - let mut partial; - let mut start_pos = startpos; - const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - - /* Extract WAL location for this block */ - let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize; - - while bytes_left != 0 { - let bytes_to_write; - - /* - * If crossing a WAL boundary, only write up until we reach wal - * segment size. 
- */ - if xlogoff + bytes_left > wal_seg_size { - bytes_to_write = wal_seg_size - xlogoff; - } else { - bytes_to_write = bytes_left; - } - - /* Open file */ - let segno = start_pos.segment_number(wal_seg_size); - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; - { - let mut wal_file: File; - /* Try to open already completed segment */ - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { - wal_file = file; - partial = false; - } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) - { - /* Try to open existed partial file */ - wal_file = file; - partial = true; - } else { - /* Create and fill new partial file */ - partial = true; - match OpenOptions::new() - .create(true) - .write(true) - .open(&wal_file_partial_path) - { - Ok(mut file) => { - for _ in 0..(wal_seg_size / XLOG_BLCKSZ) { - file.write_all(ZERO_BLOCK)?; - } - wal_file = file; - } - Err(e) => { - error!("Failed to open log file {:?}: {}", &wal_file_path, e); - return Err(e.into()); - } - } - } - wal_file.seek(SeekFrom::Start(xlogoff as u64))?; - wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?; - - // Flush file, if not said otherwise - if !self.conf.no_sync { - wal_file.sync_all()?; - } - } - /* Write was successful, advance our position */ - bytes_written += bytes_to_write; - bytes_left -= bytes_to_write; - start_pos += bytes_to_write as u64; - xlogoff += bytes_to_write; - - /* Did we reach the end of a WAL segment? */ - if start_pos.segment_offset(wal_seg_size) == 0 { - xlogoff = 0; - if partial { - fs::rename(&wal_file_partial_path, &wal_file_path)?; - } - } - } - Ok(()) - } -} - -impl Storage for PhysicalStorage { - // flush_lsn returns lsn of last durably stored WAL record. - fn flush_lsn(&self) -> Lsn { - self.flush_record_lsn - } - - // Storage needs to know wal_seg_size to know which segment to read/write, but - // wal_seg_size is not always known at the moment of storage creation. This method - // allows to postpone its initialization. - fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { - if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown - return Ok(()); - } - - if let Some(wal_seg_size) = self.wal_seg_size { - // physical storage is already initialized - assert_eq!(wal_seg_size, state.server.wal_seg_size as usize); - return Ok(()); - } - - // initialize physical storage - let wal_seg_size = state.server.wal_seg_size as usize; - self.wal_seg_size = Some(wal_seg_size); - - // we need to read WAL from disk to know which LSNs are stored on disk - self.write_lsn = - Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.wal_start_lsn)?.0); - - self.write_record_lsn = self.write_lsn; - - // TODO: do we really know that write_lsn is fully flushed to disk? - // If not, maybe it's better to call fsync() here to be sure? - self.update_flush_lsn(); - - info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, truncate_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.truncate_lsn, - ); - if self.flush_record_lsn < state.commit_lsn || self.flush_record_lsn < state.truncate_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or truncate_lsn from control file", self.zttid.timeline_id); - } - - Ok(()) - } - - // Write and flush WAL to disk. 
- fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { - if self.write_lsn > startpos { - warn!( - "write_wal rewrites WAL written before, write_lsn={}, startpos={}", - self.write_lsn, startpos - ); - } - if self.write_lsn < startpos { - warn!( - "write_wal creates gap in written WAL, write_lsn={}, startpos={}", - self.write_lsn, startpos - ); - // TODO: return error if write_lsn is not zero - } - - { - let _timer = self.metrics.write_wal_seconds.start_timer(); - self.write_and_flush(startpos, buf)?; - } - - // WAL is written and flushed, updating lsns - self.write_lsn = startpos + buf.len() as u64; - self.metrics.write_wal_bytes.observe(buf.len() as f64); - - // figure out last record's end lsn for reporting (if we got the - // whole record) - if self.decoder.available() != startpos { - info!( - "restart decoder from {} to {}", - self.decoder.available(), - startpos, - ); - self.decoder = WalStreamDecoder::new(startpos); - } - self.decoder.feed_bytes(buf); - loop { - match self.decoder.poll_decode()? { - None => break, // no full record yet - Some((lsn, _rec)) => { - self.write_record_lsn = lsn; - } - } - } - - self.update_flush_lsn(); - Ok(()) - } - - // Truncate written WAL by removing all WAL segments after the given LSN. - // end_pos must point to the end of the WAL record. - fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { - let wal_seg_size = self - .wal_seg_size - .ok_or_else(|| anyhow!("wal_seg_size is not initialized"))?; - - // TODO: cross check divergence point - - // nothing to truncate - if self.write_lsn == Lsn(0) { - return Ok(()); - } - - // Streaming must not create a hole, so truncate cannot be called on non-written lsn - assert!(self.write_lsn >= end_pos); - - // open segment files and delete or fill end with zeroes - - let partial; - const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - - /* Extract WAL location for this block */ - let mut xlogoff = end_pos.segment_offset(wal_seg_size) as usize; - - /* Open file */ - let mut segno = end_pos.segment_number(wal_seg_size); - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; - { - let mut wal_file: File; - /* Try to open already completed segment */ - if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) { - wal_file = file; - partial = false; - } else { - wal_file = OpenOptions::new() - .write(true) - .open(&wal_file_partial_path)?; - partial = true; - } - wal_file.seek(SeekFrom::Start(xlogoff as u64))?; - while xlogoff < wal_seg_size { - let bytes_to_write = min(XLOG_BLCKSZ, wal_seg_size - xlogoff); - wal_file.write_all(&ZERO_BLOCK[0..bytes_to_write])?; - xlogoff += bytes_to_write; - } - // Flush file, if not said otherwise - if !self.conf.no_sync { - wal_file.sync_all()?; - } - } - if !partial { - // Make segment partial once again - fs::rename(&wal_file_path, &wal_file_partial_path)?; - } - // Remove all subsequent segments - loop { - segno += 1; - let (wal_file_path, wal_file_partial_path) = self.wal_file_paths(segno)?; - // TODO: better use fs::try_exists which is currenty avaialble only in nightly build - if wal_file_path.exists() { - fs::remove_file(&wal_file_path)?; - } else if wal_file_partial_path.exists() { - fs::remove_file(&wal_file_partial_path)?; - } else { - break; - } - } - - // Update lsns - self.write_lsn = end_pos; - self.write_record_lsn = end_pos; - self.update_flush_lsn(); - Ok(()) - } -} - -pub struct WalReader { - timeline_dir: PathBuf, - wal_seg_size: usize, - pos: Lsn, - file: Option, -} - -impl WalReader { - pub fn new(timeline_dir: 
PathBuf, wal_seg_size: usize, pos: Lsn) -> Self { - Self { - timeline_dir, - wal_seg_size, - pos, - file: None, - } - } - - pub fn read(&mut self, buf: &mut [u8]) -> Result { - // Take the `File` from `wal_file`, or open a new file. - let mut file = match self.file.take() { - Some(file) => file, - None => { - // Open a new file. - let segno = self.pos.segment_number(self.wal_seg_size); - let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); - Self::open_wal_file(&wal_file_path)? - } - }; - - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; - - // How much to read and send in message? We cannot cross the WAL file - // boundary, and we don't want send more than provided buffer. - let send_size = min(buf.len(), self.wal_seg_size - xlogoff); - - // Read some data from the file. - let buf = &mut buf[0..send_size]; - file.seek(SeekFrom::Start(xlogoff as u64)) - .and_then(|_| file.read_exact(buf)) - .context("Failed to read data from WAL file")?; - - self.pos += send_size as u64; - - // Decide whether to reuse this file. If we don't set wal_file here - // a new file will be opened next time. - if self.pos.segment_offset(self.wal_seg_size) != 0 { - self.file = Some(file); - } - - Ok(send_size) - } - - /// Helper function for opening a wal file. - fn open_wal_file(wal_file_path: &Path) -> Result { - // First try to open the .partial file. - let mut partial_path = wal_file_path.to_owned(); - partial_path.set_extension("partial"); - if let Ok(opened_file) = File::open(&partial_path) { - return Ok(opened_file); - } - - // If that failed, try it without the .partial extension. - File::open(&wal_file_path) - .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) - .map_err(|e| { - error!("{}", e); - e - }) - } -} diff --git a/workspace_hack/.gitattributes b/workspace_hack/.gitattributes new file mode 100644 index 0000000000..3e9dba4b64 --- /dev/null +++ b/workspace_hack/.gitattributes @@ -0,0 +1,4 @@ +# Avoid putting conflict markers in the generated Cargo.toml file, since their presence breaks +# Cargo. +# Also do not check out the file as CRLF on Windows, as that's what hakari needs. +Cargo.toml merge=binary -crlf diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 48d81bbc07..2daa08c9b6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -1,22 +1,67 @@ +# This file is generated by `cargo hakari`. +# To regenerate, run: +# cargo hakari generate + [package] name = "workspace_hack" version = "0.1.0" -edition = "2021" +description = "workspace-hack package, managed by hakari" +# You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. +publish = false +# The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments +# are managed by hakari. 
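# Each generated entry pins the union of features requested anywhere in the
# workspace, so every crate builds these dependencies with one consistent
# feature set; the manually pasted lists removed below served the same purpose.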
-[target.'cfg(all())'.dependencies] -libc = { version = "0.2", features = ["default", "extra_traits", "std"] } -memchr = { version = "2", features = ["default", "std", "use_std"] } -num-integer = { version = "0.1", default-features = false, features = ["std"] } -num-traits = { version = "0.2", default-features = false, features = ["std"] } -regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -serde = { version = "1", features = ["default", "derive", "serde_derive", "std"] } +### BEGIN HAKARI SECTION +[dependencies] +ahash = { version = "0.7", features = ["std"] } +anyhow = { version = "1", features = ["backtrace", "std"] } +bytes = { version = "1", features = ["serde", "std"] } +chrono = { version = "0.4", features = ["clock", "iana-time-zone", "js-sys", "oldtime", "serde", "std", "time", "wasm-bindgen", "wasmbind", "winapi"] } +clap = { version = "4", features = ["color", "error-context", "help", "std", "string", "suggestions", "usage"] } +crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } +either = { version = "1", features = ["use_std"] } +fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] } +futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] } +futures-util = { version = "0.3", default-features = false, features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } +libc = { version = "0.2", features = ["extra_traits", "std"] } +log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std"] } +nom = { version = "7", features = ["alloc", "std"] } +num-bigint = { version = "0.4", features = ["std"] } +num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-traits = { version = "0.2", features = ["i128", "libm", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } +rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } +regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] } +scopeguard = { version = "1", features = ["use_std"] } +serde = 
{ version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +stable_deref_trait = { version = "1", features = ["alloc", "std"] } +time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } +tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } +tracing-core = { version = "0.1", features = ["once_cell", "std"] } -[target.'cfg(all())'.build-dependencies] -libc = { version = "0.2", features = ["default", "extra_traits", "std"] } -memchr = { version = "2", features = ["default", "std", "use_std"] } -proc-macro2 = { version = "1", features = ["default", "proc-macro"] } -quote = { version = "1", features = ["default", "proc-macro"] } -regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } -syn = { version = "1", features = ["clone-impls", "default", "derive", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] } +[build-dependencies] +ahash = { version = "0.7", features = ["std"] } +anyhow = { version = "1", features = ["backtrace", "std"] } +bytes = { version = "1", features = ["serde", "std"] } +either = { version = "1", features = ["use_std"] } +hashbrown = { version = "0.12", features = ["ahash", "inline-more", "raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } +libc = { version = "0.2", features = ["extra_traits", "std"] } +log = { version = "0.4", default-features = false, features = ["serde", "std"] } +memchr = { version = "2", features = ["std"] } +nom = { version = "7", features = ["alloc", "std"] } +prost = { version = "0.10", features = ["prost-derive", "std"] } +regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } +serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +syn = { version = "1", features = ["clone-impls", "derive", "extra-traits", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] } + +### END HAKARI SECTION diff --git a/workspace_hack/build.rs b/workspace_hack/build.rs new file mode 100644 index 0000000000..92518ef04c --- /dev/null +++ b/workspace_hack/build.rs @@ -0,0 +1,2 @@ +// A build script is required for cargo to consider build dependencies. +fn main() {} diff --git a/workspace_hack/src/lib.rs b/workspace_hack/src/lib.rs index ceba3d145d..22489f632b 100644 --- a/workspace_hack/src/lib.rs +++ b/workspace_hack/src/lib.rs @@ -1,23 +1 @@ -//! 
This crate contains no code. -//! -//! The workspace_hack crate exists only to pin down some dependencies, -//! so that those dependencies always build with the same features, -//! under a few different cases that can be problematic: -//! - Running `cargo check` or `cargo build` from a crate sub-directory -//! instead of the workspace root. -//! - Running `cargo install`, which can only be done per-crate -//! -//! The dependency lists in Cargo.toml were automatically generated by -//! a tool called -//! [Hakari](https://github.com/facebookincubator/cargo-guppy/tree/main/tools/hakari). -//! -//! Hakari doesn't have a CLI yet; in the meantime the example code in -//! their `README` file is enough to regenerate the dependencies. -//! Hakari's output was pasted into Cargo.toml, except for the -//! following manual edits: -//! - `winapi` dependency was removed. This is probably just due to the -//! fact that Hakari's target analysis is incomplete. -//! -//! There isn't any penalty to this data falling out of date; it just -//! means that under the conditions above Cargo will rebuild more -//! packages than strictly necessary. +// This is a stub lib.rs. diff --git a/zenith/Cargo.toml b/zenith/Cargo.toml deleted file mode 100644 index 8adbda0723..0000000000 --- a/zenith/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "zenith" -version = "0.1.0" -edition = "2021" - -[dependencies] -clap = "3.0" -anyhow = "1.0" -serde_json = "1" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } - -# FIXME: 'pageserver' is needed for BranchInfo. Refactor -pageserver = { path = "../pageserver" } -control_plane = { path = "../control_plane" } -walkeeper = { path = "../walkeeper" } -postgres_ffi = { path = "../postgres_ffi" } -zenith_utils = { path = "../zenith_utils" } -workspace_hack = { path = "../workspace_hack" } diff --git a/zenith/src/main.rs b/zenith/src/main.rs deleted file mode 100644 index a2a762f5be..0000000000 --- a/zenith/src/main.rs +++ /dev/null @@ -1,731 +0,0 @@ -use anyhow::{bail, Context, Result}; -use clap::{App, AppSettings, Arg, ArgMatches}; -use control_plane::compute::ComputeControlPlane; -use control_plane::local_env; -use control_plane::local_env::LocalEnv; -use control_plane::safekeeper::SafekeeperNode; -use control_plane::storage::PageServerNode; -use pageserver::config::defaults::{ - DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, - DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, -}; -use std::collections::HashMap; -use std::process::exit; -use std::str::FromStr; -use walkeeper::defaults::{ - DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, -}; -use zenith_utils::auth::{Claims, Scope}; -use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; -use zenith_utils::GIT_VERSION; - -use pageserver::branches::BranchInfo; - -// Default name of a safekeeper node, if not specified on the command line. 
-const DEFAULT_SAFEKEEPER_NAME: &str = "single"; - -fn default_conf() -> String { - format!( - r#" -# Default built-in configuration, defined in main.rs -[pageserver] -listen_pg_addr = '{pageserver_pg_addr}' -listen_http_addr = '{pageserver_http_addr}' -auth_type = '{pageserver_auth_type}' - -[[safekeepers]] -name = '{safekeeper_name}' -pg_port = {safekeeper_pg_port} -http_port = {safekeeper_http_port} -"#, - pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, - pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, - pageserver_auth_type = AuthType::Trust, - safekeeper_name = DEFAULT_SAFEKEEPER_NAME, - safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, - safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, - ) -} - -/// -/// Branches tree element used as a value in the HashMap. -/// -struct BranchTreeEl { - /// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call. - pub info: BranchInfo, - /// Holds all direct children of this branch referenced using `timeline_id`. - pub children: Vec, -} - -// Main entry point for the 'zenith' CLI utility -// -// This utility helps to manage zenith installation. That includes following: -// * Management of local postgres installations running on top of the -// pageserver. -// * Providing CLI api to the pageserver -// * TODO: export/import to/from usual postgres -fn main() -> Result<()> { - #[rustfmt::skip] // rustfmt squashes these into a single line otherwise - let pg_node_arg = Arg::new("node") - .index(1) - .help("Node name") - .required(true); - - #[rustfmt::skip] - let safekeeper_node_arg = Arg::new("node") - .index(1) - .help("Node name") - .required(false); - - let timeline_arg = Arg::new("timeline") - .index(2) - .help("Branch name or a point-in time specification") - .required(false); - - let tenantid_arg = Arg::new("tenantid") - .long("tenantid") - .help("Tenant id. 
Represented as a hexadecimal string 32 symbols length") - .takes_value(true) - .required(false); - - let port_arg = Arg::new("port") - .long("port") - .required(false) - .value_name("port"); - - let stop_mode_arg = Arg::new("stop-mode") - .short('m') - .takes_value(true) - .possible_values(&["fast", "immediate"]) - .help("If 'immediate', don't flush repository data at shutdown") - .required(false) - .value_name("stop-mode"); - - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .takes_value(true) - .number_of_values(1) - .multiple_occurrences(true) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - - let matches = App::new("Zenith CLI") - .setting(AppSettings::ArgRequiredElseHelp) - .version(GIT_VERSION) - .subcommand( - App::new("init") - .about("Initialize a new Zenith repository") - .arg(pageserver_config_args.clone()) - .arg( - Arg::new("config") - .long("config") - .required(false) - .value_name("config"), - ) - ) - .subcommand( - App::new("branch") - .about("Create a new branch") - .arg(Arg::new("branchname").required(false).index(1)) - .arg(Arg::new("start-point").required(false).index(2)) - .arg(tenantid_arg.clone()), - ).subcommand( - App::new("tenant") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage tenants") - .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1))) - ) - .subcommand( - App::new("pageserver") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage pageserver") - .subcommand(App::new("status")) - .subcommand(App::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(App::new("stop").about("Stop local pageserver") - .arg(stop_mode_arg.clone())) - .subcommand(App::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone())) - ) - .subcommand( - App::new("safekeeper") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage safekeepers") - .subcommand(App::new("start") - .about("Start local safekeeper") - .arg(safekeeper_node_arg.clone()) - ) - .subcommand(App::new("stop") - .about("Stop local safekeeper") - .arg(safekeeper_node_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - .subcommand(App::new("restart") - .about("Restart local safekeeper") - .arg(safekeeper_node_arg.clone()) - .arg(stop_mode_arg.clone()) - ) - ) - .subcommand( - App::new("pg") - .setting(AppSettings::ArgRequiredElseHelp) - .about("Manage postgres instances") - .subcommand(App::new("list").arg(tenantid_arg.clone())) - .subcommand(App::new("create") - .about("Create a postgres compute node") - .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) - .arg(port_arg.clone()) - .arg( - Arg::new("config-only") - .help("Don't do basebackup, create compute node with only config files") - .long("config-only") - .required(false) - )) - .subcommand(App::new("start") - .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") - .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) - .arg(port_arg.clone())) - .subcommand( - App::new("stop") - .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - 
.required(false) - ) - ) - - ) - .subcommand( - App::new("start") - .about("Start page server and safekeepers") - .arg(pageserver_config_args) - ) - .subcommand( - App::new("stop") - .about("Stop page server and safekeepers") - .arg(stop_mode_arg.clone()) - ) - .get_matches(); - - let (sub_name, sub_args) = match matches.subcommand() { - Some(subcommand_data) => subcommand_data, - None => bail!("no subcommand provided"), - }; - - // Check for 'zenith init' command first. - let subcmd_result = if sub_name == "init" { - handle_init(sub_args) - } else { - // all other commands need an existing config - let env = match LocalEnv::load_config() { - Ok(conf) => conf, - Err(e) => { - eprintln!("Error loading config: {}", e); - exit(1); - } - }; - - match sub_name { - "tenant" => handle_tenant(sub_args, &env), - "branch" => handle_branch(sub_args, &env), - "start" => handle_start_all(sub_args, &env), - "stop" => handle_stop_all(sub_args, &env), - "pageserver" => handle_pageserver(sub_args, &env), - "pg" => handle_pg(sub_args, &env), - "safekeeper" => handle_safekeeper(sub_args, &env), - _ => bail!("unexpected subcommand {}", sub_name), - } - }; - if let Err(e) = subcmd_result { - eprintln!("command failed: {:#}", e); - exit(1); - } - - Ok(()) -} - -/// -/// Prints branches list as a tree-like structure. -/// -fn print_branches_tree(branches: Vec) -> Result<()> { - let mut branches_hash: HashMap = HashMap::new(); - - // Form a hash table of branch timeline_id -> BranchTreeEl. - for branch in &branches { - branches_hash.insert( - branch.timeline_id.to_string(), - BranchTreeEl { - info: branch.clone(), - children: Vec::new(), - }, - ); - } - - // Memorize all direct children of each branch. - for branch in &branches { - if let Some(tid) = &branch.ancestor_id { - branches_hash - .get_mut(tid) - .context("missing branch info in the HashMap")? - .children - .push(branch.timeline_id.to_string()); - } - } - - // Sort children by tid to bring some minimal order. - for branch in &mut branches_hash.values_mut() { - branch.children.sort(); - } - - for branch in branches_hash.values() { - // Start with root branches (no ancestors) first. - // Now there is 'main' branch only, but things may change. - if branch.info.ancestor_id.is_none() { - print_branch(0, &Vec::from([true]), branch, &branches_hash)?; - } - } - - Ok(()) -} - -/// -/// Recursively prints branch info with all its children. -/// -fn print_branch( - nesting_level: usize, - is_last: &[bool], - branch: &BranchTreeEl, - branches: &HashMap, -) -> Result<()> { - // Draw main padding - print!(" "); - - if nesting_level > 0 { - let lsn = branch - .info - .ancestor_lsn - .as_ref() - .context("missing branch info in the HashMap")?; - let mut br_sym = "┣━"; - - // Draw each nesting padding with proper style - // depending on whether its branch ended or not. 
- if nesting_level > 1 { - for l in &is_last[1..is_last.len() - 1] { - if *l { - print!(" "); - } else { - print!("┃ "); - } - } - } - - // We are the last in this sub-branch - if *is_last.last().unwrap() { - br_sym = "┗━"; - } - - print!("{} @{}: ", br_sym, lsn); - } - - // Finally print a branch name with new line - println!("{}", branch.info.name); - - let len = branch.children.len(); - let mut i: usize = 0; - let mut is_last_new = Vec::from(is_last); - is_last_new.push(false); - - for child in &branch.children { - i += 1; - - // Mark that the last padding is the end of the branch - if i == len { - if let Some(last) = is_last_new.last_mut() { - *last = true; - } - } - - print_branch( - nesting_level + 1, - &is_last_new, - branches - .get(child) - .context("missing branch info in the HashMap")?, - branches, - )?; - } - - Ok(()) -} - -/// Returns a map of timeline IDs to branch_name@lsn strings. -/// Connects to the pageserver to query this information. -fn get_branch_infos( - env: &local_env::LocalEnv, - tenantid: &ZTenantId, -) -> Result> { - let page_server = PageServerNode::from_env(env); - let branch_infos: Vec = page_server.branch_list(tenantid)?; - let branch_infos: HashMap = branch_infos - .into_iter() - .map(|branch_info| (branch_info.timeline_id, branch_info)) - .collect(); - - Ok(branch_infos) -} - -// Helper function to parse --tenantid option, or get the default from config file -fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result { - if let Some(tenantid_cmd) = sub_match.value_of("tenantid") { - Ok(ZTenantId::from_str(tenantid_cmd)?) - } else if let Some(tenantid_conf) = env.default_tenantid { - Ok(tenantid_conf) - } else { - bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file"); - } -} - -fn handle_init(init_match: &ArgMatches) -> Result<()> { - // Create config file - let toml_file: String = if let Some(config_path) = init_match.value_of("config") { - // load and parse the file - std::fs::read_to_string(std::path::Path::new(config_path)) - .with_context(|| format!("Could not read configuration file \"{}\"", config_path))? - } else { - // Built-in default config - default_conf() - }; - - let mut env = - LocalEnv::create_config(&toml_file).context("Failed to create zenith configuration")?; - env.init() - .context("Failed to initialize zenith repository")?; - - // Call 'pageserver init'. - let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.init( - // default_tenantid was generated by the `env.init()` call above - Some(&env.default_tenantid.unwrap().to_string()), - &pageserver_config_overrides(init_match), - ) { - eprintln!("pageserver init failed: {}", e); - exit(1); - } - - Ok(()) -} - -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .values_of("pageserver-config-override") - .into_iter() - .flatten() - .collect() -} - -fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); - match tenant_match.subcommand() { - Some(("list", _)) => { - for t in pageserver.tenant_list()? 
{ - println!("{} {}", t.id, t.state); - } - } - Some(("create", create_match)) => { - let tenantid = match create_match.value_of("tenantid") { - Some(tenantid) => ZTenantId::from_str(tenantid)?, - None => ZTenantId::generate(), - }; - println!("using tenant id {}", tenantid); - pageserver.tenant_create(tenantid)?; - println!("tenant successfully created on the pageserver"); - } - Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), - None => bail!("no tenant subcommand provided"), - } - Ok(()) -} - -fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); - - let tenantid = get_tenantid(branch_match, env)?; - - if let Some(branchname) = branch_match.value_of("branchname") { - let startpoint_str = branch_match - .value_of("start-point") - .context("Missing start-point")?; - let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?; - println!( - "Created branch '{}' at {:?} for tenant: {}", - branch.name, branch.latest_valid_lsn, tenantid, - ); - } else { - // No arguments, list branches for tenant - let branches = pageserver.branch_list(&tenantid)?; - print_branches_tree(branches)?; - } - - Ok(()) -} - -fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match pg_match.subcommand() { - Some(pg_subcommand_data) => pg_subcommand_data, - None => bail!("no pg subcommand provided"), - }; - - let mut cplane = ComputeControlPlane::load(env.clone())?; - - // All subcommands take an optional --tenantid option - let tenantid = get_tenantid(sub_args, env)?; - - match sub_name { - "list" => { - let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| { - eprintln!("Failed to load branch info: {}", e); - HashMap::new() - }); - - println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS"); - for ((_, node_name), node) in cplane - .nodes - .iter() - .filter(|((node_tenantid, _), _)| node_tenantid == &tenantid) - { - // FIXME: This shows the LSN at the end of the timeline. It's not the - // right thing to do for read-only nodes that might be anchored at an - // older point in time, or following but lagging behind the primary. - let lsn_str = branch_infos - .get(&node.timelineid) - .map(|bi| bi.latest_valid_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()); - - println!( - "{}\t{}\t{}\t{}\t{}", - node_name, - node.address, - node.timelineid, // FIXME: resolve human-friendly branch name - lsn_str, - node.status(), - ); - } - } - "create" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name); - - let port: Option = match sub_args.value_of("port") { - Some(p) => Some(p.parse()?), - None => None, - }; - cplane.new_node(tenantid, node_name, timeline_name, port)?; - } - "start" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline"); - - let port: Option = match sub_args.value_of("port") { - Some(p) => Some(p.parse()?), - None => None, - }; - - let node = cplane.nodes.get(&(tenantid, node_name.to_owned())); - - let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { - let claims = Claims::new(Some(tenantid), Scope::Tenant); - - Some(env.generate_auth_token(&claims)?) 
- } else { - None - }; - - if let Some(node) = node { - if timeline_name.is_some() { - println!("timeline name ignored because node exists already"); - } - println!("Starting existing postgres {}...", node_name); - node.start(&auth_token)?; - } else { - // when used with custom port this results in non obvious behaviour - // port is remembered from first start command, i e - // start --port X - // stop - // start <-- will also use port X even without explicit port argument - let timeline_name = timeline_name.unwrap_or(node_name); - println!( - "Starting new postgres {} on {}...", - node_name, timeline_name - ); - let node = cplane.new_node(tenantid, node_name, timeline_name, port)?; - node.start(&auth_token)?; - } - } - "stop" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let destroy = sub_args.is_present("destroy"); - - let node = cplane - .nodes - .get(&(tenantid, node_name.to_owned())) - .with_context(|| format!("postgres {} is not found", node_name))?; - node.stop(destroy)?; - } - - _ => { - bail!("Unexpected pg subcommand '{}'", sub_name) - } - } - - Ok(()) -} - -fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); - - match sub_match.subcommand() { - Some(("start", start_match)) => { - if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) { - eprintln!("pageserver start failed: {}", e); - exit(1); - } - } - - Some(("stop", stop_match)) => { - let immediate = stop_match.value_of("stop-mode") == Some("immediate"); - - if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - } - - Some(("restart", restart_match)) => { - //TODO what shutdown strategy should we use here? 
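            // stop(false) requests a graceful shutdown (repository data is
            // flushed), as opposed to the 'immediate' stop-mode honored by the
            // stop subcommand, which skips flushing.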
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) { - eprintln!("pageserver start failed: {}", e); - exit(1); - } - } - Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name), - None => bail!("no pageserver subcommand provided"), - } - Ok(()) -} - -fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result { - if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) { - Ok(SafekeeperNode::from_env(env, node)) - } else { - bail!("could not find safekeeper '{}'", name) - } -} - -fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let (sub_name, sub_args) = match sub_match.subcommand() { - Some(safekeeper_command_data) => safekeeper_command_data, - None => bail!("no safekeeper subcommand provided"), - }; - - // All the commands take an optional safekeeper name argument - let node_name = sub_args.value_of("node").unwrap_or(DEFAULT_SAFEKEEPER_NAME); - let safekeeper = get_safekeeper(env, node_name)?; - - match sub_name { - "start" => { - if let Err(e) = safekeeper.start() { - eprintln!("safekeeper start failed: {}", e); - exit(1); - } - } - - "stop" => { - let immediate = sub_args.value_of("stop-mode") == Some("immediate"); - - if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); - exit(1); - } - } - - "restart" => { - let immediate = sub_args.value_of("stop-mode") == Some("immediate"); - - if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); - exit(1); - } - - if let Err(e) = safekeeper.start() { - eprintln!("safekeeper start failed: {}", e); - exit(1); - } - } - - _ => { - bail!("Unexpected safekeeper subcommand '{}'", sub_name) - } - } - Ok(()) -} - -fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let pageserver = PageServerNode::from_env(env); - - // Postgres nodes are not started automatically - - if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {}", e); - exit(1); - } - - for node in env.safekeepers.iter() { - let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e); - exit(1); - } - } - Ok(()) -} - -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { - let immediate = sub_match.value_of("stop-mode") == Some("immediate"); - - let pageserver = PageServerNode::from_env(env); - - // Stop all compute nodes - let cplane = ComputeControlPlane::load(env.clone())?; - for (_k, node) in cplane.nodes { - if let Err(e) = node.stop(false) { - eprintln!("postgres stop failed: {}", e); - } - } - - if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); - } - - for node in env.safekeepers.iter() { - let safekeeper = SafekeeperNode::from_env(env, node); - if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e); - } - } - Ok(()) -} diff --git a/zenith_metrics/Cargo.toml b/zenith_metrics/Cargo.toml deleted file mode 100644 index 0c921ede0b..0000000000 --- a/zenith_metrics/Cargo.toml +++ /dev/null @@ -1,10 +0,0 @@ -[package] -name = "zenith_metrics" -version = "0.1.0" -edition = "2021" - -[dependencies] -prometheus = {version = "0.13", default_features=false} # removes protobuf 
dependency -libc = "0.2" -lazy_static = "1.4" -once_cell = "1.8.0" diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml deleted file mode 100644 index b22fcbf748..0000000000 --- a/zenith_utils/Cargo.toml +++ /dev/null @@ -1,39 +0,0 @@ -[package] -name = "zenith_utils" -version = "0.1.0" -edition = "2021" - -[dependencies] -anyhow = "1.0" -bincode = "1.3" -bytes = "1.0.1" -hyper = { version = "0.14.7", features = ["full"] } -lazy_static = "1.4.0" -pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } -routerify = "3" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1" -thiserror = "1.0" -tokio = { version = "1.11", features = ["macros"]} -tracing = "0.1" -tracing-subscriber = { version = "0.3", features = ["env-filter"] } -nix = "0.23.0" -signal-hook = "0.3.10" -rand = "0.8.3" -jsonwebtoken = "7" -hex = { version = "0.4.3", features = ["serde"] } -rustls = "0.19.1" -rustls-split = "0.2.1" -git-version = "0.3.5" - -zenith_metrics = { path = "../zenith_metrics" } -workspace_hack = { path = "../workspace_hack" } - -[dev-dependencies] -byteorder = "1.4.3" -bytes = "1.0.1" -hex-literal = "0.3" -tempfile = "3.2" -webpki = "0.21" diff --git a/zenith_utils/build.rs b/zenith_utils/build.rs deleted file mode 100644 index ee3346ae66..0000000000 --- a/zenith_utils/build.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn main() { - println!("cargo:rerun-if-env-changed=GIT_VERSION"); -} diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs deleted file mode 100644 index 274dd13bee..0000000000 --- a/zenith_utils/src/auth.rs +++ /dev/null @@ -1,127 +0,0 @@ -// For details about authentication see docs/authentication.md -// TODO there are two issues for our use case in jsonwebtoken library which will be resolved in next release -// The first one is that there is no way to disable expiration claim, but it can be excluded from validation, so use this as a workaround for now. -// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/190 -// The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now. 
-// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 - -use hex::{self, FromHex}; -use serde::de::Error; -use serde::{self, Deserializer, Serializer}; -use std::fs; -use std::path::Path; - -use anyhow::{bail, Result}; -use jsonwebtoken::{ - decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, -}; -use serde::{Deserialize, Serialize}; - -use crate::zid::ZTenantId; - -const JWT_ALGORITHM: Algorithm = Algorithm::RS256; - -#[derive(Debug, Serialize, Deserialize, Clone)] -#[serde(rename_all = "lowercase")] -pub enum Scope { - Tenant, - PageServerApi, -} - -pub fn to_hex_option(value: &Option, serializer: S) -> Result -where - S: Serializer, -{ - match value { - Some(tid) => hex::serialize(tid, serializer), - None => Option::serialize(value, serializer), - } -} - -fn from_hex_option<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let opt: Option = Option::deserialize(deserializer)?; - match opt { - Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)), - None => Ok(None), - } -} - -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct Claims { - // this custom serialize/deserialize_with is needed because Option is not transparent to serde - // so clearest option is serde(with = "hex") but it is not working, for details see https://github.com/serde-rs/serde/issues/1301 - #[serde( - default, - skip_serializing_if = "Option::is_none", - serialize_with = "to_hex_option", - deserialize_with = "from_hex_option" - )] - pub tenant_id: Option, - pub scope: Scope, -} - -impl Claims { - pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } - } -} - -pub fn check_permission(claims: &Claims, tenantid: Option) -> Result<()> { - match (&claims.scope, tenantid) { - (Scope::Tenant, None) => { - bail!("Attempt to access management api with tenant scope. Permission denied") - } - (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { - bail!("Tenant id mismatch. Permission denied") - } - Ok(()) - } - (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope - (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - } -} - -pub struct JwtAuth { - decoding_key: DecodingKey<'static>, - validation: Validation, -} - -impl JwtAuth { - pub fn new(decoding_key: DecodingKey<'_>) -> Self { - Self { - decoding_key: decoding_key.into_static(), - validation: Validation { - algorithms: vec![JWT_ALGORITHM], - validate_exp: false, - ..Default::default() - }, - } - } - - pub fn from_key_path(key_path: &Path) -> Result { - let public_key = fs::read(key_path)?; - Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?)) - } - - pub fn decode(&self, token: &str) -> Result> { - Ok(decode(token, &self.decoding_key, &self.validation)?) - } -} - -impl std::fmt::Debug for JwtAuth { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("JwtAuth") - .field("validation", &self.validation) - .finish() - } -} - -// this function is used only for testing purposes in CLI e g generate tokens during init -pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { - let key = EncodingKey::from_rsa_pem(key_data)?; - Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?) 
-} diff --git a/zenith_utils/src/connstring.rs b/zenith_utils/src/connstring.rs deleted file mode 100644 index cda8eeac86..0000000000 --- a/zenith_utils/src/connstring.rs +++ /dev/null @@ -1,52 +0,0 @@ -use postgres::Config; - -pub fn connection_host_port(config: &Config) -> (String, u16) { - assert_eq!( - config.get_hosts().len(), - 1, - "only one pair of host and port is supported in connection string" - ); - assert_eq!( - config.get_ports().len(), - 1, - "only one pair of host and port is supported in connection string" - ); - let host = match &config.get_hosts()[0] { - postgres::config::Host::Tcp(host) => host.as_ref(), - postgres::config::Host::Unix(host) => host.to_str().unwrap(), - }; - (host.to_owned(), config.get_ports()[0]) -} - -pub fn connection_address(config: &Config) -> String { - let (host, port) = connection_host_port(config); - format!("{}:{}", host, port) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_connection_host_port() { - let config: Config = "postgresql://no_user@localhost:64000/no_db" - .parse() - .unwrap(); - assert_eq!( - connection_host_port(&config), - ("localhost".to_owned(), 64000) - ); - } - - #[test] - #[should_panic(expected = "only one pair of host and port is supported in connection string")] - fn test_connection_host_port_multiple_ports() { - let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db" - .parse() - .unwrap(); - assert_eq!( - connection_host_port(&config), - ("localhost".to_owned(), 64000) - ); - } -} diff --git a/zenith_utils/src/http/request.rs b/zenith_utils/src/http/request.rs deleted file mode 100644 index 3bc8993c26..0000000000 --- a/zenith_utils/src/http/request.rs +++ /dev/null @@ -1,33 +0,0 @@ -use std::str::FromStr; - -use super::error::ApiError; -use hyper::{Body, Request}; -use routerify::ext::RequestExt; - -pub fn get_request_param<'a>( - request: &'a Request, - param_name: &str, -) -> Result<&'a str, ApiError> { - match request.param(param_name) { - Some(arg) => Ok(arg), - None => { - return Err(ApiError::BadRequest(format!( - "no {} specified in path param", - param_name - ))) - } - } -} - -pub fn parse_request_param( - request: &Request, - param_name: &str, -) -> Result { - match get_request_param(request, param_name)?.parse() { - Ok(v) => Ok(v), - Err(_) => Err(ApiError::BadRequest(format!( - "failed to parse {}", - param_name - ))), - } -} diff --git a/zenith_utils/src/lib.rs b/zenith_utils/src/lib.rs deleted file mode 100644 index 7d8ef63b1c..0000000000 --- a/zenith_utils/src/lib.rs +++ /dev/null @@ -1,84 +0,0 @@ -//! zenith_utils is intended to be a place to put code that is shared -//! between other crates in this repository. - -#![allow(clippy::manual_range_contains)] - -/// `Lsn` type implements common tasks on Log Sequence Numbers -pub mod lsn; -/// SeqWait allows waiting for a future sequence number to arrive -pub mod seqwait; - -/// append only ordered map implemented with a Vec -pub mod vec_map; - -// Async version of SeqWait. Currently unused. -// pub mod seqwait_async; - -pub mod bin_ser; -pub mod postgres_backend; -pub mod pq_proto; - -// dealing with connstring parsing and handy access to it's parts -pub mod connstring; - -// helper functions for creating and fsyncing directories/trees -pub mod crashsafe_dir; - -// common authentication routines -pub mod auth; - -// utility functions and helper traits for unified unique id generation/serialization etc. 
-pub mod zid; -// http endpoint utils -pub mod http; - -// socket splitting utils -pub mod sock_split; - -// common log initialisation routine -pub mod logging; - -// Misc -pub mod accum; -pub mod shutdown; - -// Tools for calling certain async methods in sync contexts -pub mod sync; - -// Utility for binding TcpListeners with proper socket options. -pub mod tcp_listener; - -// Utility for putting a raw file descriptor into non-blocking mode -pub mod nonblock; - -// Default signal handling -pub mod signals; - -// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages -// -// we have several cases: -// * building locally from git repo -// * building in CI from git repo -// * building in docker (either in CI or locally) -// -// One thing to note is that .git is not available in docker (and it is bad to include it there). -// So everything becides docker build is covered by git_version crate. -// For docker use environment variable to pass git version, which is then retrieved by buildscript (build.rs). -// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. -// Git version received from environment variable used as a fallback in git_version invokation. -// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. -// So the build script will be run only when GIT_VERSION envvar has changed. -// -// Why not to use buildscript to get git commit sha directly without procmacro from different crate? -// Caching and workspaces complicates that. In case zenith_utils is not -// recompiled due to caching then version may become outdated. -// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro, -// so if we changed the index state git_version will pick that up and rerun the macro. -// -// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`. -use git_version::git_version; -pub const GIT_VERSION: &str = git_version!( - prefix = "git:", - fallback = concat!("git-env:", env!("GIT_VERSION")), - args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha -); diff --git a/zenith_utils/src/logging.rs b/zenith_utils/src/logging.rs deleted file mode 100644 index 1576a54c8e..0000000000 --- a/zenith_utils/src/logging.rs +++ /dev/null @@ -1,42 +0,0 @@ -use std::{ - fs::{File, OpenOptions}, - path::Path, -}; - -use anyhow::{Context, Result}; - -pub fn init(log_filename: impl AsRef, daemonize: bool) -> Result { - // Don't open the same file for output multiple times; - // the different fds could overwrite each other's output. - let log_file = OpenOptions::new() - .create(true) - .append(true) - .open(&log_filename) - .with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?; - - let default_filter_str = "info"; - - // We fall back to printing all spans at info-level or above if - // the RUST_LOG environment variable is not set. - let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str)); - - let base_logger = tracing_subscriber::fmt() - .with_env_filter(env_filter) - .with_target(false) // don't include event targets - .with_ansi(false); // don't use colors in log file; - - // we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it - // if we do not use daemonization (e.g. 
in docker) it is better to log to stdout directly - // for example to be in line with docker log command which expects logs comimg from stdout - if daemonize { - let x = log_file.try_clone().unwrap(); - base_logger - .with_writer(move || x.try_clone().unwrap()) - .init(); - } else { - base_logger.init(); - } - - Ok(log_file) -} diff --git a/zenith_utils/src/seqwait_async.rs b/zenith_utils/src/seqwait_async.rs deleted file mode 100644 index 09138e9dd4..0000000000 --- a/zenith_utils/src/seqwait_async.rs +++ /dev/null @@ -1,224 +0,0 @@ -/// -/// Async version of 'seqwait.rs' -/// -/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs. -/// - -#![warn(missing_docs)] - -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::mem; -use std::sync::Mutex; -use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; -use tokio::time::timeout; - -/// An error happened while waiting for a number -#[derive(Debug, PartialEq, thiserror::Error)] -#[error("SeqWaitError")] -pub enum SeqWaitError { - /// The wait timeout was reached - Timeout, - /// [`SeqWait::shutdown`] was called - Shutdown, -} - -/// Internal components of a `SeqWait` -struct SeqWaitInt -where - T: Ord, -{ - waiters: BTreeMap, Receiver<()>)>, - current: T, - shutdown: bool, -} - -/// A tool for waiting on a sequence number -/// -/// This provides a way to await the arrival of a number. -/// As soon as the number arrives by another caller calling -/// [`advance`], then the waiter will be woken up. -/// -/// This implementation takes a blocking Mutex on both [`wait_for`] -/// and [`advance`], meaning there may be unexpected executor blocking -/// due to thread scheduling unfairness. There are probably better -/// implementations, but we can probably live with this for now. -/// -/// [`wait_for`]: SeqWait::wait_for -/// [`advance`]: SeqWait::advance -/// -pub struct SeqWait -where - T: Ord, -{ - internal: Mutex>, -} - -impl SeqWait -where - T: Ord + Debug + Copy, -{ - /// Create a new `SeqWait`, initialized to a particular number - pub fn new(starting_num: T) -> Self { - let internal = SeqWaitInt { - waiters: BTreeMap::new(), - current: starting_num, - shutdown: false, - }; - SeqWait { - internal: Mutex::new(internal), - } - } - - /// Shut down a `SeqWait`, causing all waiters (present and - /// future) to return an error. - pub fn shutdown(&self) { - let waiters = { - // Prevent new waiters; wake all those that exist. - // Wake everyone with an error. - let mut internal = self.internal.lock().unwrap(); - - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) - - // Drop the lock as we exit this scope. - }; - - // When we drop the waiters list, each Receiver will - // be woken with an error. - // This drop doesn't need to be explicit; it's done - // here to make it easier to read the code and understand - // the order of events. - drop(waiters); - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> { - let mut rx = { - let mut internal = self.internal.lock().unwrap(); - if internal.current >= num { - return Ok(()); - } - if internal.shutdown { - return Err(SeqWaitError::Shutdown); - } - - // If we already have a channel for waiting on this number, reuse it. 
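            // Cloning the stored Receiver gives this waiter its own handle on
            // the same watch channel, so a single advance() past `num` wakes
            // every task parked here.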
- if let Some((_, rx)) = internal.waiters.get_mut(&num) { - // an Err from changed() means the sender was dropped. - rx.clone() - } else { - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.insert(num, (tx, rx.clone())); - rx - } - // Drop the lock as we exit this scope. - }; - rx.changed().await.map_err(|_| SeqWaitError::Shutdown) - } - - /// Wait for a number to arrive - /// - /// This call won't complete until someone has called `advance` - /// with a number greater than or equal to the one we're waiting for. - /// - /// If that hasn't happened after the specified timeout duration, - /// [`SeqWaitError::Timeout`] will be returned. - pub async fn wait_for_timeout( - &self, - num: T, - timeout_duration: Duration, - ) -> Result<(), SeqWaitError> { - timeout(timeout_duration, self.wait_for(num)) - .await - .unwrap_or(Err(SeqWaitError::Timeout)) - } - - /// Announce a new number has arrived - /// - /// All waiters at this value or below will be woken. - /// - /// `advance` will panic if you send it a lower number than - /// a previous call. - pub fn advance(&self, num: T) { - let wake_these = { - let mut internal = self.internal.lock().unwrap(); - - if internal.current > num { - panic!( - "tried to advance backwards, from {:?} to {:?}", - internal.current, num - ); - } - internal.current = num; - - // split_off will give me all the high-numbered waiters, - // so split and then swap. Everything at or above `num` - // stays. - let mut split = internal.waiters.split_off(&num); - std::mem::swap(&mut split, &mut internal.waiters); - - // `split_at` didn't get the value at `num`; if it's - // there take that too. - if let Some(sleeper) = internal.waiters.remove(&num) { - split.insert(num, sleeper); - } - - split - }; - - for (_wake_num, (tx, _rx)) in wake_these { - // This can fail if there are no receivers. - // We don't care; discard the error. - let _ = tx.send(()); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use tokio::time::{sleep, Duration}; - - #[tokio::test] - async fn seqwait() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - let seq3 = Arc::clone(&seq); - tokio::spawn(async move { - seq2.wait_for(42).await.expect("wait_for 42"); - seq2.advance(100); - seq2.wait_for(999).await.expect_err("no 999"); - }); - tokio::spawn(async move { - seq3.wait_for(42).await.expect("wait_for 42"); - seq3.wait_for(0).await.expect("wait_for 0"); - }); - sleep(Duration::from_secs(1)).await; - seq.advance(99); - seq.wait_for(100).await.expect("wait_for 100"); - seq.shutdown(); - } - - #[tokio::test] - async fn seqwait_timeout() { - let seq = Arc::new(SeqWait::new(0)); - let seq2 = Arc::clone(&seq); - tokio::spawn(async move { - let timeout = Duration::from_millis(1); - let res = seq2.wait_for_timeout(42, timeout).await; - assert_eq!(res, Err(SeqWaitError::Timeout)); - }); - sleep(Duration::from_secs(1)).await; - // This will attempt to wake, but nothing will happen - // because the waiter already dropped its Receiver. - seq.advance(99); - } -} diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs deleted file mode 100644 index 2e93ab596c..0000000000 --- a/zenith_utils/src/zid.rs +++ /dev/null @@ -1,275 +0,0 @@ -use std::{fmt, str::FromStr}; - -use hex::FromHex; -use rand::Rng; -use serde::{Deserialize, Serialize}; - -// Zenith ID is a 128-bit random ID. -// Used to represent various identifiers. Provides handy utility methods and impls. 
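// ZId itself is kept private to this module; the public ZTimelineId and
// ZTenantId wrappers further down get their shared impls from zid_newtype!.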
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -struct ZId([u8; 16]); - -impl ZId { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { - let mut arr = [0u8; 16]; - buf.copy_to_slice(&mut arr); - ZId::from(arr) - } - - pub fn as_arr(&self) -> [u8; 16] { - self.0 - } - - pub fn generate() -> Self { - let mut tli_buf = [0u8; 16]; - rand::thread_rng().fill(&mut tli_buf); - ZId::from(tli_buf) - } -} - -impl FromStr for ZId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - Self::from_hex(s) - } -} - -// this is needed for pretty serialization and deserialization of ZId's using serde integration with hex crate -impl FromHex for ZId { - type Error = hex::FromHexError; - - fn from_hex>(hex: T) -> Result { - let mut buf: [u8; 16] = [0u8; 16]; - hex::decode_to_slice(hex, &mut buf)?; - Ok(ZId(buf)) - } -} - -impl AsRef<[u8]> for ZId { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl From<[u8; 16]> for ZId { - fn from(b: [u8; 16]) -> Self { - ZId(b) - } -} - -impl fmt::Display for ZId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) - } -} - -impl fmt::Debug for ZId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) - } -} - -macro_rules! zid_newtype { - ($t:ident) => { - impl $t { - pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> $t { - $t(ZId::get_from_buf(buf)) - } - - pub fn as_arr(&self) -> [u8; 16] { - self.0.as_arr() - } - - pub fn generate() -> Self { - $t(ZId::generate()) - } - - pub const fn from_array(b: [u8; 16]) -> Self { - $t(ZId(b)) - } - } - - impl FromStr for $t { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result<$t, Self::Err> { - let value = ZId::from_str(s)?; - Ok($t(value)) - } - } - - impl From<[u8; 16]> for $t { - fn from(b: [u8; 16]) -> Self { - $t(ZId::from(b)) - } - } - - impl FromHex for $t { - type Error = hex::FromHexError; - - fn from_hex>(hex: T) -> Result { - Ok($t(ZId::from_hex(hex)?)) - } - } - - impl AsRef<[u8]> for $t { - fn as_ref(&self) -> &[u8] { - &self.0 .0 - } - } - - impl fmt::Display for $t { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } - } - - impl fmt::Debug for $t { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } - } - }; -} - -/// Zenith timeline IDs are different from PostgreSQL timeline -/// IDs. They serve a similar purpose though: they differentiate -/// between different "histories" of the same cluster. However, -/// PostgreSQL timeline IDs are a bit cumbersome, because they are only -/// 32-bits wide, and they must be in ascending order in any given -/// timeline history. Those limitations mean that we cannot generate a -/// new PostgreSQL timeline ID by just generating a random number. And -/// that in turn is problematic for the "pull/push" workflow, where you -/// have a local copy of a zenith repository, and you periodically sync -/// the local changes with a remote server. When you work "detached" -/// from the remote server, you cannot create a PostgreSQL timeline ID -/// that's guaranteed to be different from all existing timelines in -/// the remote server. For example, if two people are having a clone of -/// the repository on their laptops, and they both create a new branch -/// with different name. What timeline ID would they assign to their -/// branches? If they pick the same one, and later try to push the -/// branches to the same remote server, they will get mixed up. 
-/// -/// To avoid those issues, Zenith has its own concept of timelines that -/// is separate from PostgreSQL timelines, and doesn't have those -/// limitations. A zenith timeline is identified by a 128-bit ID, which -/// is usually printed out as a hex string. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] -pub struct ZTimelineId(ZId); - -zid_newtype!(ZTimelineId); - -// Zenith Tenant Id represents identifiar of a particular tenant. -// Is used for distinguishing requests and data belonging to different users. -#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] -pub struct ZTenantId(ZId); - -zid_newtype!(ZTenantId); - -/// Serde routines for Option (de)serialization, using `T:Display` representations for inner values. -/// Useful for Option and Option to get their hex representations into serialized string and deserialize them back. -pub mod opt_display_serde { - use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; - use std::{fmt::Display, str::FromStr}; - - pub fn serialize(id: &Option, ser: S) -> Result - where - S: Serializer, - Id: Display, - { - id.as_ref().map(ToString::to_string).serialize(ser) - } - - pub fn deserialize<'de, D, Id>(des: D) -> Result, D::Error> - where - D: Deserializer<'de>, - Id: FromStr, - ::Err: Display, - { - Ok(if let Some(s) = Option::::deserialize(des)? { - Some(Id::from_str(&s).map_err(de::Error::custom)?) - } else { - None - }) - } -} - -// A pair uniquely identifying Zenith instance. -#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] -pub struct ZTenantTimelineId { - pub tenant_id: ZTenantId, - pub timeline_id: ZTimelineId, -} - -impl ZTenantTimelineId { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - ZTenantTimelineId { - tenant_id, - timeline_id, - } - } - - pub fn generate() -> Self { - Self::new(ZTenantId::generate(), ZTimelineId::generate()) - } -} - -impl fmt::Display for ZTenantTimelineId { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}-{}", self.tenant_id, self.timeline_id) - } -} - -#[cfg(test)] -mod tests { - use std::fmt::Display; - - use super::*; - use hex::FromHexError; - use hex_literal::hex; - - #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] - struct TestStruct + Display> { - #[serde(with = "opt_display_serde")] - field: Option, - } - - #[test] - fn test_hex_serializations_tenant_id() { - let original_struct = TestStruct { - field: Some(ZTenantId::from_array(hex!( - "11223344556677881122334455667788" - ))), - }; - - let serialized_string = serde_json::to_string(&original_struct).unwrap(); - assert_eq!( - serialized_string, - r#"{"field":"11223344556677881122334455667788"}"# - ); - - let deserialized_struct: TestStruct = - serde_json::from_str(&serialized_string).unwrap(); - assert_eq!(original_struct, deserialized_struct); - } - - #[test] - fn test_hex_serializations_timeline_id() { - let original_struct = TestStruct { - field: Some(ZTimelineId::from_array(hex!( - "AA223344556677881122334455667788" - ))), - }; - - let serialized_string = serde_json::to_string(&original_struct).unwrap(); - assert_eq!( - serialized_string, - r#"{"field":"aa223344556677881122334455667788"}"# - ); - - let deserialized_struct: TestStruct = - serde_json::from_str(&serialized_string).unwrap(); - assert_eq!(original_struct, deserialized_struct); - } -}
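A minimal usage sketch of the zid API deleted above (assuming the pre-removal crate layout, where zid.rs was exposed as `zenith_utils::zid`): the ID types round-trip through their 32-character hex form via the `Display` and `FromStr` impls shown in the diff.

    use std::str::FromStr;
    use zenith_utils::zid::ZTenantId;

    fn main() {
        // generate() fills the 16-byte ID with random bytes
        let tenant_id = ZTenantId::generate();
        // Display renders the ID as 32 lowercase hex characters
        let hex = tenant_id.to_string();
        // FromStr (via FromHex) parses it back; the round trip is lossless
        assert_eq!(ZTenantId::from_str(&hex).unwrap(), tenant_id);
    }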