diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000000..76a2ff549e --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,13 @@ +# The binaries are really slow, if you compile them in 'dev' mode with the defaults. +# Enable some optimizations even in 'dev' mode, to make tests faster. The basic +# optimizations enabled by "opt-level=1" don't affect debuggability too much. +# +# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/ +# +[profile.dev.package."*"] +# Set the default for dependencies in Development mode. +opt-level = 3 + +[profile.dev] +# Turn on a small amount of optimization in Development mode. +opt-level = 1 diff --git a/.circleci/ansible/systemd/safekeeper.service b/.circleci/ansible/systemd/safekeeper.service deleted file mode 100644 index 55088db859..0000000000 --- a/.circleci/ansible/systemd/safekeeper.service +++ /dev/null @@ -1,18 +0,0 @@ -[Unit] -Description=Zenith safekeeper -After=network.target auditd.service - -[Service] -Type=simple -User=safekeeper -Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib -ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }} -ExecReload=/bin/kill -HUP $MAINPID -KillMode=mixed -KillSignal=SIGINT -Restart=on-failure -TimeoutSec=10 -LimitNOFILE=30000000 - -[Install] -WantedBy=multi-user.target diff --git a/.circleci/config.yml b/.circleci/config.yml index 3377b907cb..00a51eb906 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,21 +5,12 @@ executors: resource_class: xlarge docker: # NB: when changed, do not forget to update rust image tag in all Dockerfiles - - image: zimg/rust:1.58 + - image: neondatabase/rust:1.58 neon-executor: docker: - - image: 
zimg/rust:1.58 + - image: neondatabase/rust:1.58 jobs: - check-codestyle-rust: - executor: neon-xlarge-executor - steps: - - checkout - - run: - name: rustfmt - when: always - command: cargo fmt --all -- --check - # A job to build postgres build-postgres: executor: neon-xlarge-executor @@ -46,7 +37,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} # Build postgres if the restore_cache didn't find a build. # `make` can't figure out whether the cache is valid, since @@ -63,7 +54,7 @@ jobs: - save_cache: name: Save postgres cache - key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} paths: - tmp_install @@ -94,7 +85,7 @@ jobs: name: Restore postgres cache keys: # Restore ONLY if the rev key matches exactly - - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} + - v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }} - restore_cache: name: Restore rust cache @@ -102,31 +93,29 @@ jobs: # Require an exact match. While an out of date cache might speed up the build, # there's no way to clean out old packages, so the cache grows every time something # changes. 
- - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + - v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} # Build the rust code, including test binaries - run: name: Rust build << parameters.build_type >> command: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() CARGO_FLAGS="--release --features profiling" fi export CARGO_INCREMENTAL=0 export CACHEPOT_BUCKET=zenith-rust-cachepot - export RUSTC_WRAPPER=cachepot + export RUSTC_WRAPPER="" export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" - "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests cachepot -s - save_cache: name: Save rust cache - key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} + key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }} paths: - ~/.cargo/registry - ~/.cargo/git @@ -137,35 +126,22 @@ jobs: name: cargo test command: | if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) CARGO_FLAGS= elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() CARGO_FLAGS=--release fi - "${cov_prefix[@]}" cargo test $CARGO_FLAGS + cargo test $CARGO_FLAGS # Install the rust binaries, for use by test jobs - run: name: Install rust binaries command: | - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - binaries=$( - "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + cargo metadata --format-version=1 --no-deps | jq -r 
'.packages[].targets[] | select(.kind | index("bin")) | .name' ) - test_exe_paths=$( - "${cov_prefix[@]}" cargo test --message-format=json --no-run | - jq -r '.executable | select(. != null)' - ) - mkdir -p /tmp/zenith/bin mkdir -p /tmp/zenith/test_bin mkdir -p /tmp/zenith/etc @@ -175,34 +151,15 @@ jobs: SRC=target/$BUILD_TYPE/$bin DST=/tmp/zenith/bin/$bin cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list done - # Install test executables (for code coverage) - if [[ $BUILD_TYPE == "debug" ]]; then - for bin in $test_exe_paths; do - SRC=$bin - DST=/tmp/zenith/test_bin/$(basename $bin) - cp $SRC $DST - echo $DST >> /tmp/zenith/etc/binaries.list - done - fi - # Install the postgres binaries, for use by test jobs - run: name: Install postgres binaries command: | cp -a tmp_install /tmp/zenith/pg_install - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - - # Save the rust binaries and coverage data for other jobs in this workflow. + # Save rust binaries for other jobs in the workflow - persist_to_workspace: root: /tmp/zenith paths: @@ -295,7 +252,7 @@ jobs: # no_output_timeout, specified here. no_output_timeout: 10m environment: - - ZENITH_BIN: /tmp/zenith/bin + - NEON_BIN: /tmp/zenith/bin - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install - TEST_OUTPUT: /tmp/test_output # this variable will be embedded in perf test report @@ -323,12 +280,6 @@ jobs: export GITHUB_SHA=$CIRCLE_SHA1 - if [[ $BUILD_TYPE == "debug" ]]; then - cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run) - elif [[ $BUILD_TYPE == "release" ]]; then - cov_prefix=() - fi - # Run the tests. 
# # The junit.xml file allows CircleCI to display more fine-grained test information @@ -339,7 +290,7 @@ jobs: # -n4 uses four processes to run tests via pytest-xdist # -s is not used to prevent pytest from capturing output, because tests are running # in parallel and logs are mixed between different tests - "${cov_prefix[@]}" ./scripts/pytest \ + ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ --verbose \ @@ -368,383 +319,15 @@ jobs: # The store_test_results step tells CircleCI where to find the junit.xml file. - store_test_results: path: /tmp/test_output - - run: - name: Merge coverage data - command: | - # This will speed up workspace uploads - if [[ $BUILD_TYPE == "debug" ]]; then - scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge - fi - # Save coverage data (if any) + # Save data (if any) - persist_to_workspace: root: /tmp/zenith paths: - "*" - coverage-report: - executor: neon-xlarge-executor - steps: - - attach_workspace: - at: /tmp/zenith - - checkout - - restore_cache: - name: Restore rust cache - keys: - # Require an exact match. While an out of date cache might speed up the build, - # there's no way to clean out old packages, so the cache grows every time something - # changes. 
- - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }} - - run: - name: Build coverage report - command: | - COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 - - scripts/coverage \ - --dir=/tmp/zenith/coverage report \ - --input-objects=/tmp/zenith/etc/binaries.list \ - --commit-url=$COMMIT_URL \ - --format=github - - run: - name: Upload coverage report - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1 - COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1 - - scripts/git-upload \ - --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \ - --message="Add code coverage for $COMMIT_URL" \ - copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE - - # Add link to the coverage report to the commit - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"success\", - \"context\": \"zenith-coverage\", - \"description\": \"Coverage report is ready\", - \"target_url\": \"$REPORT_URL\" - }" - - # Build neondatabase/neon:latest image and push it to Docker hub - docker-image: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build \ - --pull \ - --build-arg GIT_VERSION=${CIRCLE_SHA1} \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest . 
- docker push neondatabase/neon:${DOCKER_TAG} - docker push neondatabase/neon:latest - - # Build neondatabase/compute-node:latest image and push it to Docker hub - docker-image-compute: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push compute-tools Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - docker build \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:local \ - --tag neondatabase/compute-tools:latest \ - -f Dockerfile.compute-tools . - # Only push :latest image - docker push neondatabase/compute-tools:latest - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push compute-node Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ - --tag neondatabase/compute-node:latest vendor/postgres \ - --build-arg COMPUTE_TOOLS_TAG=local - docker push neondatabase/compute-node:${DOCKER_TAG} - docker push neondatabase/compute-node:latest - - # Build production neondatabase/neon:release image and push it to Docker hub - docker-image-release: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build \ - --pull \ - --build-arg GIT_VERSION=${CIRCLE_SHA1} \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg 
AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release . - docker push neondatabase/neon:${DOCKER_TAG} - docker push neondatabase/neon:release - - # Build production neondatabase/compute-node:release image and push it to Docker hub - docker-image-compute-release: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push compute-tools Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - docker build \ - --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ - --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release \ - --tag neondatabase/compute-tools:local \ - -f Dockerfile.compute-tools . - # Only push :release image - docker push neondatabase/compute-tools:release - - run: - name: Init postgres submodule - command: git submodule update --init --depth 1 - - run: - name: Build and push compute-node Docker image - command: | - echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin - DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ - --tag neondatabase/compute-node:release vendor/postgres \ - --build-arg COMPUTE_TOOLS_TAG=local - docker push neondatabase/compute-node:${DOCKER_TAG} - docker push neondatabase/compute-node:release - - deploy-staging: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm 
-f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i staging.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-staging-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - - deploy-neon-stress: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i neon-stress.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-neon-stress-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s 
https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG=$(git log --oneline|wc -l) - helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - - deploy-release: - docker: - - image: cimg/python:3.10 - steps: - - checkout - - setup_remote_docker - - run: - name: Setup ansible - command: | - pip install --progress-bar off --user ansible boto3 - - run: - name: Redeploy - command: | - cd "$(pwd)/.circleci/ansible" - - RELEASE=true ./get_binaries.sh - - echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key - echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - - ansible-playbook deploy.yaml -i production.hosts - rm -f neon_install.tar.gz .neon_current_version - - deploy-release-proxy: - docker: - - image: cimg/base:2021.04 - environment: - KUBECONFIG: .kubeconfig - steps: - - checkout - - run: - name: Store kubeconfig file - command: | - echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - run: - name: Setup helm v3 - command: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - run: - name: Re-deploy proxy - command: | - DOCKER_TAG="release-$(git log --oneline|wc -l)" - helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait - helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f 
.circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait - - # Trigger a new remote CI job - remote-ci-trigger: - docker: - - image: cimg/base:2021.04 - parameters: - remote_repo: - type: string - environment: - REMOTE_REPO: << parameters.remote_repo >> - steps: - - run: - name: Set PR's status to pending - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - run: - name: Request a remote CI test - command: | - LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "$CI_ACCESS_TOKEN" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$CIRCLE_SHA1\", - \"remote_repo\": \"$LOCAL_REPO\" - } - }" - workflows: build_and_test: jobs: - - check-codestyle-rust - check-codestyle-python - build-postgres: name: build-postgres-<< matrix.build_type >> @@ -760,7 +343,6 @@ workflows: - build-postgres-<< matrix.build_type >> - run-pytest: name: pg_regress-tests-<< matrix.build_type >> - context: PERF_TEST_RESULT_CONNSTR matrix: parameters: build_type: ["debug", "release"] @@ -785,120 +367,3 @@ workflows: save_perf_report: true requires: - build-neon-release - - coverage-report: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - requires: - # TODO: consider adding more - - other-tests-debug - - docker-image: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - 
pg_regress-tests-release - - other-tests-release - - docker-image-compute: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - main - requires: - - pg_regress-tests-release - - other-tests-release - - deploy-staging: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - deploy-staging-proxy: - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - - deploy-neon-stress: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - deploy-neon-stress-proxy: - # deploy only for commits to main - filters: - branches: - only: - - main - requires: - - docker-image - - - docker-image-release: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - release - requires: - - pg_regress-tests-release - - other-tests-release - - docker-image-compute-release: - # Context gives an ability to login - context: Docker Hub - # Build image only for commits to main - filters: - branches: - only: - - release - requires: - - pg_regress-tests-release - - other-tests-release - - deploy-release: - # Context gives an ability to login - context: Docker Hub - # deploy only for commits to main - filters: - branches: - only: - - release - requires: - - docker-image-release - - deploy-release-proxy: - # deploy only for commits to main - filters: - branches: - only: - - release - requires: - - docker-image-release - - remote-ci-trigger: - # Context passes credentials for gh api - context: CI_ACCESS_TOKEN - remote_repo: "neondatabase/cloud" - requires: - # XXX: Successful build doesn't mean everything is OK, but - # the job to be triggered takes so much time to 
complete (~22 min) - # that it's better not to wait for the commented-out steps - - build-neon-release - # - pg_regress-tests-release - # - other-tests-release diff --git a/.dockerignore b/.dockerignore index 352336496f..0667d8870e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -9,8 +9,8 @@ tmp_install tmp_check_cli test_output .vscode -.zenith -integration_tests/.zenith +.neon +integration_tests/.neon .mypy_cache Dockerfile diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml new file mode 100644 index 0000000000..f220be2b12 --- /dev/null +++ b/.github/actions/run-python-test-set/action.yml @@ -0,0 +1,153 @@ +name: 'Run python test' +description: 'Runs a Neon python test set, performing all the required preparations before' + +inputs: + build_type: + description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".' + required: true + rust_toolchain: + description: 'Rust toolchain version to fetch the caches' + required: true + test_selection: + description: 'A python test suite to run' + required: true + extra_params: + description: 'Arbitrary parameters to pytest. 
For example "-s" to prevent capturing stdout/stderr' + required: false + default: '' + needs_postgres_source: + description: 'Set to true if the test suite requires postgres source checked out' + required: false + default: 'false' + run_in_parallel: + description: 'Whether to run tests in parallel' + required: false + default: 'true' + save_perf_report: + description: 'Whether to upload the performance report' + required: false + default: 'false' + +runs: + using: "composite" + steps: + - name: Get Neon artifact for restoration + uses: actions/download-artifact@v3 + with: + name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact + path: ./neon-artifact/ + + - name: Get Postgres artifact for restoration + uses: actions/download-artifact@v3 + with: + name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact + path: ./pg-artifact/ + + - name: Extract Neon artifact + shell: bash -ex {0} + run: | + mkdir -p /tmp/neon/ + tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + rm -rf ./neon-artifact/ + + - name: Extract Postgres artifact + shell: bash -ex {0} + run: | + mkdir -p /tmp/neon/tmp_install + tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install + rm -rf ./pg-artifact/ + + - name: Checkout + if: inputs.needs_postgres_source == 'true' + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -ex {0} + run: ./scripts/pysync + + - name: Run pytest + env: + NEON_BIN: /tmp/neon/bin + POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install + TEST_OUTPUT: /tmp/test_output + # this variable will be embedded in perf test report + # and is needed to distinguish different environments + PLATFORM: github-actions-selfhosted + shell: bash -ex {0} + run: | + PERF_REPORT_DIR="$(realpath 
test_runner/perf-report-local)" + rm -rf $PERF_REPORT_DIR + + TEST_SELECTION="test_runner/${{ inputs.test_selection }}" + EXTRA_PARAMS="${{ inputs.extra_params }}" + if [ -z "$TEST_SELECTION" ]; then + echo "test_selection must be set" + exit 1 + fi + if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then + EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + fi + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then + mkdir -p "$PERF_REPORT_DIR" + EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" + fi + fi + + if [[ "${{ inputs.build_type }}" == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) + elif [[ "${{ inputs.build_type }}" == "release" ]]; then + cov_prefix=() + fi + + # Run the tests. + # + # The junit.xml file allows CircleCI to display more fine-grained test information + # in its "Tests" tab in the results page. + # --verbose prints name of each test (helpful when there are + # multiple tests in one file) + # -rA prints summary in the end + # -n4 uses four processes to run tests via pytest-xdist + # -s is not used to prevent pytest from capturing output, because tests are running + # in parallel and logs are mixed between different tests + "${cov_prefix[@]}" ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --tb=short \ + --verbose \ + -m "not remote_cluster" \ + -rA $TEST_SELECTION $EXTRA_PARAMS + + if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then + if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then + export REPORT_FROM="$PERF_REPORT_DIR" + export REPORT_TO=local + scripts/generate_and_push_perf_report.sh + fi + fi + + - name: Delete all data but logs + shell: bash -ex {0} + if: always() + run: | + du -sh /tmp/test_output/* + find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! 
-name "*.metrics" -delete + du -sh /tmp/test_output/* + + - name: Upload python test logs + if: always() + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs + path: /tmp/test_output/ diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml new file mode 100644 index 0000000000..7ad04cf1fe --- /dev/null +++ b/.github/actions/save-coverage-data/action.yml @@ -0,0 +1,17 @@ +name: 'Merge and upload coverage data' +description: 'Compresses and uploads the coverage data as an artifact' + +runs: + using: "composite" + steps: + - name: Merge coverage data + shell: bash -ex {0} + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + + - name: Upload coverage data + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: coverage-data-artifact + path: /tmp/coverage/ diff --git a/.circleci/ansible/.gitignore b/.github/ansible/.gitignore similarity index 100% rename from .circleci/ansible/.gitignore rename to .github/ansible/.gitignore diff --git a/.circleci/ansible/ansible.cfg b/.github/ansible/ansible.cfg similarity index 50% rename from .circleci/ansible/ansible.cfg rename to .github/ansible/ansible.cfg index e3daf3abe3..5818a64455 100644 --- a/.circleci/ansible/ansible.cfg +++ b/.github/ansible/ansible.cfg @@ -6,5 +6,7 @@ timeout = 30 [ssh_connection] ssh_args = -F ./ansible.ssh.cfg -scp_if_ssh = True +# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127 +# and scp neither worked for me +transfer_method = piped pipelining = True diff --git a/.circleci/ansible/ansible.ssh.cfg b/.github/ansible/ansible.ssh.cfg similarity index 56% rename from .circleci/ansible/ansible.ssh.cfg rename to .github/ansible/ansible.ssh.cfg index 91f673718e..cd058b5427 100644 --- 
a/.circleci/ansible/ansible.ssh.cfg +++ b/.github/ansible/ansible.ssh.cfg @@ -1,3 +1,7 @@ +# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed +# (use pre 8.5 option name to cope with old ssh in CI) +PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com + Host tele.zenith.tech User admin Port 3023 diff --git a/.circleci/ansible/deploy.yaml b/.github/ansible/deploy.yaml similarity index 97% rename from .circleci/ansible/deploy.yaml rename to .github/ansible/deploy.yaml index a8154ba3b0..b47db6a9b5 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -57,7 +57,7 @@ args: creates: "/storage/pageserver/data/tenants" environment: - ZENITH_REPO_DIR: "/storage/pageserver/data" + NEON_REPO_DIR: "/storage/pageserver/data" LD_LIBRARY_PATH: "/usr/local/lib" become: true tags: @@ -131,7 +131,7 @@ args: creates: "/storage/safekeeper/data/safekeeper.id" environment: - ZENITH_REPO_DIR: "/storage/safekeeper/data" + NEON_REPO_DIR: "/storage/safekeeper/data" LD_LIBRARY_PATH: "/usr/local/lib" become: true tags: diff --git a/.circleci/ansible/get_binaries.sh b/.github/ansible/get_binaries.sh similarity index 95% rename from .circleci/ansible/get_binaries.sh rename to .github/ansible/get_binaries.sh index c613213a75..c9cbe91f34 100755 --- a/.circleci/ansible/get_binaries.sh +++ b/.github/ansible/get_binaries.sh @@ -6,7 +6,7 @@ RELEASE=${RELEASE:-false} # look at docker hub for latest tag for neon docker image if [ "${RELEASE}" = "true" ]; then - echo "search latest relase tag" + echo "search latest release tag" VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1) if [ -z "${VERSION}" ]; then echo "no any docker tags found, exiting..." 
@@ -31,7 +31,7 @@ echo "found ${VERSION}" rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version mkdir neon_install -# retrive binaries from docker image +# retrieve binaries from docker image echo "getting binaries from docker image" docker pull --quiet neondatabase/neon:${TAG} ID=$(docker create neondatabase/neon:${TAG}) diff --git a/.circleci/ansible/neon-stress.hosts b/.github/ansible/neon-stress.hosts similarity index 95% rename from .circleci/ansible/neon-stress.hosts rename to .github/ansible/neon-stress.hosts index 283ec0e8b3..750fd8106a 100644 --- a/.circleci/ansible/neon-stress.hosts +++ b/.github/ansible/neon-stress.hosts @@ -12,6 +12,7 @@ pageservers safekeepers [storage:vars] +env_name = neon-stress console_mgmt_base_url = http://neon-stress-console.local bucket_name = neon-storage-ireland bucket_region = eu-west-1 diff --git a/.circleci/ansible/production.hosts b/.github/ansible/production.hosts similarity index 89% rename from .circleci/ansible/production.hosts rename to .github/ansible/production.hosts index 6cefd724d8..d22ce0e37e 100644 --- a/.circleci/ansible/production.hosts +++ b/.github/ansible/production.hosts @@ -1,6 +1,7 @@ [pageservers] #zenith-1-ps-1 console_region_id=1 zenith-1-ps-2 console_region_id=1 +zenith-1-ps-3 console_region_id=1 [safekeepers] zenith-1-sk-1 console_region_id=1 @@ -12,8 +13,8 @@ pageservers safekeepers [storage:vars] +env_name = prod-1 console_mgmt_base_url = http://console-release.local bucket_name = zenith-storage-oregon bucket_region = us-west-2 etcd_endpoints = etcd-release.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh similarity index 100% rename from .circleci/ansible/scripts/init_pageserver.sh rename to .github/ansible/scripts/init_pageserver.sh diff --git a/.circleci/ansible/scripts/init_safekeeper.sh b/.github/ansible/scripts/init_safekeeper.sh similarity index 100% rename 
from .circleci/ansible/scripts/init_safekeeper.sh rename to .github/ansible/scripts/init_safekeeper.sh diff --git a/.circleci/ansible/staging.hosts b/.github/ansible/staging.hosts similarity index 86% rename from .circleci/ansible/staging.hosts rename to .github/ansible/staging.hosts index d99ffa6dac..35e77513df 100644 --- a/.circleci/ansible/staging.hosts +++ b/.github/ansible/staging.hosts @@ -1,9 +1,9 @@ [pageservers] #zenith-us-stage-ps-1 console_region_id=27 zenith-us-stage-ps-2 console_region_id=27 +zenith-us-stage-ps-3 console_region_id=27 [safekeepers] -zenith-us-stage-sk-1 console_region_id=27 zenith-us-stage-sk-4 console_region_id=27 zenith-us-stage-sk-5 console_region_id=27 zenith-us-stage-sk-6 console_region_id=27 @@ -13,8 +13,8 @@ pageservers safekeepers [storage:vars] +env_name = us-stage console_mgmt_base_url = http://console-staging.local bucket_name = zenith-staging-storage-us-east-1 bucket_region = us-east-1 etcd_endpoints = etcd-staging.local:2379 -safekeeper_enable_s3_offload = false diff --git a/.circleci/ansible/systemd/pageserver.service b/.github/ansible/systemd/pageserver.service similarity index 83% rename from .circleci/ansible/systemd/pageserver.service rename to .github/ansible/systemd/pageserver.service index 54a7b1ba0a..bb78054fa3 100644 --- a/.circleci/ansible/systemd/pageserver.service +++ b/.github/ansible/systemd/pageserver.service @@ -5,7 +5,7 @@ After=network.target auditd.service [Service] Type=simple User=pageserver -Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data ExecReload=/bin/kill -HUP $MAINPID KillMode=mixed diff --git a/.github/ansible/systemd/safekeeper.service 
b/.github/ansible/systemd/safekeeper.service new file mode 100644 index 0000000000..d5c6d00017 --- /dev/null +++ b/.github/ansible/systemd/safekeeper.service @@ -0,0 +1,18 @@ +[Unit] +Description=Zenith safekeeper +After=network.target auditd.service + +[Service] +Type=simple +User=safekeeper +Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib +ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}' +ExecReload=/bin/kill -HUP $MAINPID +KillMode=mixed +KillSignal=SIGINT +Restart=on-failure +TimeoutSec=10 +LimitNOFILE=30000000 + +[Install] +WantedBy=multi-user.target diff --git a/.circleci/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml similarity index 100% rename from .circleci/helm-values/neon-stress.proxy-scram.yaml rename to .github/helm-values/neon-stress.proxy-scram.yaml diff --git a/.circleci/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml similarity index 100% rename from .circleci/helm-values/neon-stress.proxy.yaml rename to .github/helm-values/neon-stress.proxy.yaml diff --git a/.circleci/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml similarity index 100% rename from .circleci/helm-values/production.proxy-scram.yaml rename to .github/helm-values/production.proxy-scram.yaml diff --git a/.circleci/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml similarity index 100% rename from .circleci/helm-values/production.proxy.yaml rename to .github/helm-values/production.proxy.yaml diff --git a/.circleci/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml similarity index 100% rename 
from .circleci/helm-values/staging.proxy-scram.yaml rename to .github/helm-values/staging.proxy-scram.yaml diff --git a/.circleci/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml similarity index 100% rename from .circleci/helm-values/staging.proxy.yaml rename to .github/helm-values/staging.proxy.yaml diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 72041c9d02..d08c3c50bd 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -19,18 +19,18 @@ jobs: bench: # this workflow runs on self hosteed runner # it's environment is quite different from usual guthub runner - # probably the most important difference is that it doesnt start from clean workspace each time + # probably the most important difference is that it doesn't start from clean workspace each time # e g if you install system packages they are not cleaned up since you install them directly in host machine # not a container or something # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners runs-on: [self-hosted, zenith-benchmarker] env: - POSTGRES_DISTRIB_DIR: "/usr/pgsql-13" + POSTGRES_DISTRIB_DIR: "/usr/pgsql-14" steps: - name: Checkout zenith repo - uses: actions/checkout@v2 + uses: actions/checkout@v3 # actions/setup-python@v2 is not working correctly on self-hosted runners # see https://github.com/actions/setup-python/issues/162 @@ -88,7 +88,7 @@ jobs: # Plus time needed to initialize the test databases. 
TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" - PLATFORM: "zenith-staging" + PLATFORM: "neon-staging" BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally run: | @@ -96,7 +96,7 @@ jobs: # since it might generate duplicates when calling ingest_perf_test_result.py rm -rf perf-report-staging mkdir -p perf-report-staging - ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging + ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600 - name: Submit result env: @@ -104,3 +104,12 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" run: | REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml new file mode 100644 index 0000000000..776c696f59 --- /dev/null +++ b/.github/workflows/build_and_test.yml @@ -0,0 +1,650 @@ +name: Test and Deploy + +on: + push: + branches: + - main + - release + pull_request: + +defaults: + run: + shell: bash -ex {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + build-postgres: + runs-on: [ self-hosted, Linux, k8s-runner ] + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + + env: + BUILD_TYPE: ${{ matrix.build_type }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Set pg revision for caching + id: pg_ver + run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres) + + - name: Cache postgres build + id: cache_pg + uses: actions/cache@v3 + with: + path: tmp_install/ + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Build postgres + if: steps.cache_pg.outputs.cache-hit != 'true' + run: mold -run make postgres -j$(nproc) + + # actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache + - name: Prepare postgres artifact + run: tar -C tmp_install/ -czf ./pg.tgz . 
+ - name: Upload postgres artifact + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: ./pg.tgz + + + build-neon: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-postgres ] + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + + env: + BUILD_TYPE: ${{ matrix.build_type }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Get postgres artifact for restoration + uses: actions/download-artifact@v3 + with: + name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact + path: ./postgres-artifact/ + - name: Extract postgres artifact + run: | + mkdir ./tmp_install/ + tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/ + rm -rf ./postgres-artifact/ + + # Don't include the ~/.cargo/registry/src directory. It contains just + # uncompressed versions of the crates in ~/.cargo/registry/cache + # directory, and it's faster to let 'cargo' to rebuild it from the + # compressed crates. 
+ - name: Cache cargo deps + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + # Fall back to the newest cache with the same prefix, if no cache for current Cargo.lock was found + restore-keys: | + v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}- + + - name: Run cargo build + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) + CARGO_FLAGS= + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + CARGO_FLAGS="--release --features profiling" + fi + + "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests + + - name: Run cargo test + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) + CARGO_FLAGS= + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + CARGO_FLAGS=--release + fi + + "${cov_prefix[@]}" cargo test $CARGO_FLAGS + + - name: Install rust binaries + run: | + if [[ $BUILD_TYPE == "debug" ]]; then + cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) + elif [[ $BUILD_TYPE == "release" ]]; then + cov_prefix=() + fi + + binaries=$( + "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps | + jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name' + ) + + test_exe_paths=$( + "${cov_prefix[@]}" cargo test --message-format=json --no-run | + jq -r '.executable | select(.
!= null)' + ) + + mkdir -p /tmp/neon/bin/ + mkdir -p /tmp/neon/test_bin/ + mkdir -p /tmp/neon/etc/ + + # Keep bloated coverage data files away from the rest of the artifact + mkdir -p /tmp/coverage/ + + # Install target binaries + for bin in $binaries; do + SRC=target/$BUILD_TYPE/$bin + DST=/tmp/neon/bin/$bin + cp "$SRC" "$DST" + done + + # Install test executables and write list of all binaries (for code coverage) + if [[ $BUILD_TYPE == "debug" ]]; then + for bin in $binaries; do + echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list + done + for bin in $test_exe_paths; do + SRC=$bin + DST=/tmp/neon/test_bin/$(basename $bin) + + # We don't need debug symbols for code coverage, so strip them out to make + # the artifact smaller. + strip "$SRC" -o "$DST" + echo "$DST" >> /tmp/coverage/binaries.list + done + fi + + - name: Prepare neon artifact + run: tar -C /tmp/neon/ -czf ./neon.tgz . + + - name: Upload neon binaries + uses: actions/upload-artifact@v3 + with: + retention-days: 7 + if-no-files-found: error + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + path: ./neon.tgz + + # XXX: keep this after the binaries.list is formed, so the coverage can properly work later + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + + + pg_regress-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest regress tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: batch_pg_regress + needs_postgres_source: true + + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: 
./.github/actions/save-coverage-data + + other-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest other tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: batch_others + + - name: Merge and upload coverage data + if: matrix.build_type == 'debug' + uses: ./.github/actions/save-coverage-data + + benchmarks: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + strategy: + fail-fast: false + matrix: + build_type: [ release ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 2 + + - name: Pytest benchmarks + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ matrix.build_type }} + rust_toolchain: ${{ matrix.rust_toolchain }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + # XXX: no coverage data handling here, since benchmarks are run on release builds, + # while coverage is currently collected for the debug ones + + coverage-report: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ other-tests, pg_regress-tests ] + strategy: + fail-fast: false + matrix: + build_type: [ debug ] + rust_toolchain: [ 1.58 ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Restore cargo deps cache + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ 
matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }} + + - name: Get Neon artifact for restoration + uses: actions/download-artifact@v3 + with: + name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact + path: ./neon-artifact/ + + - name: Extract Neon artifact + run: | + mkdir -p /tmp/neon/ + tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/ + rm -rf ./neon-artifact/ + + - name: Restore coverage data + uses: actions/download-artifact@v3 + with: + name: coverage-data-artifact + path: /tmp/coverage/ + + - name: Merge coverage data + run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge + + - name: Build and upload coverage report + run: | + COMMIT_SHA=${{ github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA + + scripts/coverage \ + --dir=/tmp/coverage report \ + --input-objects=/tmp/coverage/binaries.list \ + --commit-url=$COMMIT_URL \ + --format=github + + REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA + + scripts/git-upload \ + --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \ + --message="Add code coverage for $COMMIT_URL" \ + copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE + + # Add link to the coverage report to the commit + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"success\", + \"context\": \"neon-coverage\", + \"description\": \"Coverage report is ready\", + \"target_url\": \"$REPORT_URL\" + }" + + trigger-e2e-tests: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ build-neon ] + steps: + - name: Set PR's status to pending and request a remote CI test + run: | + COMMIT_SHA=${{ 
github.event.pull_request.head.sha }} + COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + + REMOTE_REPO="${{ github.repository_owner }}/cloud" + + curl -f -X POST \ + https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"state\": \"pending\", + \"context\": \"neon-cloud-e2e\", + \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" + }" + + curl -f -X POST \ + https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ + -H "Accept: application/vnd.github.v3+json" \ + --user "${{ secrets.CI_ACCESS_TOKEN }}" \ + --data \ + "{ + \"ref\": \"main\", + \"inputs\": { + \"ci_job_name\": \"neon-cloud-e2e\", + \"commit_hash\": \"$COMMIT_SHA\", + \"remote_repo\": \"${{ github.repository }}\" + } + }" + + docker-image: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ pg_regress-tests, other-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + with: + driver: docker + + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: build-tag + + - name: Get legacy build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + 
echo "::set-output name=tag::latest" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: legacy-build-tag + + - name: Build neon Docker image + uses: docker/build-push-action@v2 + with: + context: . + build-args: | + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" + pull: true + push: true + tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}} + + docker-image-compute: + runs-on: [ self-hosted, Linux, k8s-runner ] + needs: [ pg_regress-tests, other-tests ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + build-tag: ${{steps.build-tag.outputs.tag}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + with: + driver: docker + + - name: Get build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::$(git rev-list --count HEAD)" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release-$(git rev-list --count HEAD)" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: build-tag + + - name: Get legacy build tag + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "::set-output name=tag::latest" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "::set-output name=tag::release" + else + echo "GITHUB_REF_NAME (value
'$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + id: legacy-build-tag + + - name: Build compute-tools Docker image + uses: docker/build-push-action@v2 + with: + context: . + build-args: | + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" + push: false + file: Dockerfile.compute-tools + tags: neondatabase/compute-tools:local + + - name: Push compute-tools Docker image + uses: docker/build-push-action@v2 + with: + context: . + build-args: | + GIT_VERSION="${{github.sha}}" + AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}" + AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}" + push: true + file: Dockerfile.compute-tools + tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}} + + - name: Build compute-node Docker image + uses: docker/build-push-action@v2 + with: + context: ./vendor/postgres/ + build-args: + COMPUTE_TOOLS_TAG=local + push: true + tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}} + + calculate-deploy-targets: + runs-on: [ self-hosted, Linux, k8s-runner ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + outputs: + matrix-include: ${{ steps.set-matrix.outputs.include }} + steps: + - id: set-matrix + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}' + NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}' + echo "::set-output name=include::[$STAGING, $NEON_STRESS]" + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + PRODUCTION='{"env_name": "production", "proxy_job": 
"neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}' + echo "::set-output name=include::[$PRODUCTION]" + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + deploy: + runs-on: [ self-hosted, Linux, k8s-runner ] + # We need both storage **and** compute images for deploy, because control plane + # picks the compute version based on the storage version. If it notices a fresh + # storage it may bump the compute version. And if compute image failed to build + # it may break things badly. + needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Setup ansible + run: | + pip install --progress-bar off --user ansible boto3 + + - name: Redeploy + run: | + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + + ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy: + runs-on: [ self-hosted, Linux, k8s-runner ] + # Compute image isn't strictly required for proxy deploy, but let's still wait for it + # to run all deploy jobs consistently. 
+ needs: [ docker-image, docker-image-compute, calculate-deploy-targets ] + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Store kubeconfig file + run: | + echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Setup helm v3 + run: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{needs.docker-image.outputs.build-tag}} + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s diff --git a/.github/workflows/testing.yml b/.github/workflows/codestyle.yml similarity index 63% rename from .github/workflows/testing.yml rename to .github/workflows/codestyle.yml index ad7bddfabc..89bfffd4b9 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/codestyle.yml @@ -1,18 +1,33 @@ -name: Build and Test +name: Check code style and build on: - pull_request: push: + branches: + - main + pull_request: + +defaults: + run: + shell: bash -ex {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. 
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 jobs: - regression-check: + check-codestyle-rust: strategy: + fail-fast: false matrix: # If we want to duplicate this job for different # Rust toolchains (e.g. nightly or 1.37.0), add them here. - rust_toolchain: [stable] + rust_toolchain: [1.58] os: [ubuntu-latest, macos-latest] - timeout-minutes: 30 + timeout-minutes: 50 name: run regression test suite runs-on: ${{ matrix.os }} @@ -23,13 +38,17 @@ jobs: submodules: true fetch-depth: 2 - - name: install rust toolchain ${{ matrix.rust_toolchain }} + - name: Install rust toolchain ${{ matrix.rust_toolchain }} uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust_toolchain }} + components: rustfmt, clippy override: true + - name: Check formatting + run: cargo fmt --all -- --check + - name: Install Ubuntu postgres dependencies if: matrix.os == 'ubuntu-latest' run: | @@ -79,12 +98,38 @@ jobs: with: path: | ~/.cargo/registry + !~/.cargo/registry/src ~/.cargo/git target - key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }} + key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }} - name: Run cargo clippy run: ./run_clippy.sh - - name: Run cargo test - run: cargo test --all --all-targets + - name: Ensure all project builds + run: cargo build --all --all-targets + + check-codestyle-python: + runs-on: [ self-hosted, Linux, k8s-runner ] + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run yapf to ensure code format + run: poetry run yapf --recursive --diff . 
+ + - name: Run mypy to check types + run: poetry run mypy . diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml new file mode 100644 index 0000000000..4ff31ac508 --- /dev/null +++ b/.github/workflows/pg_clients.yml @@ -0,0 +1,72 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + + workflow_dispatch: + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +jobs: + test-postgres-client-libs: + runs-on: [ ubuntu-latest ] + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + shell: bash -ex {0} + run: ./scripts/pysync + + - name: Run pytest + env: + REMOTE_ENV: 1 + BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}" + TEST_OUTPUT: /tmp/test_output + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + shell: bash -ex {0} + run: | + # Test framework expects we have psql binary; + # but since we don't really need it in this test, let's mock it + mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql"; + ./scripts/pytest \ + --junitxml=$TEST_OUTPUT/junit.xml \ + --tb=short \ + --verbose \ + -m "remote_cluster" \ + -rA "test_runner/pg_clients" + + - 
name: Post to a Slack channel + if: failure() + id: slack + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.gitignore b/.gitignore index adb1b41503..ed718c8c79 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,9 @@ __pycache__/ test_output/ .vscode -/.zenith -/integration_tests/.zenith +.idea +/.neon +/integration_tests/.neon # Coverage *.profraw diff --git a/.yapfignore b/.yapfignore index 258f6c59cd..149428e452 100644 --- a/.yapfignore +++ b/.yapfignore @@ -6,5 +6,5 @@ target/ tmp_install/ __pycache__/ test_output/ -.zenith/ +.neon/ .git/ diff --git a/Cargo.lock b/Cargo.lock index 6acad6dac8..4f453678e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,9 +48,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.53" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94a45b455c14666b85fc40a019e8ab9eb75e3a124e05494f5397122bc9eb06e0" +checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" dependencies = [ "backtrace", ] @@ -64,6 +64,45 @@ dependencies = [ "nodrop", ] +[[package]] +name = "asn1-rs" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ff05a702273012438132f449575dbc804e27b2f3cbe3069aa237d26c98fa33" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom", + "num-traits", + "rusticata-macros", + "thiserror", + "time 0.3.11", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db8b7511298d5b7784b40b092d9e9dcd3a627a5707e4b5e507931ab0d44eeebf" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" 
+version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-stream" version = "0.3.3" @@ -87,9 +126,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.52" +version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061a7acccaa286c011ddc30970520b98fa40e00c9d644633fb26b5fc63a265e3" +checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716" dependencies = [ "proc-macro2", "quote", @@ -115,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "axum" -version = "0.5.4" +version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a" +checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e" dependencies = [ "async-trait", "axum-core", @@ -127,7 +166,7 @@ dependencies = [ "http", "http-body", "hyper", - "itoa 1.0.1", + "itoa 1.0.2", "matchit", "memchr", "mime", @@ -144,9 +183,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.3" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6" +checksum = "e4f44a0e6200e9d11a1cdc989e4b358f6e3d354fbf48478f345a17f4e43f8635" dependencies = [ "async-trait", "bytes", @@ -158,15 +197,15 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.64" +version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e121dee8023ce33ab248d9ce1493df03c3b38a659b240096fcbd7048ff9c31f" +checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" dependencies = [ "addr2line", "cc", "cfg-if", "libc", - "miniz_oxide 0.4.4", + 
"miniz_oxide", "object", "rustc-demangle", ] @@ -253,15 +292,15 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.9.1" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3" [[package]] name = "bytemuck" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" +checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a" [[package]] name = "byteorder" @@ -288,10 +327,16 @@ dependencies = [ ] [[package]] -name = "cc" -version = "1.0.72" +name = "cast" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" [[package]] name = "cexpr" @@ -324,9 +369,9 @@ dependencies = [ [[package]] name = "clang-sys" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cc00842eed744b858222c4c9faf7243aafc6d33f92f96935263ef4d8a41ce21" +checksum = "5a050e2153c5be08febd6734e29298e844fdb0fa21aeddd63b4eb7baa106c69b" dependencies = [ "glob", "libc", @@ -350,17 +395,36 @@ dependencies = [ [[package]] name = "clap" -version = "3.0.14" +version = "3.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63edc3f163b3c71ec8aa23f9bd6070f77edbf3d1d198b164afa90ff00e4ec62" +checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d" dependencies = [ "atty", "bitflags", + 
"clap_lex", "indexmap", - "os_str_bytes", "strsim 0.10.0", "termcolor", - "textwrap 0.14.2", + "textwrap 0.15.0", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "close_fds" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" +dependencies = [ + "cfg-if", + "libc", ] [[package]] @@ -374,9 +438,9 @@ dependencies = [ [[package]] name = "combine" -version = "4.6.3" +version = "4.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50b727aacc797f9fc28e355d21f34709ac4fc9adecfe470ad07b8f4464f53062" +checksum = "2a604e93b79d1808327a6fca85a6f2d69de66461e7620f5a4cbf5fb4d1d7c948" dependencies = [ "bytes", "memchr", @@ -400,7 +464,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 3.0.14", + "clap 3.2.12", "env_logger", "hyper", "libc", @@ -412,14 +476,15 @@ dependencies = [ "tar", "tokio", "tokio-postgres", + "url", "workspace_hack", ] [[package]] name = "const_format" -version = "0.2.22" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22bc6cd49b0ec407b680c3e380182b6ac63b73991cb7602de350352fc309b614" +checksum = "939dc9e2eb9077e0679d2ce32de1ded8531779360b003b4a972a7a39ec263495" dependencies = [ "const_format_proc_macros", ] @@ -484,18 +549,18 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +checksum = "59a6001667ab124aebae2a495118e11d30984c3a653e99d86d58971708cf5e4b" dependencies = [ "libc", ] [[package]] name = "crc32c" -version = "0.6.1" +version = "0.6.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee6b9c9389584bcba988bd0836086789b7f87ad91892d6a83d5291dbb24524b5" +checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ "rustc_version", ] @@ -511,12 +576,12 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" dependencies = [ "atty", - "cast", + "cast 0.3.0", "clap 2.34.0", "criterion-plot", "csv", @@ -541,15 +606,15 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ - "cast", + "cast 0.2.7", "itertools", ] [[package]] name = "crossbeam-channel" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c" dependencies = [ "cfg-if", "crossbeam-utils", @@ -568,26 +633,26 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" +checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "lazy_static", "memoffset", + "once_cell", "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.7" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" +checksum = 
"7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83" dependencies = [ "cfg-if", - "lazy_static", + "once_cell", ] [[package]] @@ -600,7 +665,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "signal-hook", "signal-hook-mio", "winapi", @@ -617,9 +682,9 @@ dependencies = [ [[package]] name = "crypto-common" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57952ca27b5e3606ff4dd79b0020231aaf9d6aa76dc05fd30137538c50bd3ce8" +checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f" dependencies = [ "generic-array", "typenum", @@ -669,9 +734,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.13.1" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0d720b8683f8dd83c65155f0530560cba68cd2bf395f6513a483caee57ff7f4" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" dependencies = [ "darling_core", "darling_macro", @@ -679,9 +744,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.13.1" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a340f241d2ceed1deb47ae36c4144b2707ec7dd0b649f894cb39bb595986324" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" dependencies = [ "fnv", "ident_case", @@ -693,15 +758,21 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.13.1" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72c41b3b7352feb3211a0d743dc5700a4e3b60f51bd2b368892d1e0f9a95f44b" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" dependencies = [ "darling_core", "quote", "syn", ] +[[package]] +name = "data-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" + [[package]] name = "debugid" version = "0.7.3" @@ -711,6 +782,20 @@ dependencies = [ "uuid", ] +[[package]] +name = "der-parser" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe398ac75057914d7d07307bf67dc7f3f574a26783b4fc7805a20ffa9f506e82" +dependencies = [ + "asn1-rs", + "displaydoc", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", +] + [[package]] name = "digest" version = "0.9.0" @@ -753,16 +838,27 @@ dependencies = [ ] [[package]] -name = "either" -version = "1.6.1" +name = "displaydoc" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be" [[package]] name = "encoding_rs" -version = "0.8.30" +version = "0.8.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dc8abb250ffdda33912550faa54c88ec8b998dec0b2c55ab224921ce11df" +checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" dependencies = [ "cfg-if", ] @@ -782,9 +878,9 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954" +checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf" dependencies = [ "http", "prost", @@ -801,6 +897,7 @@ name = "etcd_broker" version = "0.1.0" dependencies = [ "etcd-client", + "once_cell", "regex", "serde", "serde_json", @@ -840,14 +937,14 @@ dependencies = [ 
[[package]] name = "filetime" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "975ccf83d8d9d0d84682850a38c8169027be83368805971cc4f238c2b245bc98" +checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" dependencies = [ "cfg-if", "libc", "redox_syscall", - "winapi", + "windows-sys", ] [[package]] @@ -864,21 +961,9 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" - -[[package]] -name = "flate2" -version = "1.0.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39522e96686d38f4bc984b9198e3a0613264abaebaff2c5c918bfa6b6da09af" -dependencies = [ - "cfg-if", - "crc32fast", - "libc", - "miniz_oxide 0.5.1", -] +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "fnv" @@ -1022,13 +1107,13 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.4" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" +checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" dependencies = [ "cfg-if", "libc", - "wasi 0.10.0+wasi-snapshot-preview1", + "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] @@ -1067,9 +1152,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.11" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f1f717ddc7b2ba36df7e871fd88db79326551d3d6f1fc406fbfd28b582ff8e" +checksum = "37a82c6d637fc9515a4694bbf1cb2457b79d81ce52b3108bdeea58b07dd34a57" dependencies = [ "bytes", "fnv", @@ -1080,7 +1165,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util 0.6.9", + 
"tokio-util", "tracing", ] @@ -1099,6 +1184,12 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022" + [[package]] name = "heck" version = "0.3.3" @@ -1159,20 +1250,20 @@ dependencies = [ [[package]] name = "http" -version = "0.2.6" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f4c6746584866f0feabcc69893c5b51beef3831656a968ed7ae254cdc4fd03" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 1.0.1", + "itoa 1.0.2", ] [[package]] name = "http-body" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ff4f84919677303da5f147645dbea6b1881f368d03ac84e1dc09031ebd7b2c6" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", @@ -1187,9 +1278,9 @@ checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" [[package]] name = "httparse" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9100414882e15fb7feccb4897e5f0ff0ff1ca7d1a86a23208ada4d7a18e6c6c4" +checksum = "496ce29bb5a52785b44e0f7ca2847ae0bb839c9bd28f69acac9b99d461c0c04c" [[package]] name = "httpdate" @@ -1215,9 +1306,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.17" +version = "0.14.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043f0e083e9901b6cc658a77d1eb86f4fc650bbb977a4337dd63192826aa85dd" +checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" dependencies = [ "bytes", "futures-channel", @@ -1228,7 +1319,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 1.0.1", + "itoa 1.0.2", "pin-project-lite", "socket2", "tokio", @@ -1294,12 
+1385,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.8.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.2", ] [[package]] @@ -1311,7 +1402,7 @@ dependencies = [ "ahash", "atty", "indexmap", - "itoa 1.0.1", + "itoa 1.0.2", "lazy_static", "log", "num-format", @@ -1331,9 +1422,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.3.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" +checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" [[package]] name = "itertools" @@ -1352,24 +1443,24 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "itoa" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" +checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d" [[package]] name = "js-sys" -version = "0.3.56" +version = "0.3.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a38fc24e30fd564ce974c02bf1d337caddff65be6cc4735a1f7eab22a7440f04" +checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27" dependencies = [ "wasm-bindgen", ] [[package]] name = "jsonwebtoken" -version = "8.1.0" +version = "8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9051c17f81bae79440afa041b3a278e1de71bfb96d32454b477fd4703ccb6f" +checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" dependencies = [ "base64", "pem", @@ -1402,9 +1493,9 @@ checksum = 
"830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.117" +version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74d72e0f9b65b5b4ca49a346af3976df0f9c61d550727f349ecd559f251a26c" +checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" [[package]] name = "libloading" @@ -1418,18 +1509,19 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" +checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53" dependencies = [ + "autocfg", "scopeguard", ] [[package]] name = "log" -version = "0.4.14" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", "serde", @@ -1484,15 +1576,15 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.3" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" +checksum = "3a79b39c93a7a5a27eeaf9a23b5ff43f1b9e0ad6b1cdd441140ae53c35613fc7" dependencies = [ "libc", ] @@ -1531,44 +1623,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.4.4" +version = "0.5.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b29bd4bc3f33391105ebee3589c19197c4271e3e5a9ec9bfe8127eeff8f082" +checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52da4364ffb0e4fe33a9841a98a3f3014fb964045ce4f7a45a398243c8d6b0c9" +checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" dependencies = [ "libc", "log", - "miow", - "ntapi", "wasi 0.11.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi", + "windows-sys", ] [[package]] @@ -1579,9 +1650,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "native-tls" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +checksum = "fd7e2f3618557f980e0b17e8856252eee3c97fa12c54dff0ca290fb6266ca4a9" dependencies = [ "lazy_static", "libc", @@ -1600,7 +1671,7 @@ name = "neon_local" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.0.14", + "clap 3.2.12", "comfy-table", "control_plane", "git-version", @@ -1634,22 +1705,12 @@ checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "nom" -version = "7.1.0" +version = "7.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" dependencies = [ "memchr", "minimal-lexical", - "version_check", -] - -[[package]] -name = "ntapi" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28774a7fd2fbb4f0babd8237ce554b73af68021b5f695a3cebd6c59bac0980f" -dependencies = [ - "winapi", ] [[package]] @@ -1675,9 +1736,9 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -1685,9 +1746,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] @@ -1704,27 +1765,36 @@ dependencies = [ [[package]] name = "num_threads" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" dependencies = [ "libc", ] [[package]] name = "object" -version = "0.27.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" +checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" dependencies = [ "memchr", ] [[package]] -name = "once_cell" -version = "1.9.0" +name = "oid-registry" +version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +checksum = "38e20717fa0541f39bd146692035c37bedfa532b3e5071b35761082407546b2a" +dependencies = [ + "asn1-rs", +] + +[[package]] +name = "once_cell" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18a6dbe30758c9f83eb00cbea4ac95966305f5a7772f3f42ebfc7fc7eddbd8e1" [[package]] name = "oorandom" @@ -1740,18 +1810,30 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.38" +version = "0.10.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" +checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0" dependencies = [ "bitflags", "cfg-if", "foreign-types", "libc", "once_cell", + "openssl-macros", "openssl-sys", ] +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.5" @@ -1760,9 +1842,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.72" +version = "0.9.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", @@ -1773,12 +1855,9 @@ dependencies = [ [[package]] name = "os_str_bytes" -version = "6.0.0" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" 
-dependencies = [ - "memchr", -] +checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa" [[package]] name = "pageserver" @@ -1788,7 +1867,8 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 3.0.14", + "clap 3.2.12", + "close_fds", "const_format", "crc32c", "crossbeam-utils", @@ -1830,6 +1910,7 @@ dependencies = [ "tracing", "url", "utils", + "walkdir", "workspace_hack", ] @@ -1846,12 +1927,12 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.2", + "parking_lot_core 0.9.3", ] [[package]] @@ -1870,9 +1951,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "995f667a6c822200b0433ac218e05582f0e2efa1b922a3fd2fbaadc5f87bab37" +checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" dependencies = [ "cfg-if", "libc", @@ -1889,9 +1970,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.0.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a3b09a20e374558580a4914d3b7d89bd61b954a5a5e1dcbea98753addb1947" +checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ "base64", ] @@ -1904,9 +1985,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "petgraph" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +checksum = 
"e6d5014253a1331579ce62aa67443b4a658c5e7dd03d4bc6d302b94474888143" dependencies = [ "fixedbitset", "indexmap", @@ -1932,18 +2013,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +checksum = "78203e83c48cffbe01e4a2d35d566ca4de445d79a85372fc64e378bfc812a260" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +checksum = "710faf75e1b33345361201d36d04e98ac1ed8909151a017ed384700836104c74" dependencies = [ "proc-macro2", "quote", @@ -1952,9 +2033,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e280fbe77cc62c91527259e9442153f4688736748d24660126286329742b4c6c" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -1964,15 +2045,15 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" +checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" [[package]] name = "plotters" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +checksum = "9428003b84df1496fb9d6eeee9c5f8145cb41ca375eb0dad204328888832811f" dependencies = [ "num-traits", "plotters-backend", @@ -1983,15 +2064,15 @@ dependencies = [ [[package]] 
name = "plotters-backend" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" +checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" [[package]] name = "plotters-svg" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +checksum = "e0918736323d1baff32ee0eade54984f6f201ad7e97d5cfb5d6ab4a358529615" dependencies = [ "plotters-backend", ] @@ -2058,7 +2139,7 @@ dependencies = [ "serde", "thiserror", "utils", - "wal_generate", + "wal_craft", "workspace_hack", ] @@ -2089,9 +2170,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" [[package]] name = "prettyplease" -version = "0.1.10" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803" +checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2" dependencies = [ "proc-macro2", "syn", @@ -2105,22 +2186,21 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "procfs" -version = "0.10.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95e344cafeaeefe487300c361654bcfc85db3ac53619eeccced29f5ea18c4c70" +checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" dependencies = [ "bitflags", "byteorder", - "flate2", "hex", "lazy_static", 
"libc", @@ -2128,25 +2208,25 @@ dependencies = [ [[package]] name = "prometheus" -version = "0.13.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +checksum = "cface98dfa6d645ea4c789839f176e4b072265d085bfcc48eaa8d137f58d3c39" dependencies = [ "cfg-if", "fnv", "lazy_static", "libc", "memchr", - "parking_lot 0.11.2", + "parking_lot 0.12.1", "procfs", "thiserror", ] [[package]] name = "prost" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f" +checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" dependencies = [ "bytes", "prost-derive", @@ -2154,9 +2234,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846" +checksum = "8ae5a4388762d5815a9fc0dea33c56b021cdc8dde0c55e0c9ca57197254b0cab" dependencies = [ "bytes", "cfg-if", @@ -2205,17 +2285,17 @@ dependencies = [ "async-trait", "base64", "bytes", - "clap 3.0.14", + "clap 3.2.12", "futures", "git-version", - "hashbrown", + "hashbrown 0.11.2", "hex", "hmac 0.12.1", "hyper", "lazy_static", "md5", "metrics", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "rand", "rcgen", @@ -2223,7 +2303,7 @@ dependencies = [ "routerify", "rstest", "rustls", - "rustls-pemfile", + "rustls-pemfile 0.2.1", "scopeguard", "serde", "serde_json", @@ -2237,6 +2317,7 @@ dependencies = [ "url", "utils", "workspace_hack", + "x509-parser", ] [[package]] @@ -2259,9 +2340,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.15" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" dependencies = [ "proc-macro2", ] @@ -2308,9 +2389,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.1" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -2320,14 +2401,13 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.1" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", - "lazy_static", "num_cpus", ] @@ -2345,28 +2425,29 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42" dependencies = [ "bitflags", ] [[package]] name = "redox_users" -version = "0.4.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ "getrandom", "redox_syscall", + "thiserror", ] [[package]] name = "regex" -version = "1.5.5" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" +checksum = 
"4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" dependencies = [ "aho-corasick", "memchr", @@ -2384,9 +2465,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "remote_storage" @@ -2394,13 +2475,16 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "metrics", + "once_cell", "rusoto_core", "rusoto_s3", "serde", "serde_json", "tempfile", "tokio", - "tokio-util 0.7.0", + "tokio-util", + "toml_edit", "tracing", "workspace_hack", ] @@ -2416,9 +2500,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.9" +version = "0.11.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f242f1488a539a79bac6dbe7c8609ae43b7914b7736210f239a37cccb32525" +checksum = "b75aa69a3f06bbcc66ede33af2af253c6f7a86b1ca0033f60c580a27074fbf92" dependencies = [ "base64", "bytes", @@ -2438,12 +2522,13 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", - "rustls-pemfile", + "rustls-pemfile 1.0.0", "serde", "serde_json", "serde_urlencoded", "tokio", "tokio-rustls", + "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -2454,9 +2539,9 @@ dependencies = [ [[package]] name = "rgb" -version = "0.8.32" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e74fdc210d8f24a7dbfedc13b04ba5764f5232754ccebfdf5fff1bad791ccbc6" +checksum = "c3b221de559e4a29df3b957eec92bc0de6bc8eaf6ca9cfed43e5e1d67ff65a34" dependencies = [ "bytemuck", ] @@ -2606,10 +2691,19 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.20.4" +name = "rusticata-macros" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", +] + +[[package]] +name = "rustls" +version = "0.20.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" dependencies = [ "log", "ring", @@ -2626,6 +2720,15 @@ dependencies = [ "base64", ] +[[package]] +name = "rustls-pemfile" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9" +dependencies = [ + "base64", +] + [[package]] name = "rustls-split" version = "0.3.0" @@ -2637,35 +2740,38 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" +checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf" [[package]] name = "ryu" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" +checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" [[package]] name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "byteorder", "bytes", - "clap 3.0.14", + "clap 3.2.12", "const_format", "crc32c", "daemonize", "etcd_broker", "fs2", + "futures", "git-version", "hex", "humantime", "hyper", "lazy_static", "metrics", + "once_cell", "postgres", "postgres-protocol", "postgres_ffi", @@ -2678,7 +2784,8 @@ dependencies = [ "tempfile", "tokio", "tokio-postgres", - "tokio-util 0.7.0", + "tokio-util", + "toml_edit", "tracing", "url", "utils", @@ -2697,12 +2804,12 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.19" +version = "0.1.20" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" dependencies = [ "lazy_static", - "winapi", + "windows-sys", ] [[package]] @@ -2746,15 +2853,15 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.5" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0486718e92ec9a68fbed73bb5ef687d71103b142595b406835649bebd33f72c7" +checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1" [[package]] name = "serde" -version = "1.0.136" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6" dependencies = [ "serde_derive", ] @@ -2771,9 +2878,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.136" +version = "1.0.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb" dependencies = [ "proc-macro2", "quote", @@ -2782,11 +2889,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.78" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7" dependencies = [ - "itoa 1.0.1", + "itoa 1.0.2", "ryu", "serde", ] @@ -2798,27 +2905,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa 1.0.1", + "itoa 1.0.2", "ryu", "serde", ] 
[[package]] name = "serde_with" -version = "1.12.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1e6ec4d8950e5b1e894eac0d360742f3b1407a6078a604a731c4b3f49cefbc" +checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" dependencies = [ - "rustversion", "serde", "serde_with_macros", ] [[package]] name = "serde_with_macros" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12e47be9471c72889ebafb5e14d5ff930d89ae7a67bbdb5f8abb564f845a927e" +checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" dependencies = [ "darling", "proc-macro2", @@ -2867,9 +2973,9 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "signal-hook" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "647c97df271007dcea485bb74ffdb57f2e683f1306c854f468a0c244badabf2d" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ "libc", "signal-hook-registry", @@ -2897,33 +3003,33 @@ dependencies = [ [[package]] name = "simple_asn1" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a762b1c38b9b990c694b9c2f8abe3372ce6a9ceaae6bca39cfc46e054f45745" +checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.9", + "time 0.3.11", ] [[package]] name = "siphasher" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a86232ab60fa71287d7f2ddae4a7073f6b7aac33631c3015abb556f08c6d0a3e" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" [[package]] name = "slab" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "9def91fd1e018fe007022791f865d0ccc9b3a0d5001e01aabb8b40e46000afb5" +checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" [[package]] name = "smallvec" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" [[package]] name = "socket2" @@ -3002,9 +3108,9 @@ checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "symbolic-common" -version = "8.7.0" +version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac6aac7b803adc9ee75344af7681969f76d4b38e4723c6eaacf3b28f5f1d87ff" +checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" dependencies = [ "debugid", "memmap2", @@ -3014,9 +3120,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "8.7.0" +version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8143ea5aa546f86c64f9b9aafdd14223ffad4ecd2d58575c63c21335909c99a7" +checksum = "4564ca7b4e6eb14105aa8bbbce26e080f6b5d9c4373e67167ab31f7b86443750" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -3025,13 +3131,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.92" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52" +checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] @@ -3040,6 +3146,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" +[[package]] +name = "synstructure" +version = "0.12.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + [[package]] name = "tar" version = "0.4.38" @@ -3067,9 +3185,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" dependencies = [ "winapi-util", ] @@ -3085,24 +3203,24 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.14.2" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" +checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" [[package]] name = "thiserror" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.30" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" dependencies = [ "proc-macro2", "quote", @@ -3131,11 +3249,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217" dependencies = [ - "itoa 1.0.1", + "itoa 
1.0.2", "libc", "num_threads", "quickcheck", @@ -3160,9 +3278,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] @@ -3175,10 +3293,11 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.17.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e" dependencies = [ + "autocfg", "bytes", "libc", "memchr", @@ -3204,9 +3323,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" dependencies = [ "proc-macro2", "quote", @@ -3234,7 +3353,7 @@ dependencies = [ "fallible-iterator", "futures", "log", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "percent-encoding", "phf", "pin-project-lite", @@ -3242,7 +3361,7 @@ dependencies = [ "postgres-types", "socket2", "tokio", - "tokio-util 0.7.0", + "tokio-util", ] [[package]] @@ -3261,9 +3380,9 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.23.3" +version = "0.23.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4151fda0cf2798550ad0b34bcfc9b9dcc2a9d2471c895c68f3a8818e54f2389e" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" dependencies = [ "rustls", "tokio", @@ -3272,9 +3391,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = 
"0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = "df54d54117d6fdc4e4fea40fe1e4e566b3505700e148a6827e59b34b0d2600d9" dependencies = [ "futures-core", "pin-project-lite", @@ -3283,37 +3402,23 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.9" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e99e1983e5d376cd8eb4b66604d2e99e79f5bd988c3055891dcd8c9e2604cc0" +checksum = "cc463cd8deddc3770d20f9852143d50bf6094e640b485cb2e189a2099085ff45" dependencies = [ "bytes", "futures-core", "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64910e1b9c1901aaf5375561e35b9c057d95ff41a44ede043a03e09279eabaf1" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", "pin-project-lite", "tokio", + "tracing", ] [[package]] name = "toml" -version = "0.5.8" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" dependencies = [ "serde", ] @@ -3355,7 +3460,7 @@ dependencies = [ "prost-derive", "tokio", "tokio-stream", - "tokio-util 0.7.0", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -3378,9 +3483,9 @@ dependencies = [ [[package]] name = "tower" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a89fd63ad6adf737582df5db40d286574513c69a11dac5214dc3b5603d6713e" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", @@ -3390,7 +3495,7 @@ dependencies = [ "rand", "slab", "tokio", - "tokio-util 0.7.0", + 
"tokio-util", "tower-layer", "tower-service", "tracing", @@ -3398,9 +3503,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.2" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" dependencies = [ "bitflags", "bytes", @@ -3423,15 +3528,15 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" [[package]] name = "tower-service" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.30" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d8d93354fe2a8e50d5953f5ae2e47a3fc2ef03292e7ea46e3cc38f549525fb9" +checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09" dependencies = [ "cfg-if", "log", @@ -3442,9 +3547,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.19" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8276d9a4a3a558d7b7ad5303ad50b53d58264641b82914b7ada36bd762e7a716" +checksum = "11c75893af559bc8e10716548bdef5cb2b983f8e637db9d0e15126b61b484ee2" dependencies = [ "proc-macro2", "quote", @@ -3453,9 +3558,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.22" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03cfcb51380632a72d3111cb8d3447a8d908e577d31beeac006f836383d29a23" +checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f" dependencies = [ "lazy_static", "valuable", @@ -3473,9 +3578,9 @@ dependencies = [ [[package]] name = "tracing-log" -version = 
"0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" dependencies = [ "lazy_static", "log", @@ -3484,9 +3589,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.8" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74786ce43333fcf51efe947aed9718fbe46d5c7328ec3f1029e818083966d9aa" +checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596" dependencies = [ "ansi_term", "lazy_static", @@ -3514,15 +3619,21 @@ checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" [[package]] name = "unicode-bidi" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" +checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" + +[[package]] +name = "unicode-ident" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c" [[package]] name = "unicode-normalization" -version = "0.1.19" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +checksum = "854cbdc4f7bc6ae19c820d44abdc3277ac3e1b2b93db20a636825d9322fb60e6" dependencies = [ "tinyvec", ] @@ -3541,9 +3652,9 @@ checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" [[package]] name = "unicode-xid" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = 
"957e51f3646910546462e67d5f7599b9e4fb8acdd304b087a6494730f9eebf04" [[package]] name = "untrusted" @@ -3586,7 +3697,7 @@ dependencies = [ "rand", "routerify", "rustls", - "rustls-pemfile", + "rustls-pemfile 0.2.1", "rustls-split", "serde", "serde_json", @@ -3631,14 +3742,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] -name = "wal_generate" +name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 3.0.14", + "clap 3.2.12", "env_logger", "log", + "once_cell", "postgres", + "postgres_ffi", "tempfile", ] @@ -3677,9 +3790,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" +checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3687,9 +3800,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b21c0df030f5a177f3cba22e9bc4322695ec43e7257d865302900290bcdedca" +checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a" dependencies = [ "bumpalo", "lazy_static", @@ -3702,9 +3815,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.29" +version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb6ec270a31b1d3c7e266b999739109abce8b6c87e4b31fcfcd788b65267395" +checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f" dependencies = [ "cfg-if", "js-sys", @@ -3714,9 +3827,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.79" +version = "0.2.81" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f4203d69e40a52ee523b2529a773d5ffc1dc0071801c87b3d270b471b80ed01" +checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3724,9 +3837,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa8a30d46208db204854cadbb5d4baf5fcf8071ba5bf48190c3e59937962ebc" +checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048" dependencies = [ "proc-macro2", "quote", @@ -3737,15 +3850,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.79" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d958d035c4438e28c70e4321a2911302f10135ce78a9c7834c0cab4123d06a2" +checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be" [[package]] name = "web-sys" -version = "0.3.56" +version = "0.3.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c060b319f29dd25724f09a2ba1418f142f539b2be99fbf4d2d5a8f7330afb8eb" +checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90" dependencies = [ "js-sys", "wasm-bindgen", @@ -3763,18 +3876,18 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.2" +version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552ceb903e957524388c4d3475725ff2c8b7960922063af6ce53c9a43da07449" +checksum = "f1c760f0d366a6c24a02ed7816e23e691f5d92291f94d15e836006fd11b04daf" dependencies = [ "webpki", ] [[package]] name = "which" -version = "4.2.4" +version = "4.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" +checksum = "5c4fb54e6113b6a8772ee41c3404fb0301ac79604489467e0a9ce1f3e97c24ae" 
dependencies = [ "either", "lazy_static", @@ -3814,9 +3927,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5acdd78cb4ba54c0045ac14f62d8f94a03d10047904ae2a40afa1e99d8f70825" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" dependencies = [ "windows_aarch64_msvc", "windows_i686_gnu", @@ -3827,39 +3940,39 @@ dependencies = [ [[package]] name = "windows_aarch64_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17cffbe740121affb56fad0fc0e421804adf0ae00891205213b5cecd30db881d" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" [[package]] name = "windows_i686_gnu" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2564fde759adb79129d9b4f54be42b32c89970c18ebf93124ca8870a498688ed" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" [[package]] name = "windows_i686_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cd9d32ba70453522332c14d38814bceeb747d80b3958676007acadd7e166956" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" [[package]] name = "windows_x86_64_gnu" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfce6deae227ee8d356d19effc141a509cc503dfd1f850622ec4b0f84428e1f4" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" [[package]] name = "windows_x86_64_msvc" -version = "0.34.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d19538ccc21819d01deaf88d6a17eae6596a12e9aafdbb97916fb49896d89de9" +checksum = 
"c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" [[package]] name = "winreg" -version = "0.7.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ "winapi", ] @@ -3879,7 +3992,7 @@ dependencies = [ "futures-task", "futures-util", "generic-array", - "hashbrown", + "hashbrown 0.11.2", "hex", "hyper", "indexmap", @@ -3887,6 +4000,8 @@ dependencies = [ "libc", "log", "memchr", + "nom", + "num-bigint", "num-integer", "num-traits", "prost", @@ -3896,17 +4011,36 @@ dependencies = [ "scopeguard", "serde", "syn", + "time 0.3.11", "tokio", - "tokio-util 0.7.0", + "tokio-util", "tracing", "tracing-core", ] [[package]] -name = "xattr" -version = "0.2.2" +name = "x509-parser" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c" +checksum = "9fb9bace5b5589ffead1afb76e43e34cff39cd0f3ce7e170ae0c29e53b88eb1c" +dependencies = [ + "asn1-rs", + "base64", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror", + "time 0.3.11", +] + +[[package]] +name = "xattr" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" dependencies = [ "libc", ] @@ -3928,6 +4062,6 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.5.2" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c88870063c39ee00ec285a2f8d6a966e5b6fb2becc4e8dac77ed0d370ed6006" +checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442" diff --git a/Dockerfile b/Dockerfile index a7afd1f335..ad85638af3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # 
Build Postgres -FROM zimg/rust:1.58 AS pg-build +FROM neondatabase/rust:1.58 AS pg-build WORKDIR /pg USER root @@ -14,7 +14,7 @@ RUN set -e \ && tar -C tmp_install -czf /postgres_install.tar.gz . # Build zenith binaries -FROM zimg/rust:1.58 AS build +FROM neondatabase/rust:1.58 AS build ARG GIT_VERSION=local ARG CACHEPOT_BUCKET=zenith-rust-cachepot @@ -25,7 +25,7 @@ COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/inclu COPY . . # Show build caching stats to check if it was used in the end. -# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats. +# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ && sudo -E "PATH=$PATH" mold -run cargo build --release \ && cachepot -s @@ -46,9 +46,9 @@ RUN set -e \ && useradd -d /data zenith \ && chown -R zenith:zenith /data -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin -COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin COPY --from=pg-build /pg/tmp_install/ /usr/local/ COPY --from=pg-build /postgres_install.tar.gz /data/ diff --git a/Dockerfile.alpine b/Dockerfile.alpine deleted file mode 100644 index dafb7eaf6b..0000000000 --- a/Dockerfile.alpine +++ /dev/null @@ -1,95 +0,0 @@ -# -# Docker image for console integration testing. -# -# We may also reuse it in CI to unify installation process and as a general binaries building -# tool for production servers. 
-# -# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls -# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust -# images which are statically linked and have guards against any dlopen. I would rather -# prefer all static binaries so we may change the way librocksdb-sys builds or wait until -# we will have our own storage and drop rockdb dependency. -# -# Cargo-chef is used to separate dependencies building from main binaries building. This -# way `docker build` will download and install dependencies only of there are changes to -# out Cargo.toml files. -# - - -# -# build postgres separately -- this layer will be rebuilt only if one of -# mentioned paths will get any changes -# -FROM alpine:3.13 as pg-build -RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \ - make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev -WORKDIR zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile -# Build using clang and lld -RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4 - -# -# Calculate cargo dependencies. -# This will always run, but only generate recipe.json with list of dependencies without -# installing them. -# -FROM alpine:20210212 as cargo-deps-inspect -RUN apk add --update rust cargo -RUN cargo install cargo-chef -WORKDIR zenith -COPY . . -RUN cargo chef prepare --recipe-path recipe.json - -# -# Build cargo dependencies. -# This temp cantainner would be build only if recipe.json was changed. -# -FROM alpine:20210212 as deps-build -RUN apk add --update rust cargo openssl-dev clang build-base -# rust-rocksdb can be built against system-wide rocksdb -- that saves about -# 10 minutes during build. Rocksdb apk package is in testing now, but use it -# anyway. In case of any troubles we can download and build rocksdb here manually -# (to cache it as a docker layer). 
-RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/ -COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json -RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json - -# -# Build zenith binaries -# -FROM alpine:20210212 as build -RUN apk add --update rust cargo openssl-dev clang build-base -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev -WORKDIR zenith -COPY . . -# Copy cached dependencies -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server -COPY --from=deps-build /zenith/target target -COPY --from=deps-build /root/.cargo /root/.cargo -RUN cargo build --release - -# -# Copy binaries to resulting image. -# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure -# out how to statically link rocksdb or avoid it at all). 
-# -FROM alpine:3.13 -RUN apk add --update openssl build-base libseccomp-dev -RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install /usr/local -COPY docker-entrypoint.sh /docker-entrypoint.sh - -RUN addgroup zenith && adduser -h /data -D -G zenith zenith -VOLUME ["/data"] -WORKDIR /data -USER zenith -EXPOSE 6400 -ENTRYPOINT ["/docker-entrypoint.sh"] -CMD ["pageserver"] diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index f0c9b9d56a..71770ae9ed 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,6 +1,6 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .circle/config.yml -FROM zimg/rust:1.58 AS rust-build +FROM neondatabase/rust:1.58 AS rust-build ARG CACHEPOT_BUCKET=zenith-rust-cachepot ARG AWS_ACCESS_KEY_ID @@ -15,4 +15,4 @@ RUN set -e \ # Final image that only has one binary FROM debian:buster-slim -COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl +COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index fdfc64f6fa..566f2ecb10 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,8 @@ +ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Where to install Postgres, default is ./tmp_install, maybe useful for package managers +POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install + # Seccomp BPF is only available for Linux UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) @@ -26,7 +31,7 @@ endif # macOS with brew-installed openssl requires explicit paths UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - PG_CONFIGURE_OPTS += 
--with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib + PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib endif # Choose whether we should be silent or verbose @@ -55,55 +60,55 @@ zenith: postgres-headers $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -tmp_install/build/config.status: +$(POSTGRES_INSTALL_DIR)/build/config.status: +@echo "Configuring postgres build" - mkdir -p tmp_install/build - (cd tmp_install/build && \ - ../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ + mkdir -p $(POSTGRES_INSTALL_DIR)/build + (cd $(POSTGRES_INSTALL_DIR)/build && \ + $(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ $(SECCOMP) \ - --prefix=$(abspath tmp_install) > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log) # nicer alias for running 'configure' .PHONY: postgres-configure -postgres-configure: tmp_install/build/config.status +postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status -# Install the PostgreSQL header files into tmp_install/include +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include .PHONY: postgres-headers postgres-headers: postgres-configure +@echo "Installing PostgreSQL headers" - $(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install -# Compile and install PostgreSQL and contrib/zenith +# Compile and install PostgreSQL and contrib/neon .PHONY: postgres postgres: postgres-configure \ postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers` +@echo "Compiling PostgreSQL" - $(MAKE) -C tmp_install/build MAKELEVEL=0 install - +@echo "Compiling contrib/zenith" - $(MAKE) -C tmp_install/build/contrib/zenith install - +@echo "Compiling contrib/zenith_test_utils" - $(MAKE) -C 
tmp_install/build/contrib/zenith_test_utils install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install + +@echo "Compiling contrib/neon" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install + +@echo "Compiling contrib/neon_test_utils" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install +@echo "Compiling pg_buffercache" - $(MAKE) -C tmp_install/build/contrib/pg_buffercache install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install +@echo "Compiling pageinspect" - $(MAKE) -C tmp_install/build/contrib/pageinspect install + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install .PHONY: postgres-clean postgres-clean: - $(MAKE) -C tmp_install/build MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean # This doesn't remove the effects of 'configure'. .PHONY: clean clean: - cd tmp_install/build && $(MAKE) clean + cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything .PHONY: distclean distclean: - rm -rf tmp_install + rm -rf $(POSTGRES_INSTALL_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -112,4 +117,4 @@ fmt: .PHONY: setup-pre-commit-hook setup-pre-commit-hook: - ln -s -f ../../pre-commit.py .git/hooks/pre-commit + ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit diff --git a/README.md b/README.md index 8e8bf1a9b2..6a4fc5ce1b 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ Neon is a serverless open source alternative to AWS Aurora Postgres. It separate The project used to be called "Zenith". Many of the commands and code comments still refer to "zenith", but we are in the process of renaming things. +## Quick start +[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. 
+ +Alternatively, compile and run the project [locally](#running-local-installation). + ## Architecture overview A Neon installation consists of compute nodes and Neon storage engine. @@ -24,13 +29,18 @@ Pageserver consists of: ## Running local installation -#### building on Ubuntu/ Debian (Linux) +#### Installing dependencies on Linux 1. Install build dependencies and other useful packages -On Ubuntu or Debian this set of packages should be sufficient to build the code: -```text +* On Ubuntu or Debian this set of packages should be sufficient to build the code: +```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd +libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client +``` +* On Fedora these packages are needed: +```bash +dnf install flex bison readline-devel zlib-devel openssl-devel \ + libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -39,23 +49,11 @@ libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh ``` -3. Install PostgreSQL Client -``` -apt install postgresql-client -``` - -4. Build neon and patched postgres -```sh -git clone --recursive https://github.com/neondatabase/neon.git -cd neon -make -j5 -``` - -#### building on OSX (12.3.1) +#### Installing dependencies on OSX (12.3.1) 1. Install XCode and dependencies ``` xcode-select --install -brew install protobuf etcd +brew install protobuf etcd openssl ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -71,11 +69,20 @@ brew install libpq brew link --force libpq ``` -4. Build neon and patched postgres -```sh +#### Building on Linux and OSX + +1. Build neon and patched postgres +``` +# Note: The path to the neon sources can not contain a space. 
+ git clone --recursive https://github.com/neondatabase/neon.git cd neon -make -j5 + +# The preferred and default is to make a debug build. This will create a +# demonstrably slower build than a release build. If you want to use a release +# build, utilize "`BUILD_TYPE=release make -j`nproc``" + +make -j`nproc` ``` #### dependency installation notes @@ -88,7 +95,7 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r #### running neon database 1. Start pageserver and postgres on top of it (should be called from repo root): ```sh -# Create repository in .zenith with proper paths to binaries and data +# Create repository in .neon with proper paths to binaries and data # Later that would be responsibility of a package install script > ./target/debug/neon_local init initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c @@ -98,17 +105,17 @@ pageserver init succeeded # start pageserver and safekeeper > ./target/debug/neon_local start -Starting pageserver at '127.0.0.1:64000' in '.zenith' +Starting pageserver at '127.0.0.1:64000' in '.neon' Pageserver started initializing for sk 1 for 7676 -Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1' +Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1' Safekeeper started # start postgres compute node > ./target/debug/neon_local pg start main Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ... 
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 -Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432 +Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres' # check list of running postgres instances > ./target/debug/neon_local pg list @@ -118,7 +125,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=po 2. Now it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -144,8 +151,8 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: # start postgres on that branch > ./target/debug/neon_local pg start migration_check --branch-name migration_check Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ... 
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 -Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres' +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433 +Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres' # check the new list of running postgres instances > ./target/debug/neon_local pg list @@ -155,7 +162,7 @@ Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=po # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -166,7 +173,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres +> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -204,7 +211,7 @@ Same applies to certain spelling: i.e. 
we use MB to denote 1024 * 1024 bytes, wh To get more familiar with this aspect, refer to: - [Neon glossary](/docs/glossary.md) -- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html) +- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html) - Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres)) ## Join the development diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 42db763961..1022438c2e 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,4 +18,5 @@ serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/README.md b/compute_tools/README.md index 15876ed246..97a7513344 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -22,7 +22,7 @@ Also `compute_ctl` spawns two separate service threads: Usage example: ```sh compute_ctl -D /var/db/postgres/compute \ - -C 'postgresql://zenith_admin@localhost/postgres' \ + -C 'postgresql://cloud_admin@localhost/postgres' \ -S /var/db/postgres/specs/current.json \ -b /usr/local/bin/postgres ``` diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 5c951b7779..f535adfd87 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -21,7 +21,7 @@ //! Usage example: //! ```sh //! compute_ctl -D /var/db/postgres/compute \ -//! -C 'postgresql://zenith_admin@localhost/postgres' \ +//! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres //! 
``` @@ -33,7 +33,7 @@ use std::process::exit; use std::sync::{Arc, RwLock}; use std::{thread, time::Duration}; -use anyhow::Result; +use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; use log::{error, info}; @@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::pg_helpers::*; use compute_tools::spec::*; +use url::Url; fn main() -> Result<()> { // TODO: re-use `utils::logging` later @@ -116,22 +117,22 @@ fn main() -> Result<()> { let pageserver_connstr = spec .cluster .settings - .find("zenith.page_server_connstring") + .find("neon.pageserver_connstring") .expect("pageserver connstr should be provided"); let tenant = spec .cluster .settings - .find("zenith.zenith_tenant") + .find("neon.tenant_id") .expect("tenant id should be provided"); let timeline = spec .cluster .settings - .find("zenith.zenith_timeline") + .find("neon.timeline_id") .expect("tenant id should be provided"); let compute_state = ComputeNode { start_time: Utc::now(), - connstr: connstr.to_string(), + connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), spec, diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index dbb70a74cf..b6ba1692f9 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use anyhow::{anyhow, Result}; use log::error; use postgres::Client; @@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { Ok(()) } -pub async fn check_writability(compute: &Arc) -> Result<()> { - let connstr = &compute.connstr; - let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?; +pub async fn check_writability(compute: &ComputeNode) -> Result<()> { + let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git 
a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a8422fb2b2..1e812f2aa0 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -35,7 +35,8 @@ use crate::spec::*; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { pub start_time: DateTime, - pub connstr: String, + // Url type maintains proper escaping + pub connstr: url::Url, pub pgdata: String, pub pgbin: String, pub spec: ComputeSpec, @@ -146,8 +147,14 @@ impl ComputeNode { _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; - let mut ar = tar::Archive::new(copyreader); + // Read the archive directly from the `CopyOutReader` + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; self.metrics.basebackup_ms.store( @@ -256,11 +263,39 @@ impl ComputeNode { .unwrap_or_else(|| "5432".to_string()); wait_for_postgres(&mut pg, &port, pgdata_path)?; - let mut client = Client::connect(&self.connstr, NoTls)?; + // If connection fails, + // it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin`name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. 
+ let mut client = match Client::connect(self.connstr.as_str(), NoTls) { + Err(e) => { + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = self.connstr.clone(); + + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + + let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + drop(client); + + // reconnect with connsting with expected name + Client::connect(self.connstr.as_str(), NoTls)? + } + Ok(client) => client, + }; handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; - handle_grants(&self.spec, &mut client)?; + handle_role_deletions(self, &mut client)?; + handle_grants(self, &mut client)?; create_writablity_check_data(&mut client)?; // 'Close' connection diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 496a5aae3b..58cdf796bc 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. // XXX: the only expected panic is at `RwLock` unwrap(). -fn watch_compute_activity(compute: &Arc) { +fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.clone(); + let connstr = compute.connstr.as_str(); // Define `client` outside of the loop to reuse existing connection if it's active. 
- let mut client = Client::connect(&connstr, NoTls); + let mut client = Client::connect(connstr, NoTls); let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL); info!("watching Postgres activity at {}", connstr); @@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc) { info!("connection to postgres closed, trying to reconnect"); // Connection is closed, reconnect and try again. - client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); continue; } @@ -43,7 +43,7 @@ fn watch_compute_activity(compute: &Arc) { FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid != pg_backend_pid() - AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors? + AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? &[], ); let mut last_active = compute.state.read().unwrap().last_active; @@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc) { debug!("cannot connect to postgres: {}, retrying", e); // Establish a new connection and try again. - client = Client::connect(&connstr, NoTls); + client = Client::connect(connstr, NoTls); } } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 74856eac63..207d09d76b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,3 +1,4 @@ +use std::fmt::Write; use std::fs::File; use std::io::{BufRead, BufReader}; use std::net::{SocketAddr, TcpStream}; @@ -138,9 +139,11 @@ impl Role { // Now we also support SCRAM-SHA-256 and to preserve compatibility // we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256. 
if pass.starts_with("SCRAM-SHA-256") { - params.push_str(&format!(" PASSWORD '{}'", pass)); + write!(params, " PASSWORD '{pass}'") + .expect("String is documented to not to error during write operations"); } else { - params.push_str(&format!(" PASSWORD 'md5{}'", pass)); + write!(params, " PASSWORD 'md5{pass}'") + .expect("String is documented to not to error during write operations"); } } else { params.push_str(" PASSWORD NULL"); @@ -158,7 +161,8 @@ impl Database { /// it may require a proper quoting too. pub fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); - params.push_str(&format!(" OWNER {}", &self.owner.quote())); + write!(params, " OWNER {}", &self.owner.quote()) + .expect("String is documented to not to error during write operations"); params } @@ -244,18 +248,20 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<() bail!("Postgres exited unexpectedly with code {}", code); } - if pid_path.exists() { - let file = BufReader::new(File::open(&pid_path)?); - let status = file - .lines() - .last() - .unwrap() - .unwrap_or_else(|_| "unknown".to_string()); - let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + // Check that we can open pid file first. + if let Ok(file) = File::open(&pid_path) { + let file = BufReader::new(file); + let last_line = file.lines().last(); - // Now Postgres is ready to accept connections - if status.trim() == "ready" && can_connect { - break; + // Pid file could be there and we could read it, but it could be empty, for example. 
+ if let Some(Ok(line)) = last_line { + let status = line.trim(); + let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok(); + + // Now Postgres is ready to accept connections + if status == "ready" && can_connect { + break; + } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e88df56a65..bd47614386 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,9 +2,10 @@ use std::path::Path; use anyhow::Result; use log::{info, log_enabled, warn, Level}; -use postgres::Client; +use postgres::{Client, NoTls}; use serde::Deserialize; +use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -97,18 +98,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Process delta operations first if let Some(ops) = &spec.delta_operations { - info!("processing delta operations on roles"); + info!("processing role renames"); for op in ops { match op.action.as_ref() { - // We do not check either role exists or not, - // Postgres will take care of it for us "delete_role" => { - let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); - - warn!("deleting role '{}'", &op.name); - xact.execute(query.as_str(), &[])?; + // no-op now, roles will be deleted at the end of configuration } - // Renaming role drops its password, since tole name is + // Renaming role drops its password, since role name is // used as a salt there. It is important that this role // is recorded with a new `name` in the `roles` list. // Follow up roles update will set the new password. 
@@ -182,7 +178,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { xact.execute(query.as_str(), &[])?; let grant_query = format!( - "grant pg_read_all_data, pg_write_all_data to {}", + "GRANT pg_read_all_data, pg_write_all_data TO {}", name.quote() ); xact.execute(grant_query.as_str(), &[])?; @@ -197,6 +193,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { Ok(()) } +/// Reassign all dependent objects and delete requested roles. +pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + + // First, reassign all dependent objects to db owners. + if let Some(ops) = &spec.delta_operations { + info!("reassigning dependent objects of to-be-deleted roles"); + for op in ops { + if op.action == "delete_role" { + reassign_owned_objects(node, &op.name)?; + } + } + } + + // Second, proceed with role deletions. + let mut xact = client.transaction()?; + if let Some(ops) = &spec.delta_operations { + info!("processing role deletions"); + for op in ops { + // We do not check either role exists or not, + // Postgres will take care of it for us + if op.action == "delete_role" { + let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote()); + + warn!("deleting role '{}'", &op.name); + xact.execute(query.as_str(), &[])?; + } + } + } + + Ok(()) +} + +// Reassign all owned objects in all databases to the owner of the database. 
+fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { + for db in &node.spec.cluster.databases { + if db.owner != *role_name { + let mut connstr = node.connstr.clone(); + // database name is always the last and the only component of the path + connstr.set_path(&db.name); + + let mut client = Client::connect(connstr.as_str(), NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.quote(), + db.owner.quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, &db.name, &db.owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.quote()); + client.simple_query(&drop_query)?; + } + } + + Ok(()) +} + /// It follows mostly the same logic as `handle_roles()` excepting that we /// does not use an explicit transactions block, since major database operations /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level @@ -289,23 +349,80 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { Ok(()) } -// Grant CREATE ON DATABASE to the database owner -// to allow clients create trusted extensions. -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> { +/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants +/// to allow users creating trusted extensions and re-creating `public` schema, for example. +pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { + let spec = &node.spec; + info!("cluster spec grants:"); + // We now have a separate `web_access` role to connect to the database + // via the web interface and proxy link auth. And also we grant a + // read / write all data privilege to every role. So also grant + // create to everyone. 
+ // XXX: later we should stop messing with Postgres ACL in such horrible + // ways. + let roles = spec + .cluster + .roles + .iter() + .map(|r| r.name.quote()) + .collect::>(); + for db in &spec.cluster.databases { let dbname = &db.name; let query: String = format!( "GRANT CREATE ON DATABASE {} TO {}", dbname.quote(), - db.owner.quote() + roles.join(", ") ); info!("grant query {}", &query); client.execute(query.as_str(), &[])?; } + // Do some per-database access adjustments. We'd better do this at db creation time, + // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants + // atomically. + let mut db_connstr = node.connstr.clone(); + for db in &node.spec.cluster.databases { + // database name is always the last and the only component of the path + db_connstr.set_path(&db.name); + + let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?; + + // This will only change ownership on the schema itself, not the objects + // inside it. Without it owner of the `public` schema will be `cloud_admin` + // and database owner cannot do anything with it. SQL procedure ensures + // that it won't error out if schema `public` doesn't exist. 
+ let alter_query = format!( + "DO $$\n\ + DECLARE\n\ + schema_owner TEXT;\n\ + BEGIN\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + SELECT nspowner::regrole::text\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + INTO schema_owner;\n\ + \n\ + IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\ + THEN\n\ + ALTER SCHEMA public OWNER TO {};\n\ + END IF;\n\ + END IF;\n\ + END\n\ + $$;", + db.owner.quote() + ); + db_client.simple_query(&alter_query)?; + } + Ok(()) } diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 4a1672919c..bdd6e60a69 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "wal_acceptors", + "name": "safekeepers", "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501", "vartype": "string" }, @@ -150,7 +150,7 @@ "vartype": "integer" }, { - "name": "zenith.zenith_tenant", + "name": "neon.tenant_id", "value": "b0554b632bd4d547a63b86c3630317e8", "vartype": "string" }, @@ -160,13 +160,13 @@ "vartype": "integer" }, { - "name": "zenith.zenith_timeline", + "name": "neon.timeline_id", "value": "2414a61ffc94e428f14b5758fe308e13", "vartype": "string" }, { "name": "shared_preload_libraries", - "value": "zenith", + "value": "neon", "vartype": "string" }, { @@ -175,7 +175,7 @@ "vartype": "string" }, { - "name": "zenith.page_server_connstring", + "name": "neon.pageserver_connstring", "value": "host=127.0.0.1 port=6400", "vartype": "string" } diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 33f903f0e1..1f2e188398 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -28,7 +28,7 @@ mod pg_helpers_tests { assert_eq!( spec.cluster.settings.as_pg_settings(), - "fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = 
'127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'" + "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'" ); } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 41417aab9a..21311eea9a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -tar = "0.4.33" +tar = "0.4.38" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } serde = { version = "1.0", features = ["derive"] } serde_with = "1.12.0" diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 350cf74b7c..e78f96074e 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -148,9 +148,9 @@ impl PostgresNode { // Read a few options 
from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; - let uses_wal_proposer = conf.get("wal_acceptors").is_some(); + let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?; + let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?; + let uses_wal_proposer = conf.get("safekeepers").is_some(); // parse recovery_target_lsn, if any let recovery_target_lsn: Option = @@ -231,8 +231,13 @@ impl PostgresNode { .context("page server 'basebackup' command failed")?; // Read the archive directly from the `CopyOutReader` - tar::Archive::new(copyreader) - .unpack(&self.pgdata()) + // + // Set `ignore_zeros` so that unpack() reads all the Copy data and + // doesn't stop at the end-of-archive marker. Otherwise, if the server + // sends an Error after finishing the tarball, we will not notice it. + let mut ar = tar::Archive::new(copyreader); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata()) .context("extracting base backup failed")?; Ok(()) @@ -298,11 +303,11 @@ impl PostgresNode { // uses only needed variables namely host, port, user, password. 
format!("postgresql://no_user:{}@{}:{}", password, host, port) }; - conf.append("shared_preload_libraries", "zenith"); + conf.append("shared_preload_libraries", "neon"); conf.append_line(""); - conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); - conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); + conf.append("neon.pageserver_connstring", &pageserver_connstr); + conf.append("neon.tenant_id", &self.tenant_id.to_string()); + conf.append("neon.timeline_id", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -336,7 +341,7 @@ impl PostgresNode { .map(|sk| format!("localhost:{}", sk.pg_port)) .collect::>() .join(","); - conf.append("wal_acceptors", &safekeepers); + conf.append("safekeepers", &safekeepers); } else { // We only use setup without safekeepers for tests, // and don't care about data durability on pageserver, @@ -347,7 +352,6 @@ impl PostgresNode { // This isn't really a supported configuration, but can be useful for // testing. conf.append("synchronous_standby_names", "pageserver"); - conf.append("zenith.callmemaybe_connstring", &self.connstr()); } let mut file = File::create(self.pgdata().join("postgresql.conf"))?; @@ -494,7 +498,7 @@ impl PostgresNode { "host={} port={} user={} dbname={}", self.address.ip(), self.address.port(), - "zenith_admin", + "cloud_admin", "postgres" ) } diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index df657dd1be..0123d9c491 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -48,6 +48,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { format!("--data-dir={}", etcd_data_dir.display()), format!("--listen-client-urls={client_urls}"), format!("--advertise-client-urls={client_urls}"), + // Set --quota-backend-bytes to keep the etcd virtual memory + // size smaller. 
Our test etcd clusters are very small. + // See https://github.com/etcd-io/etcd/issues/7910 + "--quota-backend-bytes=100000000".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) @@ -73,7 +77,7 @@ pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { let etcd_pid_file_path = etcd_pid_file_path(env); let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| { format!( - "Failed to read etcd pid filea at {}", + "Failed to read etcd pid file at {}", etcd_pid_file_path.display() ) })?); diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index c3469c3350..4dfca588ad 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -49,3 +49,12 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { cmd } } + +fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { + for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { + if let Ok(value) = std::env::var(env_key) { + cmd = cmd.env(env_key, value); + } + } + cmd +} diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index c73af7d338..e0b409f32d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,15 +15,15 @@ use std::process::{Command, Stdio}; use utils::{ auth::{encode_from_key_file, Claims, Scope}, postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use crate::safekeeper::SafekeeperNode; // -// This data structures represents zenith CLI config +// This data structure represents the neon_local CLI config // -// It is deserialized from the .zenith/config file, or the config file passed +// It is deserialized from the .neon/config file, or the config file passed // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // @@ -34,8 +34,8 @@ pub struct LocalEnv { // compute nodes).
// // This is not stored in the config file. Rather, this is the path where the - // config file itself is. It is read from the ZENITH_REPO_DIR env variable or - // '.zenith' if not given. + // config file itself is. It is read from the NEON_REPO_DIR env variable or + // '.neon' if not given. #[serde(skip)] pub base_data_dir: PathBuf, @@ -119,16 +119,24 @@ impl EtcdBroker { } pub fn comma_separated_endpoints(&self) -> String { - self.broker_endpoints.iter().map(Url::as_str).fold( - String::new(), - |mut comma_separated_urls, url| { + self.broker_endpoints + .iter() + .map(|url| { + // URL by default adds a '/' path at the end, which is not what etcd CLI wants. + let url_string = url.as_str(); + if url_string.ends_with('/') { + &url_string[0..url_string.len() - 1] + } else { + url_string + } + }) + .fold(String::new(), |mut comma_separated_urls, url| { if !comma_separated_urls.is_empty() { comma_separated_urls.push(','); } comma_separated_urls.push_str(url); comma_separated_urls - }, - ) + }) } } @@ -136,7 +144,7 @@ impl EtcdBroker { #[serde(default)] pub struct PageServerConf { // node id - pub id: ZNodeId, + pub id: NodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -151,7 +159,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -163,19 +171,25 @@ impl Default for PageServerConf { #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub id: ZNodeId, + pub id: NodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, + pub remote_storage: Option, + pub backup_threads: Option, + pub auth_enabled: bool, } impl Default for SafekeeperConf { fn default() -> Self { Self { - id: ZNodeId(0), + id: NodeId(0), pg_port: 0, http_port: 0, sync: true, + remote_storage: None, + backup_threads: 
None, + auth_enabled: false, } } } @@ -325,7 +339,7 @@ impl LocalEnv { pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { // Currently, the user first passes a config file with 'zenith init --config=' // We read that in, in `create_config`, and fill any missing defaults. Then it's saved - // to .zenith/config. TODO: We lose any formatting and comments along the way, which is + // to .neon/config. TODO: We lose any formatting and comments along the way, which is // a bit sad. let mut conf_content = r#"# This file describes a locale deployment of the page server # and safekeeeper node. It is read by the 'zenith' command-line @@ -377,6 +391,7 @@ impl LocalEnv { base_path != Path::new(""), "repository base path is missing" ); + ensure!( !base_path.exists(), "directory '{}' already exists. Perhaps already initialized?", @@ -388,16 +403,6 @@ impl LocalEnv { self.pg_distrib_dir.display() ); } - for binary in ["pageserver", "safekeeper"] { - if !self.zenith_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{}' in zenith distrib dir '{}'", - binary, - self.zenith_distrib_dir.display() - ); - } - } - for binary in ["pageserver", "safekeeper"] { if !self.zenith_distrib_dir.join(binary).exists() { bail!( @@ -406,12 +411,6 @@ impl LocalEnv { ); } } - if !self.pg_distrib_dir.join("bin/postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_distrib_dir.display() - ); - } fs::create_dir(&base_path)?; @@ -468,9 +467,9 @@ impl LocalEnv { } fn base_path() -> PathBuf { - match std::env::var_os("ZENITH_REPO_DIR") { + match std::env::var_os("NEON_REPO_DIR") { Some(val) => PathBuf::from(val), - None => PathBuf::from(".zenith"), + None => PathBuf::from(".neon"), } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d5b6251209..c90f36d104 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -18,12 +18,12 @@ use thiserror::Error; use utils::{ 
connstring::connection_address, http::error::HttpErrorBody, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; -use crate::{fill_rust_env_vars, read_pidfile}; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; #[derive(Error, Debug)] pub enum SafekeeperHttpError { @@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub id: ZNodeId, + pub id: NodeId, pub conf: SafekeeperConf, @@ -100,7 +100,7 @@ impl SafekeeperNode { .unwrap() } - pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) } @@ -143,6 +143,19 @@ impl SafekeeperNode { if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { cmd.args(&["--broker-etcd-prefix", prefix]); } + if let Some(threads) = self.conf.backup_threads { + cmd.args(&["--backup-threads", threads.to_string().as_ref()]); + } + if let Some(ref remote_storage) = self.conf.remote_storage { + cmd.args(&["--remote-storage", remote_storage]); + } + if self.conf.auth_enabled { + cmd.arg("--auth-validation-public-key-path"); + // PathBuf is better be passed as is, not via `String`. 
+ cmd.arg(self.env.base_data_dir.join("auth_public_key.pem")); + } + + fill_aws_secrets_vars(&mut cmd); if !cmd.status()?.success() { bail!( @@ -286,7 +299,7 @@ impl SafekeeperNode { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result<()> { Ok(self .http_request( diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index 355c7c250d..f1eaa99904 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,6 +1,8 @@ use std::collections::HashMap; -use std::io::Write; +use std::fs::File; +use std::io::{BufReader, Write}; use std::net::TcpStream; +use std::num::NonZeroU64; use std::path::PathBuf; use std::process::Command; use std::time::Duration; @@ -11,6 +13,7 @@ use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest}; +use pageserver::tenant_mgr::TenantInfo; use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; @@ -25,8 +28,7 @@ use utils::{ }; use crate::local_env::LocalEnv; -use crate::{fill_rust_env_vars, read_pidfile}; -use pageserver::tenant_mgr::TenantInfo; +use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile}; #[derive(Error, Debug)] pub enum PageserverHttpError { @@ -37,6 +39,12 @@ pub enum PageserverHttpError { Response(String), } +impl From for PageserverHttpError { + fn from(e: anyhow::Error) -> Self { + Self::Response(e.to_string()) + } +} + type Result = result::Result; pub trait ResponseErrorMessageExt: Sized { @@ -410,6 +418,15 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose()?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + 
max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? .error_from_body()? @@ -433,22 +450,41 @@ impl PageServerNode { tenant_id, checkpoint_distance: settings .get("checkpoint_distance") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'checkpoint_distance' as an integer")?, compaction_target_size: settings .get("compaction_target_size") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_target_size' as an integer")?, compaction_period: settings.get("compaction_period").map(|x| x.to_string()), compaction_threshold: settings .get("compaction_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'compaction_threshold' as an integer")?, gc_horizon: settings .get("gc_horizon") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_horizon' as an integer")?, gc_period: settings.get("gc_period").map(|x| x.to_string()), image_creation_threshold: settings .get("image_creation_threshold") - .map(|x| x.parse::().unwrap()), + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_threshold' as non zero integer")?, pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()), + walreceiver_connect_timeout: settings + .get("walreceiver_connect_timeout") + .map(|x| x.to_string()), + lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()), + max_lsn_wal_lag: settings + .get("max_lsn_wal_lag") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, }) .send()? 
.error_from_body()?; @@ -492,13 +528,54 @@ impl PageServerNode { Ok(timeline_info_response) } -} -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { - for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] { - if let Ok(value) = std::env::var(env_key) { - cmd = cmd.env(env_key, value); + /// Import a basebackup prepared using either: + /// a) `pg_basebackup -F tar`, or + /// b) The `fullbackup` pageserver endpoint + /// + /// # Arguments + /// * `tenant_id` - tenant to import into. Created if not exists + /// * `timeline_id` - id to assign to imported timeline + /// * `base` - (start lsn of basebackup, path to `base.tar` file) + /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`) + pub fn timeline_import( + &self, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + base: (Lsn, PathBuf), + pg_wal: Option<(Lsn, PathBuf)>, + ) -> anyhow::Result<()> { + let mut client = self.pg_connection_config.connect(NoTls).unwrap(); + + // Init base reader + let (start_lsn, base_tarfile_path) = base; + let base_tarfile = File::open(base_tarfile_path)?; + let mut base_reader = BufReader::new(base_tarfile); + + // Init wal reader if necessary + let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { + let wal_tarfile = File::open(wal_tarfile_path)?; + let wal_reader = BufReader::new(wal_tarfile); + (end_lsn, Some(wal_reader)) + } else { + (start_lsn, None) + }; + + // Import base + let import_cmd = + format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut base_reader, &mut writer)?; + writer.finish()?; + + // Import wal if necessary + if let Some(mut wal_reader) = wal_reader { + let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"); + let mut writer = client.copy_in(&import_cmd)?; + io::copy(&mut wal_reader, &mut writer)?; + writer.finish()?; } + + Ok(()) } - cmd } diff --git 
a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000..7585238efe --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +book diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 886363dccc..0000000000 --- a/docs/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# Zenith documentation - -## Table of contents - -- [authentication.md](authentication.md) — pageserver JWT authentication. -- [docker.md](docker.md) — Docker images and building pipeline. -- [glossary.md](glossary.md) — Glossary of all the terms used in codebase. -- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. -- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout. -- [pageserver/README.md](/pageserver/README.md) — pageserver overview. -- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview. -- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview. -- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview. 
-- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md new file mode 100644 index 0000000000..cf29ee3c6a --- /dev/null +++ b/docs/SUMMARY.md @@ -0,0 +1,84 @@ +# Summary + +[Introduction]() +- [Separation of Compute and Storage](./separation-compute-storage.md) + +# Architecture + +- [Compute]() + - [WAL proposer]() + - [WAL Backpressure]() + - [Postgres changes](./core_changes.md) + +- [Pageserver](./pageserver.md) + - [Services](./pageserver-services.md) + - [Thread management](./pageserver-thread-mgmt.md) + - [WAL Redo](./pageserver-walredo.md) + - [Page cache](./pageserver-pagecache.md) + - [Storage](./pageserver-storage.md) + - [Datadir mapping]() + - [Layer files]() + - [Branching]() + - [Garbage collection]() + - [Cloud Storage]() + - [Processing a GetPage request](./pageserver-processing-getpage.md) + - [Processing WAL](./pageserver-processing-wal.md) + - [Management API]() + - [Tenant Rebalancing]() + +- [WAL Service](walservice.md) + - [Consensus protocol](safekeeper-protocol.md) + - [Management API]() + - [Rebalancing]() + +- [Control Plane]() + +- [Proxy]() + +- [Source view](./sourcetree.md) + - [docker.md](./docker.md) — Docker images and building pipeline. + - [Error handling and logging]() + - [Testing]() + - [Unit testing]() + - [Integration testing]() + - [Benchmarks]() + + +- [Glossary](./glossary.md) + +# Uncategorized + +- [authentication.md](./authentication.md) +- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI. 
+- [settings.md](./settings.md) +#FIXME: move these under sourcetree.md +#- [pageserver/README.md](/pageserver/README.md) +#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) +#- [test_runner/README.md](/test_runner/README.md) +#- [safekeeper/README.md](/safekeeper/README.md) + + +# RFCs + +- [RFCs](./rfcs/README.md) + +- [002-storage](rfcs/002-storage.md) +- [003-laptop-cli](rfcs/003-laptop-cli.md) +- [004-durability](rfcs/004-durability.md) +- [005-zenith_local](rfcs/005-zenith_local.md) +- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md) +- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md) +- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md) +- [008-push-pull](rfcs/008-push-pull.md) +- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md) +- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md) +- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md) +- [010-storage_details](rfcs/010-storage_details.md) +- [011-retention-policy](rfcs/011-retention-policy.md) +- [012-background-tasks](rfcs/012-background-tasks.md) +- [013-term-history](rfcs/013-term-history.md) +- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md) +- [014-storage-lsm](rfcs/014-storage-lsm.md) +- [015-storage-messaging](rfcs/015-storage-messaging.md) +- [016-connection-routing](rfcs/016-connection-routing.md) +- [cluster-size-limits](rfcs/cluster-size-limits.md) diff --git a/docs/book.toml b/docs/book.toml new file mode 100644 index 0000000000..f83ac2a6aa --- /dev/null +++ b/docs/book.toml @@ -0,0 +1,5 @@ +[book] +language = "en" +multilingual = false +src = "." 
+title = "Neon architecture" diff --git a/docs/core_changes.md b/docs/core_changes.md index db311e3667..86fdc420f7 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -1,3 +1,12 @@ +# Postgres core changes + +This lists all the changes that have been made to the PostgreSQL +source tree, as a somewhat logical set of patches. The long-term goal +is to eliminate all these changes, by submitting patches to upstream +and refactoring code into extensions, so that you can run unmodified +PostgreSQL against Neon storage. + + 1. Add t_cid to XLOG record - Why? The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax. @@ -188,7 +197,7 @@ Not currently committed but proposed: 3. Prefetching - Why? As far as pages in Zenith are loaded on demand, to reduce node startup time - and also sppedup some massive queries we need some mechanism for bulk loading to + and also speed up some massive queries we need some mechanism for bulk loading to reduce page request round-trip overhead. Currently Postgres is supporting prefetching only for bitmap scan. diff --git a/docs/glossary.md b/docs/glossary.md index a014446010..7aeae27a39 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -2,7 +2,7 @@ ### Authentication -### Backpresssure +### Backpressure Backpressure is used to limit the lag between pageserver and compute node or WAL service. @@ -115,7 +115,7 @@ Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/RE * `CommitLSN`: position in WAL confirmed by quorum safekeepers. * `RestartLSN`: position in WAL confirmed by all safekeepers. * `FlushLSN`: part of WAL persisted to the disk by safekeeper. -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
Neon pageserver LSNs: * `last_record_lsn` - the end of last processed WAL record. diff --git a/docs/multitenancy.md b/docs/multitenancy.md index 4f1d45e970..c697ae93cd 100644 --- a/docs/multitenancy.md +++ b/docs/multitenancy.md @@ -6,7 +6,7 @@ Zenith supports multitenancy. One pageserver can serve multiple tenants at once. ### Tenants in other commands -By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants. +By default during `zenith init` a new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. The CLI supports creation of new tenants. Examples for cli: diff --git a/docs/pageserver-page-service.md b/docs/pageserver-page-service.md new file mode 100644 index 0000000000..cea9e5a637 --- /dev/null +++ b/docs/pageserver-page-service.md @@ -0,0 +1,9 @@ +# Page Service + +The Page Service listens for GetPage@LSN requests from the Compute Nodes, +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository. + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance.
diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md new file mode 100644 index 0000000000..d9b120bbb9 --- /dev/null +++ b/docs/pageserver-pagecache.md @@ -0,0 +1,8 @@ +# Page cache + +TODO: + +- shared across tenants +- store pages from layer files +- store pages from "in-memory layer" +- store materialized pages diff --git a/docs/pageserver-processing-getpage.md b/docs/pageserver-processing-getpage.md new file mode 100644 index 0000000000..be99ab82d4 --- /dev/null +++ b/docs/pageserver-processing-getpage.md @@ -0,0 +1,4 @@ +# Processing a GetPage request + +TODO: +- sequence diagram that shows how a GetPage@LSN request is processed diff --git a/docs/pageserver-processing-wal.md b/docs/pageserver-processing-wal.md new file mode 100644 index 0000000000..f8c43b6085 --- /dev/null +++ b/docs/pageserver-processing-wal.md @@ -0,0 +1,5 @@ +# Processing WAL + +TODO: +- diagram that shows how incoming WAL is processed +- explain durability, what is fsync'd when, disk_consistent_lsn diff --git a/pageserver/README.md b/docs/pageserver-services.md similarity index 74% rename from pageserver/README.md rename to docs/pageserver-services.md index cf841d1e46..4e85413513 100644 --- a/pageserver/README.md +++ b/docs/pageserver-services.md @@ -1,15 +1,4 @@ -## Page server architecture - -The Page Server has a few different duties: - -- Respond to GetPage@LSN requests from the Compute Nodes -- Receive WAL from WAL safekeeper -- Replay WAL that's applicable to the chunks that the Page Server maintains -- Backup to S3 - -S3 is the main fault-tolerant storage of all data, as there are no Page Server -replicas. We use a separate fault-tolerant WAL service to reduce latency. It -keeps track of WAL records which are not synced to S3 yet. +# Services The Page Server consists of multiple threads that operate on a shared repository of page versions: @@ -21,18 +10,22 @@ repository of page versions: | WAL receiver | | | +--------------+ - +----+ - +---------+ .......... 
| | - | | . . | | - GetPage@LSN | | . backup . -------> | S3 | --------------> | Page | repository . . | | - | Service | .......... | | - page | | +----+ + ...... + +---------+ +--------+ . . + | | | | . . + GetPage@LSN | | | backup | -------> . S3 . +-------------> | Page | repository | | . . + | Service | +--------+ . . + page | | ...... <------------- | | - +---------+ +--------------------+ - | Checkpointing / | - | Garbage collection | - +--------------------+ + +---------+ +-----------+ +--------------------+ + | WAL redo | | Checkpointing, | + +----------+ | processes | | Garbage collection | + | | +-----------+ +--------------------+ + | HTTP | + | mgmt API | + | | + +----------+ Legend: @@ -40,83 +33,29 @@ Legend: | | A thread or multi-threaded service +--+ -.... -. . Component at its early development phase. -.... - ---> Data flow <--- ``` -Page Service ------------- +## Page Service The Page Service listens for GetPage@LSN requests from the Compute Nodes, -and responds with pages from the repository. +and responds with pages from the repository. On each GetPage@LSN request, +it calls into the Repository function + +A separate thread is spawned for each incoming connection to the page +service. The page service uses the libpq protocol to communicate with +the client. The client is a Compute Postgres instance. + +## WAL Receiver + +The WAL receiver connects to the external WAL safekeeping service +using PostgreSQL physical streaming replication, and continuously +receives WAL. It decodes the WAL records, and stores them to the +repository. -WAL Receiver ------------- - -The WAL receiver connects to the external WAL safekeeping service (or -directly to the primary) using PostgreSQL physical streaming -replication, and continuously receives WAL. It decodes the WAL records, -and stores them to the repository. - - -Repository ----------- - -The repository stores all the page versions, or WAL records needed to -reconstruct them. 
Each tenant has a separate Repository, which is -stored in the .zenith/tenants/ directory. - -Repository is an abstract trait, defined in `repository.rs`. It is -implemented by the LayeredRepository object in -`layered_repository.rs`. There is only that one implementation of the -Repository trait, but it's still a useful abstraction that keeps the -interface for the low-level storage functionality clean. The layered -storage format is described in layered_repository/README.md. - -Each repository consists of multiple Timelines. Timeline is a -workhorse that accepts page changes from the WAL, and serves -get_page_at_lsn() and get_rel_size() requests. Note: this has nothing -to do with PostgreSQL WAL timeline. The term "timeline" is mostly -interchangeable with "branch", there is a one-to-one mapping from -branch to timeline. A timeline has a unique ID within the tenant, -represented as 16-byte hex string that never changes, whereas a -branch is a user-given name for a timeline. - -Each repository also has a WAL redo manager associated with it, see -`walredo.rs`. The WAL redo manager is used to replay PostgreSQL WAL -records, whenever we need to reconstruct a page version from WAL to -satisfy a GetPage@LSN request, or to avoid accumulating too much WAL -for a page. The WAL redo manager uses a Postgres process running in -special zenith wal-redo mode to do the actual WAL redo, and -communicates with the process using a pipe. - - -Checkpointing / Garbage Collection ----------------------------------- - -Periodically, the checkpointer thread wakes up and performs housekeeping -duties on the repository. It has two duties: - -### Checkpointing - -Flush WAL that has accumulated in memory to disk, so that the old WAL -can be truncated away in the WAL safekeepers. Also, to free up memory -for receiving new WAL. This process is called "checkpointing". 
It's -similar to checkpointing in PostgreSQL or other DBMSs, but in the page -server, checkpointing happens on a per-segment basis. - -### Garbage collection - -Remove old on-disk layer files that are no longer needed according to the -PITR retention policy - - -### Backup service +## Backup service The backup service, responsible for storing pageserver recovery data externally. @@ -159,6 +98,67 @@ prefix_in_bucket = '/test_prefix/' `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. + +## Repository background tasks + +The Repository also has a few different background threads and tokio tasks that perform +background duties like dumping accumulated WAL data from memory to disk, reorganizing +files for performance (compaction), and garbage collecting old files. + + +Repository +---------- + +The repository stores all the page versions, or WAL records needed to +reconstruct them. Each tenant has a separate Repository, which is +stored in the .neon/tenants/ directory. + +Repository is an abstract trait, defined in `repository.rs`. It is +implemented by the LayeredRepository object in +`layered_repository.rs`. There is only that one implementation of the +Repository trait, but it's still a useful abstraction that keeps the +interface for the low-level storage functionality clean. The layered +storage format is described in layered_repository/README.md. + +Each repository consists of multiple Timelines. Timeline is a +workhorse that accepts page changes from the WAL, and serves +get_page_at_lsn() and get_rel_size() requests. Note: this has nothing +to do with PostgreSQL WAL timeline. The term "timeline" is mostly +interchangeable with "branch", there is a one-to-one mapping from +branch to timeline. A timeline has a unique ID within the tenant, +represented as 16-byte hex string that never changes, whereas a +branch is a user-given name for a timeline. 
+ +Each repository also has a WAL redo manager associated with it, see +`walredo.rs`. The WAL redo manager is used to replay PostgreSQL WAL +records, whenever we need to reconstruct a page version from WAL to +satisfy a GetPage@LSN request, or to avoid accumulating too much WAL +for a page. The WAL redo manager uses a Postgres process running in +special Neon wal-redo mode to do the actual WAL redo, and +communicates with the process using a pipe. + + +Checkpointing / Garbage Collection +---------------------------------- + +Periodically, the checkpointer thread wakes up and performs housekeeping +duties on the repository. It has two duties: + +### Checkpointing + +Flush WAL that has accumulated in memory to disk, so that the old WAL +can be truncated away in the WAL safekeepers. Also, to free up memory +for receiving new WAL. This process is called "checkpointing". It's +similar to checkpointing in PostgreSQL or other DBMSs, but in the page +server, checkpointing happens on a per-segment basis. + +### Garbage collection + +Remove old on-disk layer files that are no longer needed according to the +PITR retention policy + + + TODO: Sharding -------------------- diff --git a/pageserver/src/layered_repository/README.md b/docs/pageserver-storage.md similarity index 98% rename from pageserver/src/layered_repository/README.md rename to docs/pageserver-storage.md index 70c571a507..8d03e68ac7 100644 --- a/pageserver/src/layered_repository/README.md +++ b/docs/pageserver-storage.md @@ -1,4 +1,4 @@ -# Overview +# Pageserver storage The main responsibility of the Page Server is to process the incoming WAL, and reprocess it into a format that allows reasonably quick access to any page @@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and a range of LSNs (or a single LSN, in case of image layers). You can think of it as a rectangle in the two-dimensional key-LSN space. 
The layer files for each timeline are stored in the timeline's subdirectory under -`.zenith/tenants//timelines`. +`.neon/tenants//timelines`. There are two kind of layer files: images, and delta layers. An image file contains a snapshot of all keys at a particular LSN, whereas a delta file @@ -178,7 +178,7 @@ version, and how branching and GC works is still valid. The full path of a delta file looks like this: ``` - .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 + .neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000 ``` For simplicity, the examples below use a simplified notation for the @@ -260,7 +260,7 @@ Whenever a GetPage@LSN request comes in from the compute node, the page server needs to reconstruct the requested page, as it was at the requested LSN. To do that, the page server first checks the recent in-memory layer; if the requested page version is found there, it can -be returned immediatedly without looking at the files on +be returned immediately without looking at the files on disk. Otherwise the page server needs to locate the layer file that contains the requested page version. @@ -409,7 +409,7 @@ removed because there is no newer layer file for the table. Things get slightly more complicated with multiple branches. All of the above still holds, but in addition to recent files we must also -retain older shapshot files that are still needed by child branches. +retain older snapshot files that are still needed by child branches. 
For example, if child branch is created at LSN 150, and the 'customers' table is updated on the branch, you would have these files: diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md new file mode 100644 index 0000000000..9ee3e40085 --- /dev/null +++ b/docs/pageserver-thread-mgmt.md @@ -0,0 +1,26 @@ +## Thread management + +Each thread in the system is tracked by the `thread_mgr` module. It +maintains a registry of threads, and which tenant or timeline they are +operating on. This is used for safe shutdown of a tenant, or the whole +system. + +### Handling shutdown + +When a tenant or timeline is deleted, we need to shut down all threads +operating on it, before deleting the data on disk. A thread registered +in the thread registry can check if it has been requested to shut down, +by calling `is_shutdown_requested()`. For async operations, there's also +a `shutdown_watcher()` async task that can be used to wake up on shutdown. + +### Sync vs async + +The primary programming model in the page server is synchronous, +blocking code. However, there are some places where async code is +used. Be very careful when mixing sync and async code. + +Async is primarily used to wait for incoming data on network +connections. For example, all WAL receivers have a shared thread pool, +with one async Task for each connection. Once a piece of WAL has been +received from the network, the thread calls the blocking functions in +the Repository to process the WAL. diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md new file mode 100644 index 0000000000..1de9c177cc --- /dev/null +++ b/docs/pageserver-walredo.md @@ -0,0 +1,77 @@ +# WAL Redo + +To reconstruct a particular page version from an image of the page and +some WAL records, the pageserver needs to replay the WAL records. This +happens on-demand, when a GetPage@LSN request comes in, or as part of +background jobs that reorganize data for faster access.
+ +It's important that data cannot leak from one tenant to another, and +that a corrupt WAL record on one timeline doesn't affect other tenants +or timelines. + +## Multi-tenant security + +If you have direct access to the WAL directory, or if you have +superuser access to a running PostgreSQL server, it's easy to +construct a malicious or corrupt WAL record that causes the WAL redo +functions to crash, or to execute arbitrary code. That is not a +security problem for PostgreSQL; if you have superuser access, you +have full access to the system anyway. + +The Neon pageserver, however, is multi-tenant. It needs to execute WAL +belonging to different tenants in the same system, and malicious WAL +in one tenant must not affect other tenants. + +A separate WAL redo process is launched for each tenant, and the +process uses the seccomp(2) system call to restrict its access to the +bare minimum needed to replay WAL records. The process does not have +access to the filesystem or network. It can only communicate with the +parent pageserver process through a pipe. + +If an attacker creates a malicious WAL record and injects it into the +WAL stream of a timeline, he can take control of the WAL redo process +in the pageserver. However, the WAL redo process cannot access the +rest of the system. And because there is a separate WAL redo process +for each tenant, the hijacked WAL redo process can only see WAL and +data belonging to the same tenant, which the attacker would have +access to anyway. + +## WAL-redo process communication + +The WAL redo process runs the 'postgres' executable, launched with a +Neon-specific command-line option to put it into WAL-redo process +mode. The pageserver controls the lifetime of the WAL redo processes, +launching them as needed. If a tenant is detached from the pageserver, +any WAL redo processes for that tenant are killed. + +The pageserver communicates with each WAL redo process over its +stdin/stdout/stderr. 
It works in a request-response model with a simple +custom protocol, described in walredo.rs. To replay a set of WAL +records for a page, the pageserver sends the "before" image of the +page and the WAL records over 'stdin', followed by a command to +perform the replay. The WAL redo process responds with an "after" +image of the page. + +## Special handling of some records + +Some WAL record types are handled directly in the pageserver, by +bespoke Rust code, and are not sent over to the WAL redo process. +This includes SLRU-related WAL records, like commit records. SLRUs +don't use the standard Postgres buffer manager, so dealing with them +in the Neon WAL redo mode would require quite a few changes to +Postgres code and special handling in the protocol anyway. + +Some record types that include a full-page-image (e.g. XLOG_FPI) are +also handled specially when incoming WAL is processed already, and are +stored as page images rather than WAL records. + + +## Records that modify multiple pages + +Some Postgres WAL records modify multiple pages. Such WAL records are +duplicated, so that a copy is stored for each affected page. This is +somewhat wasteful, but because most WAL records only affect one page, +the overhead is acceptable. + +The WAL redo always happens for one particular page. If the WAL record +contains changes to other pages, they are ignored. diff --git a/docs/pageserver.md b/docs/pageserver.md new file mode 100644 index 0000000000..ee70032396 --- /dev/null +++ b/docs/pageserver.md @@ -0,0 +1,11 @@ +# Page server architecture + +The Page Server has a few different duties: + +- Respond to GetPage@LSN requests from the Compute Nodes +- Receive WAL from WAL safekeeper, and store it +- Upload data to S3 to make it durable, download files from S3 as needed + +S3 is the main fault-tolerant storage of all data, as there are no Page Server +replicas. We use a separate fault-tolerant WAL service to reduce latency.
It +keeps track of WAL records which are not synced to S3 yet. diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index 5cac377272..f99683cf09 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -77,7 +77,7 @@ Upon storage node restart recent WAL files are applied to appropriate pages and ### **Checkpointing** -No such mechanism is needed. Or we may look at the storage node as at kind of continuous chekpointer. +No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer. ### **Full page writes (torn page protection)** @@ -111,13 +111,13 @@ Since we are storing page diffs of variable sizes there is no structural depende ### **Chunk metadata** -Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunck should always consult this data when merging SSTables and applying delete markers. +Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers. ### **Chunk splitting** *(NB: following paragraph is about how to avoid page splitting)* -When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global matadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: +When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following: 1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries. 
@@ -166,7 +166,7 @@ Multi-tenant storage makes sense even on a laptop, when you work with different Few databases are stored in one chunk, replicated three times -- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we alway may manually move chunks around the cluster. +- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster. Screenshot_2021-02-22_at_16 49 10 diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 4d1f0a68f0..1a549c2df5 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -123,7 +123,7 @@ Show currently attached storages. For example: > zenith storage list NAME USED TYPE OPTIONS PATH local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr +local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr zcloud 60G zenith-remote zenith.tech/stas/mystore s3tank 80G S3 ``` @@ -136,9 +136,9 @@ s3tank 80G S3 ## pg -Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself. +Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves. -Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together. +Pg is a term for a single postgres running on some data. 
I'm trying to avoid separation of datadir management and postgres instance management -- both of those concepts are bundled together here. **zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index 7b078e9ec0..e36d0a9ae3 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -31,7 +31,7 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way. +- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index a04536922a..84dc932211 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -25,9 +25,9 @@ To make changes in the catalog you need to run compute nodes zenith start /home/pipedpiper/northwind:main -- starts a compute instance zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start anothe compute instance (on different port) +zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you
can run -- zenith status and see how there are two WAL streams one on top of diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index ee4e432182..e6e6e172ad 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -121,7 +121,7 @@ repository, launch an instance on the same branch in both clones, and later try to push/pull between them? Perhaps create a new timeline every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept -that we have in the WAL safekeepr +that we have in the WAL safekeeper ### zenith checkout/commit diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 11ded3a724..0acbd68f86 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,9 +2,9 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith. +Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith. -So here is an attemt to design consistent CLI for diferent usage scenarios: +So here is an attempt to design a consistent CLI for different usage scenarios: #### 1. Start empty pageserver. That is what we have now.
diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md index 801613e2c9..29d3614d34 100644 --- a/docs/rfcs/009-snapshot-first-storage-pitr.md +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -3,7 +3,7 @@ GetPage@LSN can be called with older LSNs, and the page server needs to be able to reconstruct older page versions. That's needed for having read-only replicas that lag behind the primary, or that are -"anchored" at an older LSN, and internally in the page server whne you +"anchored" at an older LSN, and internally in the page server when you branch at an older point in time. How do you do that? For now, I'm not considering incremental snapshots at all. I don't @@ -192,7 +192,7 @@ for a particular relation readily available alongside the snapshot files, and you don't need to track what snapshot LSNs exist separately. -(If we wanted to minize the number of files, you could include the +(If we wanted to minimize the number of files, you could include the snapshot @300 and the WAL between 200 and 300 in the same file, but I feel it's probably better to keep them separate) diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md index aeef54898a..75ed490f21 100644 --- a/docs/rfcs/009-snapshot-first-storage.md +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -121,7 +121,7 @@ The properties of s3 that we depend on are: list objects streaming read of entire object read byte range from object -streaming write new object (may use multipart upload for better relialibity) +streaming write new object (may use multipart upload for better reliability) delete object (that should not disrupt an already-started read). Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. 
Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully. diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md index 8429a2d9e3..bc79924e7b 100644 --- a/docs/rfcs/010-storage_details.md +++ b/docs/rfcs/010-storage_details.md @@ -40,7 +40,7 @@ b) overwrite older pages with the newer pages -- if there is no replica we proba I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete. -With option b) we can also treat PageStor as an uncompleted increamental snapshot. +With option b) we can also treat PageStore as an uncompleted incremental snapshot. ### LocalStore @@ -123,7 +123,7 @@ As far as I understand Bookfile/Aversion addresses versioning and serialization As for exact data that should go to snapshots I think it is the following for each snapshot: * format version number -* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknow key are present. If we add something backward compatible to the file we can keep the version number. +* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file we can keep the version number.
* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile * array of [(BuffTag, LSN), corresponding offset in file] for the WAL records * pages, one by one @@ -131,7 +131,7 @@ As for exact data that should go to snapshots I think it is the following for ea It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))). -1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when realtion_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset delatas would be small). +1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small). 2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor. 
I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 0c359028ed..59833526c5 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -7,13 +7,13 @@ and e.g. prevents electing two proposers with the same term -- it is actually called `term` in the code. The second, called `epoch`, reflects progress of log receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly -correspones to proposed in +corresponds to proposed in https://github.com/zenithdb/rfcs/pull/3/files This makes our biggest our difference from Raft. In Raft, every log record is -stamped with term in which it was generated; while we essentialy store in +stamped with term in which it was generated; while we essentially store in `epoch` only the term of the highest record on this safekeeper -- when we know it -- because during recovery generally we don't, and `epoch` is bumped directly to the term of the proposer who performs the recovery when it is finished. It is diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index 47bc9eb89c..a415b90459 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -124,7 +124,7 @@ Each storage node can subscribe to the relevant sets of keys and maintain a loca ### Safekeeper address discovery -During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful. 
+During the startup safekeeper should publish the address it is listening on as part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful. ### Safekeeper behavior @@ -195,7 +195,7 @@ sequenceDiagram PS1->>SK1: start replication ``` -#### Behavour of services during typical operations +#### Behaviour of services during typical operations ```mermaid sequenceDiagram @@ -250,7 +250,7 @@ sequenceDiagram PS2->>M: Register downloaded timeline PS2->>M: Get safekeepers for timeline, subscribe to changes PS2->>SK1: Start replication to catch up - note over O: PS2 catched up, time to switch compute + note over O: PS2 caught up, time to switch compute O->>C: Restart compute with new pageserver url in config note over C: Wal push is restarted loop request pages diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index fdf6885929..f7b0b3a587 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -49,7 +49,7 @@ topics. RFC lifecycle: -- Should be submitted in a pull request with and full RFC text in a commited markdown file and copy of the Summary and Motivation sections also included in the PR body. +- Should be submitted in a pull request with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections also included in the PR body. - RFC should be published for review before most of the actual code is written. This isn’t a strict rule, don’t hesitate to experiment and build a POC in parallel with writing an RFC. - Add labels to the PR in the same manner as you do Issues. Example TBD - Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code.
diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/cluster-size-limits.md index 4696f2c7f0..bd4cb9ef32 100644 --- a/docs/rfcs/cluster-size-limits.md +++ b/docs/rfcs/cluster-size-limits.md @@ -22,8 +22,8 @@ so we don't want to give users access to the functionality that we don't think i * pageserver - calculate the size consumed by a timeline and add it to the feedback message. * safekeeper - pass feedback message from pageserver to compute. -* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`. -* console - set and update `zenith.max_cluster_size` setting +* compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`. +* console - set and update `neon.max_cluster_size` setting ## Proposed implementation @@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver. Alternatively, we could count only relation data. As in pg_database_size(). This approach is somewhat more user-friendly because it is the data that is really affected by the user. On the other hand, it puts us in a weaker position than other services, i.e., RDS. -We will need to refactor the timeline_size counter or add another counter to implement it. +We will need to refactor the timeline_size counter or add another counter to implement it. Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment. Then this size should be reported to compute node. -`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.` +`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.` (PR about protocol changes https://github.com/zenithdb/zenith/pull/1037). 
@@ -49,7 +49,7 @@ This message is received by the safekeeper and propagated to compute node as a p Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable. -And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. +And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so. (see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html)) TODO: @@ -64,16 +64,16 @@ We should warn users if the limit is soon to be reached. ### **Reliability, failure modes and corner cases** 1. `current_timeline_size` is valid at the last received and digested by pageserver lsn. - + If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time. - + So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this? - + Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue. ### **Security implications** We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM. -Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check. +Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check. To cover this case, we also monitor the compute node size in the console. 
diff --git a/safekeeper/README_PROTO.md b/docs/safekeeper-protocol.md similarity index 98% rename from safekeeper/README_PROTO.md rename to docs/safekeeper-protocol.md index 6b2ae50254..a2d4fa455d 100644 --- a/safekeeper/README_PROTO.md +++ b/docs/safekeeper-protocol.md @@ -143,7 +143,7 @@ Restart of PostgreSQL initiates new round of voting and switching new epoch. ## Limitations Right now message queue is maintained in main memory and is not spilled to the disk. It can cause memory overflow in case of presence of lagging safekeepers. -It is assumed that in case of loosing local data by some safekeepers, it should be recovered using some external mechanism. +It is assumed that in case of losing local data by some safekeepers, it should be recovered using some external mechanism. ## Glossary @@ -152,8 +152,8 @@ It is assumed that in case of loosing local data by some safekeepers, it should * `FlushLSN`: part of WAL persisted to the disk by safekeeper. * `NodeID`: pair (term,UUID) * `Pager`: Neon component restoring pages from WAL stream -* `Replica`: read-only computatio node -* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records. +* `Replica`: read-only computation node +* `VCL`: the largest LSN for which we can guarantee availability of all prior records. 
## Algorithm diff --git a/docs/separation-compute-storage.md b/docs/separation-compute-storage.md new file mode 100644 index 0000000000..f07fa8b6dc --- /dev/null +++ b/docs/separation-compute-storage.md @@ -0,0 +1,8 @@ +# Separation of Compute and Storage + +TODO: + +- Read path +- Write path +- Durability model +- API auth diff --git a/docs/settings.md b/docs/settings.md index 9564ef626f..f2aaab75a8 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -23,7 +23,7 @@ gc_horizon = '67108864' max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zenith_admin' +initial_superuser_name = 'cloud_admin' broker_etcd_prefix = 'neon' broker_endpoints = ['some://etcd'] @@ -31,14 +31,14 @@ broker_endpoints = ['some://etcd'] # [remote_storage] ``` -The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. 
Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and -- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'` +- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'cloud_admin'` - or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}` @@ -54,7 +54,7 @@ Note that TOML distinguishes between strings and integers, the former require si A list of endpoints (etcd currently) to connect and pull the information from. Mandatory, does not have a default, since requires etcd to be started as a separate process, -and its connection url should be specified separately. +and its connection url should be specified separately. #### broker_etcd_prefix @@ -105,17 +105,31 @@ Interval at which garbage collection is triggered. Default is 100 s. #### image_creation_threshold -L0 delta layer threshold for L1 iamge layer creation. Default is 3. +L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval WAL retention duration for PITR branching. Default is 30 days. +#### walreceiver_connect_timeout + +Time to wait to establish the wal receiver connection before failing + +#### lagging_wal_timeout + +Time the pageserver did not get any WAL updates from safekeeper (if any). +Avoids lagging pageserver preemptively by forcing to switch it from stalled connections. + +#### max_lsn_wal_lag + +Difference between Lsn values of the latest available WAL on safekeepers: if currently connected safekeeper starts to lag too long and too much, +it gets swapped to the different one. + #### initial_superuser_name Name of the initial superuser role, passed to initdb when a new tenant is initialized. It doesn't affect anything after initialization. 
The -default is Note: The default is 'zenith_admin', and the console +default is Note: The default is 'cloud_admin', and the console depends on that, so if you change it, bad things will happen. #### page_cache_size @@ -140,7 +154,7 @@ The default distrib dir is `./tmp_install/`. #### workdir (-D) A directory in the file system, where pageserver will store its files. -The default is `./.zenith/`. +The default is `./.neon/`. This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way. @@ -185,7 +199,7 @@ If no IAM bucket access is used during the remote storage usage, use the `AWS_AC ###### General remote storage configuration -Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. +Pageserver allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used. No default values are used for the remote storage configuration parameters. Besides, there are parameters common for all types of remote storage that can be configured, those have defaults: diff --git a/docs/sourcetree.md b/docs/sourcetree.md index c8d4baff62..05eaa96938 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -10,7 +10,7 @@ Intended to be used in integration tests and in CLI tools for local installation `/docs`: -Documentaion of the Zenith features and concepts. +Documentation of the Zenith features and concepts. Now it is mostly dev documentation. `/monitoring`: @@ -42,13 +42,13 @@ Integration tests, written in Python using the `pytest` framework. `/vendor/postgres`: -PostgreSQL source tree, with the modifications needed for Zenith. +PostgreSQL source tree, with the modifications needed for Neon. -`/vendor/postgres/contrib/zenith`: +`/vendor/postgres/contrib/neon`: PostgreSQL extension that implements storage manager API and network communications with remote page server. 
-`/vendor/postgres/contrib/zenith_test_utils`: +`/vendor/postgres/contrib/neon_test_utils`: PostgreSQL extension that contains functions needed for testing and debugging. @@ -92,7 +92,7 @@ A single virtual environment with all dependencies is described in the single `P ### Prerequisites - Install Python 3.9 (the minimal supported version) or greater. - - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected. + - Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesn't work as expected. - If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.: ```bash # In Ubuntu diff --git a/safekeeper/README.md b/docs/walservice.md similarity index 98% rename from safekeeper/README.md rename to docs/walservice.md index a4bb260932..7b217ddbec 100644 --- a/safekeeper/README.md +++ b/docs/walservice.md @@ -75,7 +75,7 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only one primary node can be actively streaming WAL to the quorum of safekeepers. -See README_PROTO.md for a more detailed desription of the consensus +See README_PROTO.md for a more detailed description of the consensus protocol. spec/ contains TLA+ specification of it. 
# Q&A diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml index 65bd406131..49be7ad207 100644 --- a/libs/etcd_broker/Cargo.toml +++ b/libs/etcd_broker/Cargo.toml @@ -9,6 +9,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_with = "1.12.0" + once_cell = "1.8.0" utils = { path = "../utils" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs index 76181f9ba1..8f698977a9 100644 --- a/libs/etcd_broker/src/lib.rs +++ b/libs/etcd_broker/src/lib.rs @@ -1,348 +1,209 @@ //! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). //! Intended to connect services to each other, not to store their data. -use std::{ - collections::{hash_map, HashMap}, - fmt::Display, - str::FromStr, -}; -use regex::{Captures, Regex}; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; +/// All broker keys, that are used when dealing with etcd. +pub mod subscription_key; +/// All broker values, possible to use when dealing with etcd. +pub mod subscription_value; -pub use etcd_client::*; +use std::str::FromStr; +use serde::de::DeserializeOwned; + +use subscription_key::SubscriptionKey; use tokio::{sync::mpsc, task::JoinHandle}; use tracing::*; -use utils::{ - lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId}, -}; + +use crate::subscription_key::SubscriptionFullKey; + +pub use etcd_client::*; /// Default value to use for prefixing to all etcd keys with. /// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; -#[derive(Debug, Deserialize, Serialize)] -struct SafekeeperTimeline { - safekeeper_id: ZNodeId, - info: SkTimelineInfo, +/// A way to control the data retrieval from a certain subscription. +pub struct BrokerSubscription { + /// An unbounded channel to fetch the relevant etcd updates from. 
+ pub value_updates: mpsc::UnboundedReceiver>, + key: SubscriptionKey, + /// A subscription task handle, to allow waiting on it for the task to complete. + /// Both the updates channel and the handle require `&mut`, so it's better to keep + /// both `pub` to allow using both in the same structures without borrow checker complaining. + pub watcher_handle: JoinHandle>, + watcher: Watcher, } -/// Published data about safekeeper's timeline. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Deserialize, Serialize)] -pub struct SkTimelineInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. - #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper offloaded WAL to s3. - #[serde_as(as = "Option")] - #[serde(default)] - pub s3_wal_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. - #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, - #[serde(default)] - pub safekeeper_connection_string: Option, +impl BrokerSubscription { + /// Cancels the subscription, stopping the data poller and waiting for it to shut down. 
+ pub async fn cancel(mut self) -> Result<(), BrokerError> { + self.watcher.cancel().await.map_err(|e| { + BrokerError::EtcdClient( + e, + format!("Failed to cancel broker subscription, kind: {:?}", self.key), + ) + })?; + match (&mut self.watcher_handle).await { + Ok(res) => res, + Err(e) => { + if e.is_cancelled() { + // don't error on the tasks that are cancelled already + Ok(()) + } else { + Err(BrokerError::InternalError(format!( + "Panicked during broker subscription task, kind: {:?}, error: {e}", + self.key + ))) + } + } + } + } +} + +impl Drop for BrokerSubscription { + fn drop(&mut self) { + // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped, + // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task. + self.watcher_handle.abort(); + } +} + +/// An update from the etcd broker. +pub struct BrokerUpdate { + /// Etcd generation version, the bigger the more actual the data is. + pub etcd_version: i64, + /// Etcd key for the corresponding value, parsed from the broker KV. + pub key: SubscriptionFullKey, + /// Current etcd value, parsed from the broker KV. + pub value: V, } #[derive(Debug, thiserror::Error)] pub enum BrokerError { #[error("Etcd client error: {0}. Context: {1}")] EtcdClient(etcd_client::Error, String), - #[error("Error during parsing etcd data: {0}")] - ParsingError(String), + #[error("Error during parsing etcd key: {0}")] + KeyNotParsed(String), #[error("Internal error: {0}")] InternalError(String), } -/// A way to control the data retrieval from a certain subscription. -pub struct SkTimelineSubscription { - safekeeper_timeline_updates: - mpsc::UnboundedReceiver>>, - kind: SkTimelineSubscriptionKind, - watcher_handle: JoinHandle>, - watcher: Watcher, -} - -impl SkTimelineSubscription { - /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet. 
- pub async fn fetch_data( - &mut self, - ) -> Option>> { - self.safekeeper_timeline_updates.recv().await - } - - /// Cancels the subscription, stopping the data poller and waiting for it to shut down. - pub async fn cancel(mut self) -> Result<(), BrokerError> { - self.watcher.cancel().await.map_err(|e| { - BrokerError::EtcdClient( - e, - format!( - "Failed to cancel timeline subscription, kind: {:?}", - self.kind - ), - ) - })?; - self.watcher_handle.await.map_err(|e| { - BrokerError::InternalError(format!( - "Failed to join the timeline updates task, kind: {:?}, error: {e}", - self.kind - )) - })? - } -} - -/// The subscription kind to the timeline updates from safekeeper. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SkTimelineSubscriptionKind { - broker_etcd_prefix: String, - kind: SubscriptionKind, -} - -impl SkTimelineSubscriptionKind { - pub fn all(broker_etcd_prefix: String) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::All, - } - } - - pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::Tenant(tenant), - } - } - - pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self { - Self { - broker_etcd_prefix, - kind: SubscriptionKind::Timeline(timeline), - } - } - - fn watch_regex(&self) -> Regex { - match self.kind { - SubscriptionKind::All => Regex::new(&format!( - r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_etcd_prefix - )) - .expect("wrong regex for 'everything' subscription"), - SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!( - r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$", - self.broker_etcd_prefix - )) - .expect("wrong regex for 'tenant' subscription"), - SubscriptionKind::Timeline(ZTenantTimelineId { - tenant_id, - timeline_id, - }) => Regex::new(&format!( - r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$", - self.broker_etcd_prefix - )) - 
.expect("wrong regex for 'timeline' subscription"), - } - } - - /// Etcd key to use for watching a certain timeline updates from safekeepers. - pub fn watch_key(&self) -> String { - match self.kind { - SubscriptionKind::All => self.broker_etcd_prefix.to_string(), - SubscriptionKind::Tenant(tenant_id) => { - format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix) - } - SubscriptionKind::Timeline(ZTenantTimelineId { - tenant_id, - timeline_id, - }) => format!( - "{}/{tenant_id}/{timeline_id}/safekeeper", - self.broker_etcd_prefix - ), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -enum SubscriptionKind { - /// Get every timeline update. - All, - /// Get certain tenant timelines' updates. - Tenant(ZTenantId), - /// Get certain timeline updates. - Timeline(ZTenantTimelineId), -} - /// Creates a background task to poll etcd for timeline updates from safekeepers. /// Stops and returns `Err` on any error during etcd communication. /// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, /// exiting normally in such cases. -pub async fn subscribe_to_safekeeper_timeline_updates( +/// Etcd values are parsed as JSON into a type specified in the generic parameter. +pub async fn subscribe_for_json_values( client: &mut Client, - subscription: SkTimelineSubscriptionKind, -) -> Result { - info!("Subscribing to timeline updates, subscription kind: {subscription:?}"); + key: SubscriptionKey, +) -> Result, BrokerError> +where + V: DeserializeOwned + Send + 'static, +{ + subscribe_for_values(client, key, |_, value_str| { + match serde_json::from_str::(value_str) { + Ok(value) => Some(value), + Err(e) => { + error!("Failed to parse value str '{value_str}': {e}"); + None + } + } + }) + .await +} + +/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of an etcd value string.
+pub async fn subscribe_for_values( + client: &mut Client, + key: SubscriptionKey, + value_parser: P, +) -> Result, BrokerError> +where + V: Send + 'static, + P: Fn(SubscriptionFullKey, &str) -> Option + Send + 'static, +{ + info!("Subscribing to broker value updates, key: {key:?}"); + let subscription_key = key.clone(); let (watcher, mut stream) = client - .watch( - subscription.watch_key(), - Some(WatchOptions::new().with_prefix()), - ) + .watch(key.watch_key(), Some(WatchOptions::new().with_prefix())) .await .map_err(|e| { BrokerError::EtcdClient( e, - format!("Failed to init the watch for subscription {subscription:?}"), + format!("Failed to init the watch for subscription {key:?}"), ) })?; - let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel(); - - let subscription_kind = subscription.kind; - let regex = subscription.watch_regex(); + let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel(); let watcher_handle = tokio::spawn(async move { while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( - "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}" + "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind )))? { if resp.canceled() { info!("Watch for timeline updates subscription was canceled, exiting"); break; } - let mut timeline_updates: HashMap> = HashMap::new(); - // Keep track that the timeline data updates from etcd arrive in the right order. - // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas - // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering. 
- let mut timeline_etcd_versions: HashMap = HashMap::new(); - - let events = resp.events(); debug!("Processing {} events", events.len()); for event in events { if EventType::Put == event.event_type() { if let Some(new_etcd_kv) = event.kv() { - let new_kv_version = new_etcd_kv.version(); - - match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) { - Ok(Some((zttid, timeline))) => { - match timeline_updates - .entry(zttid) - .or_default() - .entry(timeline.safekeeper_id) - { - hash_map::Entry::Occupied(mut o) => { - let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN); - if old_etcd_kv_version < new_kv_version { - o.insert(timeline.info); - timeline_etcd_versions.insert(zttid,new_kv_version); - } - } - hash_map::Entry::Vacant(v) => { - v.insert(timeline.info); - timeline_etcd_versions.insert(zttid,new_kv_version); - } - } - } - Ok(None) => {} - Err(e) => error!("Failed to parse timeline update: {e}"), + match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { + Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate { + etcd_version: new_etcd_kv.version(), + key, + value, + }) { + info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); + break; + }, + Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), + Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), + Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), }; } } } - - if let Err(e) = timeline_updates_sender.send(timeline_updates) { - info!("Timeline updates sender got dropped, exiting: {e}"); - break; - } } Ok(()) - }); + }.instrument(info_span!("etcd_broker"))); - Ok(SkTimelineSubscription { - kind: subscription, - safekeeper_timeline_updates, + Ok(BrokerSubscription { + key: subscription_key, + value_updates: value_updates_receiver, watcher_handle, watcher, }) } -fn parse_etcd_key_value( - subscription_kind: 
SubscriptionKind, - regex: &Regex, +fn parse_etcd_kv( kv: &KeyValue, -) -> Result, BrokerError> { - let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| { - BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str")) - })?) { - caps - } else { - return Ok(None); - }; - - let (zttid, safekeeper_id) = match subscription_kind { - SubscriptionKind::All => ( - ZTenantTimelineId::new( - parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, - parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?, - ), - ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?), - ), - SubscriptionKind::Tenant(tenant_id) => ( - ZTenantTimelineId::new( - tenant_id, - parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?, - ), - ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?), - ), - SubscriptionKind::Timeline(zttid) => ( - zttid, - ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?), - ), - }; - - let info_str = kv.value_str().map_err(|e| { - BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str")) - })?; - Ok(Some(( - zttid, - SafekeeperTimeline { - safekeeper_id, - info: serde_json::from_str(info_str).map_err(|e| { - BrokerError::ParsingError(format!( - "Failed to parse '{info_str}' as safekeeper timeline info: {e}" - )) - })?, - }, - ))) -} - -fn parse_capture(caps: &Captures, index: usize) -> Result + value_parser: &P, + cluster_prefix: &str, +) -> Result, BrokerError> where - T: FromStr, - ::Err: Display, + P: Fn(SubscriptionFullKey, &str) -> Option, { - let capture_match = caps - .get(index) - .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
- .as_str(); - capture_match.parse().map_err(|e| { - format!( - "Failed to parse {} from {capture_match}: {e}", - std::any::type_name::() - ) - }) + let key_str = kv.key_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) + })?; + let value_str = kv.value_str().map_err(|e| { + BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) + })?; + + if !key_str.starts_with(cluster_prefix) { + return Err(BrokerError::KeyNotParsed(format!( + "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}" + ))); + } + + let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| { + BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}")) + })?; + + Ok(value_parser(key, value_str).map(|value| (key, value))) } diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs new file mode 100644 index 0000000000..8f8579f4e5 --- /dev/null +++ b/libs/etcd_broker/src/subscription_key.rs @@ -0,0 +1,310 @@ +//! Etcd broker keys, used in the project and shared between instances. +//! The keys are split into two categories: +//! +//! * [`SubscriptionFullKey`] full key format: `/////` +//! Always returned from etcd in this form, always start with the user key provided. +//! +//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available. +//! Full key always starts with the user input one, due to etcd subscription properties. + +use std::{fmt::Display, str::FromStr}; + +use once_cell::sync::Lazy; +use regex::{Captures, Regex}; +use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; + +/// The subscription kind to the timeline updates from safekeeper. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SubscriptionKey { + /// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups. 
+ pub cluster_prefix: String, + /// The subscription kind. + pub kind: SubscriptionKind, +} + +/// All currently possible key kinds of an etcd broker subscription. +/// Etcd works so that every key that starts with the given subscription key is considered matching and +/// returned as part of the subscription. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SubscriptionKind { + /// Get every update in etcd. + All, + /// Get etcd updates for any timeline of a certain tenant, affected by any operation from any node kind. + TenantTimelines(ZTenantId), + /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. + Timeline(ZTenantTimelineId), + /// Get etcd timeline updates, specific to a certain node kind. + Node(ZTenantTimelineId, NodeKind), + /// Get etcd timeline updates for a certain operation on specific nodes. + Operation(ZTenantTimelineId, NodeKind, OperationKind), +} + +/// All kinds of nodes, able to write into etcd. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NodeKind { + Safekeeper, + Pageserver, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum OperationKind { + Safekeeper(SkOperationKind), +} + +/// Current operations, running inside the safekeeper node. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum SkOperationKind { + TimelineInfo, + WalBackup, +} + +static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { + Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$") + .expect("wrong subscription full etcd key regex") +}); + +/// Full key, received from etcd during any of the component's work. +/// No other etcd keys are considered during system's work. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SubscriptionFullKey { + pub id: ZTenantTimelineId, + pub node_kind: NodeKind, + pub operation: OperationKind, + pub node_id: NodeId, +} + +impl SubscriptionKey { + /// Subscribes for all etcd updates.
+ pub fn all(cluster_prefix: String) -> Self { + SubscriptionKey { + cluster_prefix, + kind: SubscriptionKind::All, + } + } + + /// Subscribes to a given timeline info updates from safekeepers. + pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation( + timeline, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::TimelineInfo), + ), + } + } + + /// Subscribes to all timeline updates during specific operations, running on the corresponding nodes. + pub fn operation( + cluster_prefix: String, + timeline: ZTenantTimelineId, + node_kind: NodeKind, + operation: OperationKind, + ) -> Self { + Self { + cluster_prefix, + kind: SubscriptionKind::Operation(timeline, node_kind, operation), + } + } + + /// Etcd key to use for watching a certain timeline updates from safekeepers. + pub fn watch_key(&self) -> String { + let cluster_prefix = &self.cluster_prefix; + match self.kind { + SubscriptionKind::All => cluster_prefix.to_string(), + SubscriptionKind::TenantTimelines(tenant_id) => { + format!("{cluster_prefix}/{tenant_id}") + } + SubscriptionKind::Timeline(id) => { + format!("{cluster_prefix}/{id}") + } + SubscriptionKind::Node(id, node_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}") + } + SubscriptionKind::Operation(id, node_kind, operation_kind) => { + format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}") + } + } + } +} + +impl Display for OperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + OperationKind::Safekeeper(o) => o.fmt(f), + } + } +} + +impl FromStr for OperationKind { + type Err = String; + + fn from_str(operation_kind_str: &str) -> Result { + match operation_kind_str { + "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)), + "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)), + _ => Err(format!("Unknown operation kind:
{operation_kind_str}")), + } + } +} + +impl Display for SubscriptionFullKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + id, + node_kind, + operation, + node_id, + } = self; + write!(f, "{id}/{node_kind}/{operation}/{node_id}") + } +} + +impl FromStr for SubscriptionFullKey { + type Err = String; + + fn from_str(subscription_kind_str: &str) -> Result { + let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) { + Some(captures) => captures, + None => { + return Err(format!( + "Subscription kind str does not match a subscription full key regex {}", + SUBSCRIPTION_FULL_KEY_REGEX.as_str() + )); + } + }; + + Ok(Self { + id: ZTenantTimelineId::new( + parse_capture(&key_captures, 1)?, + parse_capture(&key_captures, 2)?, + ), + node_kind: parse_capture(&key_captures, 3)?, + operation: parse_capture(&key_captures, 4)?, + node_id: NodeId(parse_capture(&key_captures, 5)?), + }) + } +} + +fn parse_capture(caps: &Captures, index: usize) -> Result +where + T: FromStr, + ::Err: Display, +{ + let capture_match = caps + .get(index) + .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
+ .as_str(); + capture_match.parse().map_err(|e| { + format!( + "Failed to parse {} from {capture_match}: {e}", + std::any::type_name::() + ) + }) +} + +impl Display for NodeKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Safekeeper => write!(f, "safekeeper"), + Self::Pageserver => write!(f, "pageserver"), + } + } +} + +impl FromStr for NodeKind { + type Err = String; + + fn from_str(node_kind_str: &str) -> Result { + match node_kind_str { + "safekeeper" => Ok(Self::Safekeeper), + "pageserver" => Ok(Self::Pageserver), + _ => Err(format!("Invalid node kind: {node_kind_str}")), + } + } +} + +impl Display for SkOperationKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::TimelineInfo => write!(f, "timeline_info"), + Self::WalBackup => write!(f, "wal_backup"), + } + } +} + +impl FromStr for SkOperationKind { + type Err = String; + + fn from_str(operation_str: &str) -> Result { + match operation_str { + "timeline_info" => Ok(Self::TimelineInfo), + "wal_backup" => Ok(Self::WalBackup), + _ => Err(format!("Invalid operation: {operation_str}")), + } + } +} + +#[cfg(test)] +mod tests { + use utils::zid::ZTimelineId; + + use super::*; + + #[test] + fn full_cluster_key_parsing() { + let prefix = "neon"; + let node_kind = NodeKind::Safekeeper; + let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); + let tenant_id = ZTenantId::generate(); + let timeline_id = ZTimelineId::generate(); + let id = ZTenantTimelineId::new(tenant_id, timeline_id); + let node_id = NodeId(1); + + let timeline_subscription_keys = [ + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::All, + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::TenantTimelines(tenant_id), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Timeline(id), + }, + SubscriptionKey { + cluster_prefix: 
prefix.to_string(), + kind: SubscriptionKind::Node(id, node_kind), + }, + SubscriptionKey { + cluster_prefix: prefix.to_string(), + kind: SubscriptionKind::Operation(id, node_kind, operation_kind), + }, + ]; + + let full_key_string = format!( + "{}/{node_id}", + timeline_subscription_keys.last().unwrap().watch_key() + ); + + for key in timeline_subscription_keys { + assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match"); + } + + let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| { + panic!("Failed to parse {full_key_string} as a subscription full key: {e}") + }); + + assert_eq!( + full_key, + SubscriptionFullKey { + id, + node_kind, + operation: operation_kind, + node_id + } + ) + } +} diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs new file mode 100644 index 0000000000..d3e2011761 --- /dev/null +++ b/libs/etcd_broker/src/subscription_value.rs @@ -0,0 +1,35 @@ +//! Module for the values to put into etcd. + +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DisplayFromStr}; +use utils::lsn::Lsn; + +/// Data about safekeeper's timeline. Fields made optional for easy migrations. +#[serde_as] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "Option")] + #[serde(default)] + pub flush_lsn: Option, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "Option")] + #[serde(default)] + pub commit_lsn: Option, + /// LSN up to which safekeeper has backed WAL. + #[serde_as(as = "Option")] + #[serde(default)] + pub backup_lsn: Option, + /// LSN of last checkpoint uploaded by pageserver. 
+ #[serde_as(as = "Option")] + #[serde(default)] + pub remote_consistent_lsn: Option, + #[serde_as(as = "Option")] + #[serde(default)] + pub peer_horizon_lsn: Option, + /// A connection string to use for WAL receiving. + #[serde(default)] + pub safekeeper_connstr: Option, +} diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 9929fc6d45..3b5da9f7ff 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -3,6 +3,7 @@ //! Otherwise, we might not see all metrics registered via //! a default registry. use lazy_static::lazy_static; +pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 129c93cf6d..c9cc858ab9 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -wal_generate = { path = "wal_generate" } +wal_craft = { path = "wal_craft" } [build-dependencies] bindgen = "0.59.1" diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 0043b9ab58..c6df4fc0b0 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -2,6 +2,7 @@ extern crate bindgen; use std::env; use std::path::PathBuf; +use std::process::Command; use bindgen::callbacks::ParseCallbacks; @@ -45,6 +46,43 @@ fn main() { // Tell cargo to invalidate the built crate whenever the wrapper changes println!("cargo:rerun-if-changed=pg_control_ffi.h"); + // Finding the location of C headers for the Postgres server: + // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `/tmp_install` + // - if there's a `bin/pg_config` file use it for getting 
include server, otherwise use `/tmp_install/include/postgresql/server` + let mut pg_install_dir: PathBuf; + if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") { + pg_install_dir = postgres_install_dir.into(); + } else { + pg_install_dir = PathBuf::from("tmp_install") + } + + if pg_install_dir.is_relative() { + let cwd = env::current_dir().unwrap(); + pg_install_dir = cwd.join("..").join("..").join(pg_install_dir); + } + + let pg_config_bin = pg_install_dir.join("bin").join("pg_config"); + let inc_server_path: String = if pg_config_bin.exists() { + let output = Command::new(pg_config_bin) + .arg("--includedir-server") + .output() + .expect("failed to execute `pg_config --includedir-server`"); + + if !output.status.success() { + panic!("`pg_config --includedir-server` failed") + } + + String::from_utf8(output.stdout).unwrap().trim_end().into() + } else { + pg_install_dir + .join("include") + .join("postgresql") + .join("server") + .into_os_string() + .into_string() + .unwrap() + }; + // The bindgen::Builder is the main entry point // to bindgen, and lets you build up options for // the resulting bindings. @@ -81,15 +119,7 @@ fn main() { // explicit padding fields. .explicit_padding(true) // - // Path the server include dir. It is in tmp_install/include/server, if you did - // "configure --prefix=". But if you used "configure --prefix=/", - // and used DESTDIR to move it into tmp_install, then it's in - // tmp_install/include/postgres/server - // 'pg_config --includedir-server' would perhaps be the more proper way to find it, - // but this will do for now. - // - .clang_arg("-I../../tmp_install/include/server") - .clang_arg("-I../../tmp_install/include/postgresql/server") + .clang_arg(format!("-I{inc_server_path}")) // // Finish the builder and generate the bindings. 
// diff --git a/libs/postgres_ffi/src/waldecoder.rs b/libs/postgres_ffi/src/waldecoder.rs index 95ea9660e8..7a69f471d9 100644 --- a/libs/postgres_ffi/src/waldecoder.rs +++ b/libs/postgres_ffi/src/waldecoder.rs @@ -73,7 +73,7 @@ impl WalStreamDecoder { /// Returns one of the following: /// Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself /// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function - /// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid. + /// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid. /// pub fn poll_decode(&mut self) -> Result, WalDecodeError> { let recordbuf; @@ -82,7 +82,17 @@ impl WalStreamDecoder { // that cross page boundaries. loop { // parse and verify page boundaries as we go - if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { + if self.padlen > 0 { + // We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record. 
+ if self.inputbuf.remaining() < self.padlen as usize { + return Ok(None); + } + + // skip padding + self.inputbuf.advance(self.padlen as usize); + self.lsn += self.padlen as u64; + self.padlen = 0; + } else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 { // parse long header if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD { @@ -128,15 +138,6 @@ impl WalStreamDecoder { self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; continue; - } else if self.padlen > 0 { - if self.inputbuf.remaining() < self.padlen as usize { - return Ok(None); - } - - // skip padding - self.inputbuf.advance(self.padlen as usize); - self.lsn += self.padlen as u64; - self.padlen = 0; } else if self.contlen == 0 { assert!(self.recordbuf.is_empty()); @@ -226,10 +227,10 @@ impl WalStreamDecoder { self.padlen = self.lsn.calc_padding(8u32) as u32; } - // Always align resulting LSN on 0x8 boundary -- that is important for getPage() - // and WalReceiver integration. Since this code is used both for WalReceiver and - // initial WAL import let's force alignment right here. - let result = (self.lsn.align(), recordbuf); + // We should return LSN of the next record, not the last byte of this record or + // the byte immediately after. Note that this handles both XLOG_SWITCH and usual + // records, the former "spans" until the next WAL segment (see test_xlog_switch). 
+ let result = (self.lsn + self.padlen as u64, recordbuf); Ok(Some(result)) } } diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 32a3022c5a..520870cc53 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -15,6 +15,7 @@ use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; +use crate::pg_constants::WAL_SEGMENT_SIZE; use anyhow::{bail, ensure}; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; @@ -461,8 +462,7 @@ pub fn find_end_of_wal( pub fn main() { let mut data_dir = PathBuf::new(); data_dir.push("."); - let wal_seg_size = 16 * 1024 * 1024; - let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap(); + let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap(); println!( "wal_end={:>08X}{:>08X}, tli={}", (wal_end >> 32) as u32, @@ -531,7 +531,7 @@ impl CheckPoint { /// /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { - // nextXid should nw greate than any XID in WAL, so increment provided XID and check for wraparround. + // nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround. let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. 
// XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE @@ -597,20 +597,18 @@ mod tests { fn init_logging() { let _ = env_logger::Builder::from_env( env_logger::Env::default() - .default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"), + .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"), ) .is_test(true) .try_init(); } - fn test_end_of_wal( + fn test_end_of_wal( test_name: &str, - generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result, expected_end_of_wal_non_partial: Lsn, - last_segment: &str, ) { - use wal_generate::*; - // 1. Generate some WAL + use wal_craft::*; + // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("..") .join(".."); @@ -622,25 +620,72 @@ mod tests { fs::remove_dir_all(&cfg.datadir).unwrap(); } cfg.initdb().unwrap(); - let mut srv = cfg.start_server().unwrap(); - let expected_wal_end: Lsn = - u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into(); + let srv = cfg.start_server().unwrap(); + let (intermediate_lsns, expected_end_of_wal_partial) = + C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns: Vec = intermediate_lsns + .iter() + .map(|&lsn| u64::from(lsn).into()) + .collect(); + let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into(); srv.kill(); - // 2. 
Pick WAL generated by initdb - let wal_dir = cfg.datadir.join("pg_wal"); - let wal_seg_size = 16 * 1024 * 1024; + // Check find_end_of_wal on the initial WAL + let last_segment = cfg + .wal_dir() + .read_dir() + .unwrap() + .map(|f| f.unwrap().file_name().into_string().unwrap()) + .filter(|fname| IsXLogFileName(fname)) + .max() + .unwrap(); + check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial); + for start_lsn in std::iter::once(Lsn(0)) + .chain(intermediate_lsns) + .chain(std::iter::once(expected_end_of_wal_partial)) + { + // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`. + // We assume that `start_lsn` is non-decreasing. + info!( + "Checking with start_lsn={}, erasing WAL before it", + start_lsn + ); + for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { + let fname = file.file_name().into_string().unwrap(); + if !IsXLogFileName(&fname) { + continue; + } + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); + if seg_start_lsn > u64::from(start_lsn) { + continue; + } + let mut f = File::options().write(true).open(file.path()).unwrap(); + const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + f.write_all( + &ZEROS[0..min( + WAL_SEGMENT_SIZE, + (u64::from(start_lsn) - seg_start_lsn) as usize, + )], + ) + .unwrap(); + } + check_end_of_wal( + &cfg, + &last_segment, + start_lsn, + expected_end_of_wal_non_partial, + expected_end_of_wal_partial, + ); + } + } - // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); - let wal_end = Lsn(wal_end); - info!( - "find_end_of_wal returned (wal_end={}, tli={})", - wal_end, tli - ); - assert_eq!(wal_end, expected_end_of_wal_non_partial); - - // 4. 
Get the actual end of WAL by pg_waldump + fn check_pg_waldump_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + expected_end_of_wal: Lsn, + ) { + // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) .unwrap() @@ -659,44 +704,66 @@ mod tests { let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); info!( "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_wal_end + waldump_wal_end, expected_end_of_wal ); - assert_eq!(waldump_wal_end, expected_wal_end); + assert_eq!(waldump_wal_end, expected_end_of_wal); + } - // 5. Rename file to partial to actually find last valid lsn - fs::rename( - wal_dir.join(last_segment), - wal_dir.join(format!("{}.partial", last_segment)), - ) - .unwrap(); - let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap(); + fn check_end_of_wal( + cfg: &wal_craft::Conf, + last_segment: &str, + start_lsn: Lsn, + expected_end_of_wal_non_partial: Lsn, + expected_end_of_wal_partial: Lsn, + ) { + // Check end_of_wal on non-partial WAL segment (we treat it as fully populated) + let (wal_end, tli) = + find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); let wal_end = Lsn(wal_end); info!( - "find_end_of_wal returned (wal_end={}, tli={})", + "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment", wal_end, tli ); - assert_eq!(wal_end, waldump_wal_end); + assert_eq!(wal_end, expected_end_of_wal_non_partial); + + // Rename file to partial to actually find last valid lsn, then rename it back. 
+ fs::rename( + cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(format!("{}.partial", last_segment)), + ) + .unwrap(); + let (wal_end, tli) = + find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap(); + let wal_end = Lsn(wal_end); + info!( + "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment", + wal_end, tli + ); + assert_eq!(wal_end, expected_end_of_wal_partial); + fs::rename( + cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir().join(last_segment), + ) + .unwrap(); } + const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024); + #[test] pub fn test_find_end_of_wal_simple() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_simple", - wal_generate::generate_simple, "0/2000000".parse::().unwrap(), - "000000010000000000000001", ); } #[test] pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_crossing_segment_followed_by_small_one", - wal_generate::generate_wal_record_crossing_segment_followed_by_small_one, "0/3000000".parse::().unwrap(), - "000000010000000000000002", ); } @@ -704,11 +771,9 @@ mod tests { #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO pub fn test_find_end_of_wal_last_crossing_segment() { init_logging(); - test_end_of_wal( + test_end_of_wal::( "test_find_end_of_wal_last_crossing_segment", - wal_generate::generate_last_wal_record_crossing_segment, "0/3000000".parse::().unwrap(), - "000000010000000000000002", ); } diff --git a/libs/postgres_ffi/wal_generate/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml similarity index 83% rename from libs/postgres_ffi/wal_generate/Cargo.toml rename to libs/postgres_ffi/wal_craft/Cargo.toml index a10671dddd..374c8e2e55 100644 --- a/libs/postgres_ffi/wal_generate/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "wal_generate" +name = "wal_craft" version = 
"0.1.0" edition = "2021" @@ -10,5 +10,7 @@ anyhow = "1.0" clap = "3.0" env_logger = "0.9" log = "0.4" +once_cell = "1.8.0" postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres_ffi = { path = "../" } tempfile = "3.2" diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs new file mode 100644 index 0000000000..938f8f421b --- /dev/null +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -0,0 +1,103 @@ +use anyhow::*; +use clap::{App, Arg, ArgMatches}; +use std::str::FromStr; +use wal_craft::*; + +fn main() -> Result<()> { + env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info")) + .init(); + let type_arg = &Arg::new("type") + .takes_value(true) + .help("Type of WAL to craft") + .possible_values([ + Simple::NAME, + LastWalRecordXlogSwitch::NAME, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME, + WalRecordCrossingSegmentFollowedBySmallOne::NAME, + LastWalRecordCrossingSegment::NAME, + ]) + .required(true); + let arg_matches = App::new("Postgres WAL crafter") + .about("Crafts Postgres databases with specific WAL properties") + .subcommand( + App::new("print-postgres-config") + .about("Print the configuration required for PostgreSQL server before running this script") + ) + .subcommand( + App::new("with-initdb") + .about("Craft WAL in a new data directory first initialized with initdb") + .arg(type_arg) + .arg( + Arg::new("datadir") + .takes_value(true) + .help("Data directory for the Postgres server") + .required(true) + ) + .arg( + Arg::new("pg-distrib-dir") + .long("pg-distrib-dir") + .takes_value(true) + .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)") + .default_value("/usr/local") + ) + ) + .subcommand( + App::new("in-existing") + .about("Craft WAL at an existing recently created Postgres database. 
Note that server may append new WAL entries on shutdown.") + .arg(type_arg) + .arg( + Arg::new("connection") + .takes_value(true) + .help("Connection string to the Postgres database to populate") + .required(true) + ) + ) + .get_matches(); + + let wal_craft = |arg_matches: &ArgMatches, client| { + let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() { + Simple::NAME => Simple::craft(client)?, + LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?, + LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => { + LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)? + } + WalRecordCrossingSegmentFollowedBySmallOne::NAME => { + WalRecordCrossingSegmentFollowedBySmallOne::craft(client)? + } + LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, + a => panic!("Unknown --type argument: {}", a), + }; + for lsn in intermediate_lsns { + println!("intermediate_lsn = {}", lsn); + } + println!("end_of_wal = {}", end_of_wal_lsn); + Ok(()) + }; + + match arg_matches.subcommand() { + None => panic!("No subcommand provided"), + Some(("print-postgres-config", _)) => { + for cfg in REQUIRED_POSTGRES_CONFIG.iter() { + println!("{}", cfg); + } + Ok(()) + } + Some(("with-initdb", arg_matches)) => { + let cfg = Conf { + pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), + datadir: arg_matches.value_of("datadir").unwrap().into(), + }; + cfg.initdb()?; + let srv = cfg.start_server()?; + wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?; + srv.kill(); + Ok(()) + } + Some(("in-existing", arg_matches)) => wal_craft( + arg_matches, + &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())? 
+ .connect(postgres::NoTls)?, + ), + Some(_) => panic!("Unknown subcommand"), + } +} diff --git a/libs/postgres_ffi/wal_generate/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs similarity index 50% rename from libs/postgres_ffi/wal_generate/src/lib.rs rename to libs/postgres_ffi/wal_craft/src/lib.rs index a5cd81d68a..e3b666da41 100644 --- a/libs/postgres_ffi/wal_generate/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,9 +1,15 @@ use anyhow::*; use core::time::Duration; use log::*; +use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; +use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE; +use postgres_ffi::xlog_utils::{ + XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::cmp::Ordering; +use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use std::time::Instant; @@ -21,6 +27,16 @@ pub struct PostgresServer { client_config: postgres::Config, } +pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { + vec![ + "wal_keep_size=50MB", // Ensure old WAL is not removed + "shared_preload_libraries=neon", // can only be loaded at startup + // Disable background processes as much as possible + "wal_writer_delay=10s", + "autovacuum=off", + ] +}); + impl Conf { fn pg_bin_dir(&self) -> PathBuf { self.pg_distrib_dir.join("bin") @@ -30,6 +46,10 @@ impl Conf { self.pg_distrib_dir.join("lib") } + pub fn wal_dir(&self) -> PathBuf { + self.datadir.join("pg_wal") + } + fn new_pg_command(&self, command: impl AsRef) -> Result { let path = self.pg_bin_dir().join(command); ensure!(path.exists(), "Command {:?} does not exist", path); @@ -69,6 +89,12 @@ impl Conf { pub fn start_server(&self) -> Result { info!("Starting Postgres server in {:?}", self.datadir); + let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| { + format!( + "Failed to create pg.log file in directory {}", + self.datadir.display() + ) + })?; let unix_socket_dir = tempdir()?; // We need a directory 
with a short name for Unix socket (up to 108 symbols) let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self @@ -78,13 +104,9 @@ impl Conf { .arg(unix_socket_dir_path.as_os_str()) .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output - .args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup - // Disable background processes as much as possible - .args(&["-c", "wal_writer_delay=10s"]) - .args(&["-c", "autovacuum=off"]) - .stderr(Stdio::null()) + .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) + .stderr(Stdio::from(log_file)) .spawn()?; let server = PostgresServer { process: server_process, @@ -137,7 +159,7 @@ impl PostgresServer { bail!("Connection timed out"); } - pub fn kill(&mut self) { + pub fn kill(mut self) { self.process.kill().unwrap(); self.process.wait().unwrap(); } @@ -174,11 +196,15 @@ pub trait PostgresClientExt: postgres::GenericClient { impl PostgresClientExt for C {} -fn generate_internal( - client: &mut C, - f: impl Fn(&mut C, PgLsn) -> Result>, -) -> Result { - client.execute("create extension if not exists zenith_test_utils", &[])?; +pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> { + client.execute("create extension if not exists neon_test_utils", &[])?; + + let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0); + ensure!(wal_keep_size == "50MB"); + let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0); + ensure!(wal_writer_delay == "10s"); + let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0); + ensure!(autovacuum == "off"); let wal_segment_size = client.query_one( "select cast(setting as bigint) as setting, unit \ @@ -190,44 +216,160 @@ fn generate_internal( "Unexpected wal_segment_size unit" ); ensure!( - 
wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024, + wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64, "Unexpected wal_segment_size in bytes" ); + Ok(()) +} + +pub trait Crafter { + const NAME: &'static str; + + /// Generates WAL using the client `client`. Returns a pair of: + /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. + /// May include or exclude Lsn(0) and the end-of-wal. + /// * The expected end-of-wal LSN. + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)>; +} + +fn craft_internal( + client: &mut C, + f: impl Fn(&mut C, PgLsn) -> Result<(Vec, Option)>, +) -> Result<(Vec, PgLsn)> { + ensure_server_config(client)?; + let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let last_lsn = match f(client, initial_lsn)? { + let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; + let last_lsn = match last_lsn { None => client.pg_current_wal_insert_lsn()?, Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) { - Ordering::Less => bail!("Some records were inserted after the generated WAL"), + Ordering::Less => bail!("Some records were inserted after the crafted WAL"), Ordering::Equal => last_lsn, Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), }, }; + if !intermediate_lsns.starts_with(&[initial_lsn]) { + intermediate_lsns.insert(0, initial_lsn); + } // Some records may be not flushed, e.g. non-transactional logical messages. client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) 
{ - Ordering::Less => bail!("Some records were flushed after the generated WAL"), + Ordering::Less => bail!("Some records were flushed after the crafted WAL"), Ordering::Equal => {} Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), } - Ok(last_lsn) + Ok((intermediate_lsns, last_lsn)) } -pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result { - generate_internal(client, |client, _| { +pub struct Simple; +impl Crafter for Simple { + const NAME: &'static str = "simple"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_internal(client, |client, _| { + client.execute("CREATE table t(x int)", &[])?; + Ok((Vec::new(), None)) + }) + } +} + +pub struct LastWalRecordXlogSwitch; +impl Crafter for LastWalRecordXlogSwitch { + const NAME: &'static str = "last_wal_record_xlog_switch"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. 
+ ensure_server_config(client)?; + client.execute("CREATE table t(x int)", &[])?; - Ok(None) - }) + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch <= next_segment, + "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + } } -fn generate_single_logical_message( +pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { + const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + // Do not use generate_internal because here we end up with flush_lsn exactly on + // the segment boundary and insert_lsn after the initial page header, which is unusual. + ensure_server_config(client)?; + + client.execute("CREATE table t(x int)", &[])?; + + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. + // We will use logical message as the padding. We start with detecting how much WAL + // it takes for one logical message, considering all alignments and headers. + let base_wal_advance = { + let before_lsn = client.pg_current_wal_insert_lsn()?; + // Small non-empty message bigger than few bytes is more likely than an empty + // message to have the same format as the big padding message. + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", + &[], + )?; + // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. + (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize + + XLOG_SIZE_OF_XLOG_RECORD + }; + let mut remaining_lsn = + XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; + if remaining_lsn < base_wal_advance { + remaining_lsn += XLOG_BLCKSZ; + } + let repeats = 10 + remaining_lsn - base_wal_advance; + info!( + "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", + client.pg_current_wal_insert_lsn()?, + remaining_lsn, + base_wal_advance, + repeats + ); + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let next_segment = PgLsn::from(0x0200_0000); + ensure!( + after_xlog_switch < next_segment, + "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", + after_xlog_switch, + next_segment + ); + ensure!( + u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, + "XLOG_SWITCH message ended not on page boundary: {}", + after_xlog_switch + ); + Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + } +} + +fn craft_single_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> Result { - generate_internal(client, |client, initial_lsn| { +) -> Result<(Vec, PgLsn)> { + craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), "Initial LSN is too far in the future" @@ -258,21 +400,25 @@ fn generate_single_logical_message( message_lsn < after_message_lsn, "No record found after the emitted message" ); - Ok(Some(after_message_lsn)) + Ok((vec![message_lsn], Some(after_message_lsn))) } else { - Ok(Some(message_lsn)) + Ok((Vec::new(), Some(message_lsn))) } }) } -pub fn generate_wal_record_crossing_segment_followed_by_small_one( - client: &mut impl postgres::GenericClient, 
-) -> Result { - generate_single_logical_message(client, true) +pub struct WalRecordCrossingSegmentFollowedBySmallOne; +impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { + const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_single_logical_message(client, true) + } } -pub fn generate_last_wal_record_crossing_segment( - client: &mut C, -) -> Result { - generate_single_logical_message(client, false) +pub struct LastWalRecordCrossingSegment; +impl Crafter for LastWalRecordCrossingSegment { + const NAME: &'static str = "last_wal_record_crossing_segment"; + fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec, PgLsn)> { + craft_single_logical_message(client, false) + } } diff --git a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs deleted file mode 100644 index 07ceb31c7f..0000000000 --- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs +++ /dev/null @@ -1,58 +0,0 @@ -use anyhow::*; -use clap::{App, Arg}; -use wal_generate::*; - -fn main() -> Result<()> { - env_logger::Builder::from_env( - env_logger::Env::default().default_filter_or("wal_generate=info"), - ) - .init(); - let arg_matches = App::new("Postgres WAL generator") - .about("Generates Postgres databases with specific WAL properties") - .arg( - Arg::new("datadir") - .short('D') - .long("datadir") - .takes_value(true) - .help("Data directory for the Postgres server") - .required(true) - ) - .arg( - Arg::new("pg-distrib-dir") - .long("pg-distrib-dir") - .takes_value(true) - .help("Directory with Postgres distribution (bin and lib directories, e.g. 
tmp_install)") - .default_value("/usr/local") - ) - .arg( - Arg::new("type") - .long("type") - .takes_value(true) - .help("Type of WAL to generate") - .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"]) - .required(true) - ) - .get_matches(); - - let cfg = Conf { - pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(), - datadir: arg_matches.value_of("datadir").unwrap().into(), - }; - cfg.initdb()?; - let mut srv = cfg.start_server()?; - let lsn = match arg_matches.value_of("type").unwrap() { - "simple" => generate_simple(&mut srv.connect_with_timeout()?)?, - "last_wal_record_crossing_segment" => { - generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)? - } - "wal_record_crossing_segment_followed_by_small_one" => { - generate_wal_record_crossing_segment_followed_by_small_one( - &mut srv.connect_with_timeout()?, - )? - } - a => panic!("Unknown --type argument: {}", a), - }; - println!("end_of_wal = {}", lsn); - srv.kill(); - Ok(()) -} diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 291f6e50ac..b11b3cf371 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -5,14 +5,17 @@ edition = "2021" [dependencies] anyhow = { version = "1.0", features = ["backtrace"] } -tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } -tokio-util = { version = "0.7", features = ["io"] } -tracing = "0.1.27" +async-trait = "0.1" +metrics = { version = "0.1", path = "../metrics" } +once_cell = "1.8.0" rusoto_core = "0.48" rusoto_s3 = "0.48" serde = { version = "1.0", features = ["derive"] } serde_json = "1" -async-trait = "0.1" +tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] } +tokio-util = { version = "0.7", features = ["io"] } +toml_edit = { version = "0.13", features = ["easy"] } +tracing = "0.1.27" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff 
--git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 8092e4fc49..dec79e4580 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,12 +12,16 @@ use std::{ borrow::Cow, collections::HashMap, ffi::OsStr, + fmt::Debug, num::{NonZeroU32, NonZeroUsize}, path::{Path, PathBuf}, + pin::Pin, }; -use anyhow::Context; +use anyhow::{bail, Context}; + use tokio::io; +use toml_edit::Item; use tracing::info; pub use self::{ @@ -38,13 +42,19 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +pub trait RemoteObjectName { + // Needed to retrieve last component for RemoteObjectId. + // In other words a file name + fn object_name(&self) -> Option<&str>; +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync { /// A way to uniquely reference a file in the remote storage. - type RemoteObjectId; + type RemoteObjectId: RemoteObjectName; /// Attempts to derive the storage path out of the local path, if the latter is correct. fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; @@ -55,6 +65,12 @@ pub trait RemoteStorage: Send + Sync { /// Lists all items the storage has right now. async fn list(&self) -> anyhow::Result>; + /// Lists all top level subdirectories for a given prefix + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result>; + /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, @@ -68,11 +84,7 @@ pub trait RemoteStorage: Send + Sync { /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. 
/// Returns the metadata, if any was stored with the file previously. - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; + async fn download(&self, from: &Self::RemoteObjectId) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. @@ -81,12 +93,49 @@ pub trait RemoteStorage: Send + Sync { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result>; + ) -> Result; async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>; } +pub struct Download { + pub download_stream: Pin>, + /// Extra key-value data, associated with the current remote file. + pub metadata: Option, +} + +impl Debug for Download { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Download") + .field("metadata", &self.metadata) + .finish() + } +} + +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"), + } + } +} + +impl std::error::Error for DownloadError {} + /// Every storage, currently supported. 
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. pub enum GenericRemoteStorage { @@ -178,7 +227,7 @@ pub struct S3Config { pub concurrency_limit: NonZeroUsize, } -impl std::fmt::Debug for S3Config { +impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") .field("bucket_name", &self.bucket_name) @@ -203,6 +252,90 @@ pub fn path_with_suffix_extension(original_path: impl AsRef, suffix: &str) .with_extension(new_extension.as_ref()) } +impl RemoteStorageConfig { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + let local_path = toml.get("local_path"); + let bucket_name = toml.get("bucket_name"); + let bucket_region = toml.get("bucket_region"); + + let max_concurrent_syncs = NonZeroUsize::new( + parse_optional_integer("max_concurrent_syncs", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), + ) + .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; + + let max_sync_errors = NonZeroU32::new( + parse_optional_integer("max_sync_errors", toml)? + .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), + ) + .context("Failed to parse 'max_sync_errors' as a positive integer")?; + + let concurrency_limit = NonZeroUsize::new( + parse_optional_integer("concurrency_limit", toml)? 
+ .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + ) + .context("Failed to parse 'concurrency_limit' as a positive integer")?; + + let storage = match (local_path, bucket_name, bucket_region) { + (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + (_, Some(_), None) => { + bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") + } + (_, None, Some(_)) => { + bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") + } + (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) + .transpose()?, + endpoint: toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?, + concurrency_limit, + }), + (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( + parse_toml_string("local_path", local_path)?, + )), + (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + }; + + Ok(RemoteStorageConfig { + max_concurrent_syncs, + max_sync_errors, + storage, + }) + } +} + +// Helper functions to parse a toml Item +fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> +where + I: TryFrom, + E: std::error::Error + Send + Sync + 'static, +{ + let toml_integer = match item.get(name) { + Some(item) => item + .as_integer() + .with_context(|| format!("configure option {name} is not an integer"))?, + None => return Ok(None), + }; + + I::try_from(toml_integer) + .map(Some) + .with_context(|| format!("configure option {name} is too large")) +} + +fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { + let s = item + .as_str() + .with_context(|| format!("configure option {name} is not a string"))?; + 
Ok(s.to_string()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 50243352ee..df1581fb51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,6 +5,7 @@ //! volume is mounted to the local FS. use std::{ + borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -17,10 +18,16 @@ use tokio::{ }; use tracing::*; -use crate::path_with_suffix_extension; +use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName}; use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +impl RemoteObjectName for PathBuf { + fn object_name(&self) -> Option<&str> { + self.file_stem().and_then(|n| n.to_str()) + } +} + pub struct LocalFs { working_directory: PathBuf, storage_root: PathBuf, @@ -101,7 +108,18 @@ impl RemoteStorage for LocalFs { } async fn list(&self) -> anyhow::Result> { - get_all_files(&self.storage_root).await + get_all_files(&self.storage_root, true).await + } + + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result> { + let path = match prefix { + Some(prefix) => Cow::Owned(self.storage_root.join(prefix)), + None => Cow::Borrowed(&self.storage_root), + }; + get_all_files(path.as_ref(), false).await } async fn upload( @@ -192,15 +210,12 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { - let mut source = io::BufReader::new( + async fn download(&self, from: &Self::RemoteObjectId) -> Result { + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? 
{ + let source = io::BufReader::new( fs::OpenOptions::new() .read(true) .open(&file_path) @@ -210,22 +225,20 @@ impl RemoteStorage for LocalFs { "Failed to open source file '{}' to use in the download", file_path.display() ) - })?, + }) + .map_err(DownloadError::Other)?, ); - io::copy(&mut source, to).await.with_context(|| { - format!( - "Failed to download file '{}' from the local storage", - file_path.display() - ) - })?; - source.flush().await?; - self.read_storage_metadata(&file_path).await + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; + Ok(Download { + metadata, + download_stream: Box::pin(source), + }) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + Err(DownloadError::NotFound) } } @@ -234,22 +247,19 @@ impl RemoteStorage for LocalFs { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { + ) -> Result { if let Some(end_exclusive) = end_exclusive { - ensure!( - end_exclusive > start_inclusive, - "Invalid range, start ({}) is bigger then end ({:?})", - start_inclusive, - end_exclusive - ); + if end_exclusive <= start_inclusive { + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})"))); + }; if start_inclusive == end_exclusive.saturating_sub(1) { - return Ok(None); + return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let file_path = self.resolve_in_storage(from)?; - - if file_path.exists() && file_path.is_file() { + let file_path = self + .resolve_in_storage(from) + .map_err(DownloadError::BadInput)?; + if file_exists(&file_path).map_err(DownloadError::BadInput)? 
{ let mut source = io::BufReader::new( fs::OpenOptions::new() .read(true) @@ -260,31 +270,31 @@ impl RemoteStorage for LocalFs { "Failed to open source file '{}' to use in the download", file_path.display() ) - })?, + }) + .map_err(DownloadError::Other)?, ); source .seek(io::SeekFrom::Start(start_inclusive)) .await - .context("Failed to seek to the range start in a local storage file")?; - match end_exclusive { - Some(end_exclusive) => { - io::copy(&mut source.take(end_exclusive - start_inclusive), to).await - } - None => io::copy(&mut source, to).await, - } - .with_context(|| { - format!( - "Failed to download file '{}' range from the local storage", - file_path.display() - ) - })?; + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + let metadata = self + .read_storage_metadata(&file_path) + .await + .map_err(DownloadError::Other)?; - self.read_storage_metadata(&file_path).await + Ok(match end_exclusive { + Some(end_exclusive) => Download { + metadata, + download_stream: Box::pin(source.take(end_exclusive - start_inclusive)), + }, + None => Download { + metadata, + download_stream: Box::pin(source), + }, + }) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + Err(DownloadError::NotFound) } } @@ -307,6 +317,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, + recursive: bool, ) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, @@ -323,7 +334,11 @@ where if file_type.is_symlink() { debug!("{:?} us a symlink, skipping", entry_path) } else if file_type.is_dir() { - paths.extend(get_all_files(entry_path).await?.into_iter()) + if recursive { + paths.extend(get_all_files(entry_path, true).await?.into_iter()) + } else { + paths.push(dir_entry.path()) + } } else { paths.push(dir_entry.path()); } @@ -352,6 +367,19 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> 
Ok(()) } +fn file_exists(file_path: &Path) -> anyhow::Result { + if file_path.exists() { + ensure!( + file_path.is_file(), + "file path '{}' is not a file", + file_path.display() + ); + Ok(true) + } else { + Ok(false) + } +} + #[cfg(test)] mod pure_tests { use tempfile::tempdir; @@ -518,6 +546,31 @@ mod fs_tests { use std::{collections::HashMap, io::Write}; use tempfile::tempdir; + async fn read_and_assert_remote_file_contents( + storage: &LocalFs, + #[allow(clippy::ptr_arg)] + // have to use &PathBuf due to `storage.local_path` parameter requirements + remote_storage_path: &PathBuf, + expected_metadata: Option<&StorageMetadata>, + ) -> anyhow::Result { + let mut download = storage + .download(remote_storage_path) + .await + .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; + ensure!( + download.metadata.as_ref() == expected_metadata, + "Unexpected metadata returned for the downloaded file" + ); + + let mut contents = String::new(); + download + .download_stream + .read_to_string(&mut contents) + .await + .context("Failed to read remote file contents into string")?; + Ok(contents) + } + #[tokio::test] async fn upload_file() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); @@ -568,15 +621,7 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; - let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage.download(&upload_target, &mut content_bytes).await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - - content_bytes.flush().await?; - let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; + let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -584,13 +629,9 @@ mod fs_tests { ); let non_existing_path = PathBuf::from("somewhere").join("else"); - match 
storage.download(&non_existing_path, &mut io::sink()).await { - Ok(_) => panic!("Should not allow downloading non-existing storage files"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } + match storage.download(&non_existing_path).await { + Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys + other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } Ok(()) } @@ -603,58 +644,31 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; - let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage - .download_byte_range(&upload_target, 0, None, &mut full_range_bytes) - .await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - full_range_bytes.flush().await?; + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), - String::from_utf8(full_range_bytes.into_inner().into_inner())?, + full_range_download_contents, "Download full range should return the whole upload" ); - let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let same_byte = 1_000_000_000; - let metadata = storage - .download_byte_range( - &upload_target, - same_byte, - Some(same_byte + 1), // exclusive end - &mut zero_range_bytes, - ) - .await?; - assert!( - metadata.is_none(), - "No metadata should be returned for no metadata upload" - ); - zero_range_bytes.flush().await?; - assert!( - zero_range_bytes.into_inner().into_inner().is_empty(), - "Zero byte range should not download any part of the file" - ); - let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, 
second_part_local) = uploaded_bytes.split_at(3); - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage - .download_byte_range( - &upload_target, - 0, - Some(first_part_local.len() as u64), - &mut first_part_remote, - ) + let mut first_part_download = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; assert!( - metadata.is_none(), + first_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut first_part_download.download_stream, + &mut first_part_remote, + ) + .await?; first_part_remote.flush().await?; let first_part_remote = first_part_remote.into_inner().into_inner(); assert_eq!( @@ -663,20 +677,24 @@ mod fs_tests { "First part bytes should be returned when requested" ); - let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let metadata = storage + let mut second_part_download = storage .download_byte_range( &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), - &mut second_part_remote, ) .await?; assert!( - metadata.is_none(), + second_part_download.metadata.is_none(), "No metadata should be returned for no metadata upload" ); + let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut second_part_download.download_stream, + &mut second_part_remote, + ) + .await?; second_part_remote.flush().await?; let second_part_remote = second_part_remote.into_inner().into_inner(); assert_eq!( @@ -696,11 +714,30 @@ mod fs_tests { let upload_name = "upload_1"; let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let start = 1_000_000_000; + let end = start + 1; + match storage + .download_byte_range( + &upload_target, + start, + Some(end), // exclusive end + ) + .await + { 
+ Ok(_) => panic!("Should not allow downloading wrong ranges"), + Err(e) => { + let error_string = e.to_string(); + assert!(error_string.contains("zero bytes")); + assert!(error_string.contains(&start.to_string())); + assert!(error_string.contains(&end.to_string())); + } + } + let start = 10000; let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end), &mut io::sink()) + .download_byte_range(&upload_target, start, Some(end)) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -712,18 +749,6 @@ mod fs_tests { } } - let non_existing_path = PathBuf::from("somewhere").join("else"); - match storage - .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink()) - .await - { - Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&non_existing_path.display().to_string())); - } - } Ok(()) } @@ -762,35 +787,26 @@ mod fs_tests { let upload_target = upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; - let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?; - - content_bytes.flush().await?; - let contents = String::from_utf8(content_bytes.into_inner().into_inner())?; + let full_range_download_contents = + read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), - contents, + full_range_download_contents, "We should upload and download the same contents" ); - assert_eq!( - full_download_metadata.as_ref(), - Some(&metadata), - "We should get the same metadata back for full download" - ); - let uploaded_bytes = dummy_contents(upload_name).into_bytes(); let (first_part_local, _) = 
uploaded_bytes.split_at(3); - let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); - let partial_download_metadata = storage - .download_byte_range( - &upload_target, - 0, - Some(first_part_local.len() as u64), - &mut first_part_remote, - ) + let mut partial_download_with_metadata = storage + .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) .await?; + let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new())); + io::copy( + &mut partial_download_with_metadata.download_stream, + &mut first_part_remote, + ) + .await?; first_part_remote.flush().await?; let first_part_remote = first_part_remote.into_inner().into_inner(); assert_eq!( @@ -800,8 +816,8 @@ mod fs_tests { ); assert_eq!( - partial_download_metadata.as_ref(), - Some(&metadata), + partial_download_with_metadata.metadata, + Some(metadata), "We should get the same metadata back for partial download" ); @@ -843,7 +859,7 @@ mod fs_tests { } fn dummy_contents(name: &str) -> String { - format!("contents for {}", name) + format!("contents for {name}") } async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 01aaf7ca7e..ff52f033d1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -9,20 +9,87 @@ use std::path::{Path, PathBuf}; use anyhow::Context; use rusoto_core::{ credential::{InstanceMetadataProvider, StaticProvider}, - HttpClient, Region, + HttpClient, Region, RusotoError, }; use rusoto_s3::{ - DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client, - StreamingBody, S3, + DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, + S3Client, StreamingBody, S3, }; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; use tracing::debug; -use crate::{strip_path_prefix, RemoteStorage, S3Config}; +use crate::{ 
+ strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config, +}; use super::StorageMetadata; +pub(super) mod metrics { + use metrics::{register_int_counter_vec, IntCounterVec}; + use once_cell::sync::Lazy; + + static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric") + }); + + pub fn inc_get_object() { + S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); + } + + pub fn inc_get_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["get_object"]) + .inc(); + } + + pub fn inc_put_object() { + S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); + } + + pub fn inc_put_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["put_object"]) + .inc(); + } + + pub fn inc_delete_object() { + S3_REQUESTS_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_delete_object_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["delete_object"]) + .inc(); + } + + pub fn inc_list_objects() { + S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); + } + + pub fn inc_list_objects_fail() { + S3_REQUESTS_FAIL_COUNT + .with_label_values(&["list_objects"]) + .inc(); + } +} + const S3_PREFIX_SEPARATOR: char = '/'; #[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)] @@ -52,6 +119,25 @@ impl S3ObjectKey { } } +impl RemoteObjectName for S3ObjectKey { + /// Turn a/b/c or a/b/c/ into c + fn object_name(&self) -> Option<&str> { + // corner case, char::to_string is not const, thats why this is more verbose than it needs to be + // see https://github.com/rust-lang/rust/issues/88674 + if 
self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR { + return None; + } + + if self.0.ends_with(S3_PREFIX_SEPARATOR) { + self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1) + } else { + self.0 + .rsplit_once(S3_PREFIX_SEPARATOR) + .map(|(_, last)| last) + } + } +} + /// AWS S3 storage. pub struct S3Bucket { workdir: PathBuf, @@ -122,6 +208,39 @@ impl S3Bucket { concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), }) } + + async fn download_object(&self, request: GetObjectRequest) -> Result { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 download") + .map_err(DownloadError::Other)?; + + metrics::inc_get_object(); + + match self.client.get_object(request).await { + Ok(object_output) => match object_output.body { + None => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Got no body for the S3 object given" + ))) + } + Some(body) => Ok(Download { + metadata: object_output.metadata.map(StorageMetadata), + download_stream: Box::pin(io::BufReader::new(body.into_async_read())), + }), + }, + Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound), + Err(e) => { + metrics::inc_get_object_fail(); + Err(DownloadError::Other(anyhow::anyhow!( + "Failed to download S3 object: {e}" + ))) + } + } + } } #[async_trait::async_trait] @@ -152,6 +271,9 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + let fetch_response = self .client .list_objects_v2(ListObjectsV2Request { @@ -160,7 +282,11 @@ impl RemoteStorage for S3Bucket { continuation_token, ..ListObjectsV2Request::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; document_keys.extend( fetch_response .contents @@ -178,6 +304,77 @@ impl RemoteStorage for S3Bucket { Ok(document_keys) } + /// Note: it 
wont include empty "directories" + async fn list_prefixes( + &self, + prefix: Option, + ) -> anyhow::Result> { + let list_prefix = match prefix { + Some(prefix) => { + let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default(); + // if there is no trailing / in default prefix and + // supplied prefix does not start with "/" insert it + if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) + || prefix.0.starts_with(S3_PREFIX_SEPARATOR)) + { + prefix_in_bucket.push(S3_PREFIX_SEPARATOR); + } + + prefix_in_bucket.push_str(&prefix.0); + // required to end with a separator + // otherwise request will return only the entry of a prefix + if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) { + prefix_in_bucket.push(S3_PREFIX_SEPARATOR); + } + Some(prefix_in_bucket) + } + None => self.prefix_in_bucket.clone(), + }; + + let mut document_keys = Vec::new(); + + let mut continuation_token = None; + loop { + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list")?; + + metrics::inc_list_objects(); + + let fetch_response = self + .client + .list_objects_v2(ListObjectsV2Request { + bucket: self.bucket_name.clone(), + prefix: list_prefix.clone(), + continuation_token, + delimiter: Some(S3_PREFIX_SEPARATOR.to_string()), + ..ListObjectsV2Request::default() + }) + .await + .map_err(|e| { + metrics::inc_list_objects_fail(); + e + })?; + + document_keys.extend( + fetch_response + .common_prefixes + .unwrap_or_default() + .into_iter() + .filter_map(|o| Some(S3ObjectKey(o.prefix?))), + ); + + match fetch_response.continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(document_keys) + } + async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, @@ -190,6 +387,8 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 upload")?; + + metrics::inc_put_object(); 
self.client .put_object(PutObjectRequest { body: Some(StreamingBody::new_with_size( @@ -201,35 +400,21 @@ impl RemoteStorage for S3Bucket { metadata: metadata.map(|m| m.0), ..PutObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_put_object_fail(); + e + })?; Ok(()) } - async fn download( - &self, - from: &Self::RemoteObjectId, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 download")?; - let object_output = self - .client - .get_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: from.key().to_owned(), - ..GetObjectRequest::default() - }) - .await?; - - if let Some(body) = object_output.body { - let mut from = io::BufReader::new(body.into_async_read()); - io::copy(&mut from, to).await?; - } - - Ok(object_output.metadata.map(StorageMetadata)) + async fn download(&self, from: &Self::RemoteObjectId) -> Result { + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + ..GetObjectRequest::default() + }) + .await } async fn download_byte_range( @@ -237,8 +422,7 @@ impl RemoteStorage for S3Bucket { from: &Self::RemoteObjectId, start_inclusive: u64, end_exclusive: Option, - to: &mut (impl io::AsyncWrite + Unpin + Send + Sync), - ) -> anyhow::Result> { + ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); @@ -246,27 +430,14 @@ impl RemoteStorage for S3Bucket { Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), None => format!("bytes={}-", start_inclusive), }); - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 range download")?; - let object_output = self - 
.client - .get_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: from.key().to_owned(), - range, - ..GetObjectRequest::default() - }) - .await?; - if let Some(body) = object_output.body { - let mut from = io::BufReader::new(body.into_async_read()); - io::copy(&mut from, to).await?; - } - - Ok(object_output.metadata.map(StorageMetadata)) + self.download_object(GetObjectRequest { + bucket: self.bucket_name.clone(), + key: from.key().to_owned(), + range, + ..GetObjectRequest::default() + }) + .await } async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> { @@ -275,13 +446,20 @@ impl RemoteStorage for S3Bucket { .acquire() .await .context("Concurrency limiter semaphore got closed during S3 delete")?; + + metrics::inc_delete_object(); + self.client .delete_object(DeleteObjectRequest { bucket: self.bucket_name.clone(), key: path.key().to_owned(), ..DeleteObjectRequest::default() }) - .await?; + .await + .map_err(|e| { + metrics::inc_delete_object_fail(); + e + })?; Ok(()) } } @@ -292,6 +470,25 @@ mod tests { use super::*; + #[test] + fn object_name() { + let k = S3ObjectKey("a/b/c".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = S3ObjectKey("a/b/c/".to_owned()); + assert_eq!(k.object_name(), Some("c")); + + let k = S3ObjectKey("a/".to_owned()); + assert_eq!(k.object_name(), Some("a")); + + // XXX is it impossible to have an empty key? 
+ let k = S3ObjectKey("".to_owned()); + assert_eq!(k.object_name(), None); + + let k = S3ObjectKey("/".to_owned()); + assert_eq!(k.object_name(), None); + } + #[test] fn download_destination() -> anyhow::Result<()> { let workdir = tempdir()?.path().to_owned(); diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 4983449f24..9bd860affb 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -5,7 +5,7 @@ DATA_DIR=$3 PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh index 07f4fe1e4f..ce58b349fc 100755 --- a/libs/utils/scripts/restore_from_wal_archive.sh +++ b/libs/utils/scripts/restore_from_wal_archive.sh @@ -5,7 +5,7 @@ PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR /tmp/pg_wals mkdir /tmp/pg_wals -env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 063d69557d..70f54ea02f 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -71,7 +71,7 @@ impl From for SerializeError { /// - Fixed 
integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn be_coder() -> impl Options { bincode::DefaultOptions::new() .with_big_endian() @@ -85,7 +85,7 @@ pub fn be_coder() -> impl Options { /// - Fixed integer encoding (i.e. 1u32 is 00000001 not 01) /// /// Does not allow trailing bytes in deserialization. If this is desired, you -/// may set [`Options::allow_trailing_bytes`] to explicitly accomodate this. +/// may set [`Options::allow_trailing_bytes`] to explicitly accommodate this. pub fn le_coder() -> impl Options { bincode::DefaultOptions::new() .with_little_endian() diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 15d4c7a81e..1b011bb73a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -64,7 +64,7 @@ pub mod signals; /// One thing to note is that .git is not available in docker (and it is bad to include it there). /// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required. /// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro. -/// Git version received from environment variable used as a fallback in git_version invokation. +/// Git version received from environment variable used as a fallback in git_version invocation. /// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option. /// So the build script will be run only when GIT_VERSION envvar has changed. 
/// diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index c09d8c67ce..3dab2a625c 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -26,6 +26,9 @@ impl Lsn { /// Maximum possible value for an LSN pub const MAX: Lsn = Lsn(u64::MAX); + /// Invalid value for InvalidXLogRecPtr, as defined in xlogdefs.h + pub const INVALID: Lsn = Lsn(0); + /// Subtract a number, returning None on overflow. pub fn checked_sub>(self, other: T) -> Option { let other: u64 = other.into(); @@ -103,6 +106,12 @@ impl Lsn { pub fn is_aligned(&self) -> bool { *self == self.align() } + + /// Return if the LSN is valid + /// mimics postgres XLogRecPtrIsInvalid macro + pub fn is_valid(self) -> bool { + self != Lsn::INVALID + } } impl From for Lsn { diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 857df0ec84..79dca96fcf 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -13,13 +13,10 @@ use std::fmt; use std::io::{self, Write}; use std::net::{Shutdown, SocketAddr, TcpStream}; use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; use tracing::*; -static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false); - pub trait Handler { /// Handle single query. /// postgres_backend will issue ReadyForQuery after calling this (this @@ -45,6 +42,10 @@ pub trait Handler { fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> { bail!("JWT auth failed") } + + fn is_shutdown_requested(&self) -> bool { + false + } } /// PostgresBackend protocol state. 
@@ -274,7 +275,7 @@ impl PostgresBackend { let mut unnamed_query_string = Bytes::new(); - while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) { + while !handler.is_shutdown_requested() { match self.read_message() { Ok(message) => { if let Some(msg) = message { @@ -336,11 +337,11 @@ impl PostgresBackend { let have_tls = self.tls_config.is_some(); match msg { FeMessage::StartupPacket(m) => { - trace!("got startup message {:?}", m); + trace!("got startup message {m:?}"); match m { FeStartupPacket::SslRequest => { - info!("SSL requested"); + debug!("SSL requested"); self.write_message(&BeMessage::EncryptionResponse(have_tls))?; if have_tls { @@ -349,7 +350,7 @@ impl PostgresBackend { } } FeStartupPacket::GssEncRequest => { - info!("GSS requested"); + debug!("GSS requested"); self.write_message(&BeMessage::EncryptionResponse(false))?; } FeStartupPacket::StartupMessage { .. } => { @@ -433,12 +434,7 @@ impl PostgresBackend { // full cause of the error, not just the top-level context + its trace. // We don't want to send that in the ErrorResponse though, // because it's not relevant to the compute node logs. - if query_string.starts_with("callmemaybe") { - // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed - error!("query handler for '{}' failed: {}", query_string, e); - } else { - error!("query handler for '{}' failed: {:?}", query_string, e); - } + error!("query handler for '{}' failed: {:?}", query_string, e); self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?; // TODO: untangle convoluted control flow if e.to_string().contains("failed to run") { @@ -475,7 +471,7 @@ impl PostgresBackend { self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?; } // NOTE there is no ReadyForQuery message. 
This handler is used - // for basebackup and it uses CopyOut which doesnt require + // for basebackup and it uses CopyOut which doesn't require // ReadyForQuery message and backend just switches back to // processing mode after sending CopyDone or ErrorResponse. } @@ -498,8 +494,3 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } } - -// Set the flag to inform connections to cancel -pub fn set_pgbackend_shutdown_requested() { - PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed); -} diff --git a/libs/utils/src/pq_proto.rs b/libs/utils/src/pq_proto.rs index ce86cf8c91..0a320f123c 100644 --- a/libs/utils/src/pq_proto.rs +++ b/libs/utils/src/pq_proto.rs @@ -269,7 +269,14 @@ impl FeStartupPacket { .next() .context("expected even number of params in StartupMessage")?; if name == "options" { - // deprecated way of passing params as cmd line args + // parsing options arguments "...&options=%3D+=..." + // '%3D' is '=' and '+' is ' ' + + // Note: we allow users that don't have SNI capabilities, + // to pass a special keyword argument 'project' + // to be used to determine the cluster name by the proxy. + + //TODO: write unit test for this and refactor in its own function. for cmdopt in value.split(' ') { let nameval: Vec<&str> = cmdopt.split('=').collect(); if nameval.len() == 2 { @@ -464,7 +471,7 @@ impl BeParameterStatusMessage<'static> { } } -// One row desciption in RowDescription packet. +// One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { pub name: &'a [u8], @@ -613,7 +620,7 @@ fn cstr_to_str(b: &Bytes) -> Result<&str> { impl<'a> BeMessage<'a> { /// Write message to the given buf. // Unlike the reading side, we use BytesMut - // here as msg len preceeds its body and it is handy to write it down first + // here as msg len precedes its body and it is handy to write it down first // and then fill the length. With Write we would have to either calc it // manually or have one more buffer. 
pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> { @@ -919,10 +926,10 @@ impl<'a> BeMessage<'a> { } } -// Zenith extension of postgres replication protocol -// See ZENITH_STATUS_UPDATE_TAG_BYTE +// Neon extension of postgres replication protocol +// See NEON_STATUS_UPDATE_TAG_BYTE #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -pub struct ZenithFeedback { +pub struct ReplicationFeedback { // Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, // Parts of StandbyStatusUpdate we resend to compute via safekeeper @@ -932,13 +939,13 @@ pub struct ZenithFeedback { pub ps_replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback. +// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. // Do not remove previously available fields because this might be backwards incompatible. -pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ZenithFeedback { - pub fn empty() -> ZenithFeedback { - ZenithFeedback { +impl ReplicationFeedback { + pub fn empty() -> ReplicationFeedback { + ReplicationFeedback { current_timeline_size: 0, ps_writelsn: 0, ps_applylsn: 0, @@ -947,7 +954,7 @@ impl ZenithFeedback { } } - // Serialize ZenithFeedback using custom format + // Serialize ReplicationFeedback using custom format // to support protocol extensibility. 
// // Following layout is used: @@ -958,7 +965,7 @@ impl ZenithFeedback { // uint32 - value length in bytes // value itself pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> { - buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys write_cstr(&Bytes::from("current_timeline_size"), buf)?; buf.put_i32(8); buf.put_u64(self.current_timeline_size); @@ -985,9 +992,9 @@ impl ZenithFeedback { Ok(()) } - // Deserialize ZenithFeedback message - pub fn parse(mut buf: Bytes) -> ZenithFeedback { - let mut zf = ZenithFeedback::empty(); + // Deserialize ReplicationFeedback message + pub fn parse(mut buf: Bytes) -> ReplicationFeedback { + let mut zf = ReplicationFeedback::empty(); let nfields = buf.get_u8(); let mut i = 0; while i < nfields { @@ -1028,14 +1035,14 @@ impl ZenithFeedback { _ => { let len = buf.get_i32(); warn!( - "ZenithFeedback parse. unknown key {} of len {}. Skip it.", + "ReplicationFeedback parse. unknown key {} of len {}. Skip it.", key, len ); buf.advance(len as usize); } } } - trace!("ZenithFeedback parsed is {:?}", zf); + trace!("ReplicationFeedback parsed is {:?}", zf); zf } } @@ -1045,9 +1052,9 @@ mod tests { use super::*; #[test] - fn test_zenithfeedback_serialization() { - let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + fn test_replication_feedback_serialization() { + let mut zf = ReplicationFeedback::empty(); + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. 
@@ -1055,14 +1062,14 @@ mod tests { let mut data = BytesMut::new(); zf.serialize(&mut data).unwrap(); - let zf_parsed = ZenithFeedback::parse(data.freeze()); + let zf_parsed = ReplicationFeedback::parse(data.freeze()); assert_eq!(zf, zf_parsed); } #[test] - fn test_zenithfeedback_unknown_key() { - let mut zf = ZenithFeedback::empty(); - // Fill zf wih some values + fn test_replication_feedback_unknown_key() { + let mut zf = ReplicationFeedback::empty(); + // Fill zf with some values zf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. @@ -1072,7 +1079,7 @@ mod tests { // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1; + *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; } write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap(); @@ -1080,7 +1087,7 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let zf_parsed = ZenithFeedback::parse(data.freeze()); + let zf_parsed = ReplicationFeedback::parse(data.freeze()); assert_eq!(zf, zf_parsed); } diff --git a/libs/utils/src/zid.rs b/libs/utils/src/zid.rs index 44d81cda50..6da5355f61 100644 --- a/libs/utils/src/zid.rs +++ b/libs/utils/src/zid.rs @@ -193,7 +193,7 @@ pub struct ZTenantId(ZId); zid_newtype!(ZTenantId); // A pair uniquely identifying Zenith instance. 
-#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct ZTenantTimelineId { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, @@ -218,7 +218,7 @@ impl ZTenantTimelineId { impl fmt::Display for ZTenantTimelineId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}-{}", self.tenant_id, self.timeline_id) + write!(f, "{}/{}", self.tenant_id, self.timeline_id) } } @@ -226,9 +226,9 @@ impl fmt::Display for ZTenantTimelineId { // by the console. #[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)] #[serde(transparent)] -pub struct ZNodeId(pub u64); +pub struct NodeId(pub u64); -impl fmt::Display for ZNodeId { +impl fmt::Display for NodeId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } diff --git a/neon_local/src/main.rs b/neon_local/src/main.rs index f04af9cfdd..b29cc6978c 100644 --- a/neon_local/src/main.rs +++ b/neon_local/src/main.rs @@ -14,7 +14,7 @@ use safekeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::exit; use std::str::FromStr; use utils::{ @@ -22,14 +22,14 @@ use utils::{ lsn::Lsn, postgres_backend::AuthType, project_git_version, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use pageserver::timelines::TimelineInfo; // Default id of a safekeeper node, if not specified on the command line. 
-const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); -const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1); +const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); @@ -159,6 +159,20 @@ fn main() -> Result<()> { .about("Create a new blank timeline") .arg(tenant_id_arg.clone()) .arg(branch_name_arg.clone())) + .subcommand(App::new("import") + .about("Import timeline from basebackup directory") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(Arg::new("node-name").long("node-name").takes_value(true) + .help("Name to assign to the imported timeline")) + .arg(Arg::new("base-tarfile").long("base-tarfile").takes_value(true) + .help("Basebackup tarfile to import")) + .arg(Arg::new("base-lsn").long("base-lsn").takes_value(true) + .help("Lsn the basebackup starts at")) + .arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true) + .help("Wal to add after base")) + .arg(Arg::new("end-lsn").long("end-lsn").takes_value(true) + .help("Lsn the basebackup ends at"))) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) @@ -523,7 +537,13 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an match tenant_match.subcommand() { Some(("list", _)) => { for t in pageserver.tenant_list()? 
{ - println!("{} {}", t.id, t.state); + println!( + "{} {}", + t.id, + t.state + .map(|s| s.to_string()) + .unwrap_or_else(|| String::from("")) + ); } } Some(("create", create_match)) => { @@ -613,6 +633,43 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - timeline.timeline_id, last_record_lsn, tenant_id, ); } + Some(("import", import_match)) => { + let tenant_id = get_tenant_id(import_match, env)?; + let timeline_id = parse_timeline_id(import_match)?.expect("No timeline id provided"); + let name = import_match + .value_of("node-name") + .ok_or_else(|| anyhow!("No node name provided"))?; + + // Parse base inputs + let base_tarfile = import_match + .value_of("base-tarfile") + .map(|s| PathBuf::from_str(s).unwrap()) + .ok_or_else(|| anyhow!("No base-tarfile provided"))?; + let base_lsn = Lsn::from_str( + import_match + .value_of("base-lsn") + .ok_or_else(|| anyhow!("No base-lsn provided"))?, + )?; + let base = (base_lsn, base_tarfile); + + // Parse pg_wal inputs + let wal_tarfile = import_match + .value_of("wal-tarfile") + .map(|s| PathBuf::from_str(s).unwrap()); + let end_lsn = import_match + .value_of("end-lsn") + .map(|s| Lsn::from_str(s).unwrap()); + // TODO validate both or none are provided + let pg_wal = end_lsn.zip(wal_tarfile); + + let mut cplane = ComputeControlPlane::load(env.clone())?; + println!("Importing timeline into pageserver ..."); + pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?; + println!("Creating node for imported timeline ..."); + env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?; + cplane.new_node(tenant_id, name, timeline_id, None, None)?; + println!("Done"); + } Some(("branch", branch_match)) => { let tenant_id = get_tenant_id(branch_match, env)?; let new_branch_name = branch_match @@ -860,7 +917,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { +fn 
get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result { if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { @@ -876,7 +933,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul // All the commands take an optional safekeeper name argument let sk_id = if let Some(id_str) = sub_args.value_of("id") { - ZNodeId(id_str.parse().context("while parsing safekeeper id")?) + NodeId(id_str.parse().context("while parsing safekeeper id")?) } else { DEFAULT_SAFEKEEPER_ID }; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 290f52e0b2..b7d97a67c0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [features] # It is simpler infra-wise to have failpoints enabled by default -# It shouldn't affect perf in any way because failpoints +# It shouldn't affect performance in any way because failpoints # are not placed in hot code paths default = ["failpoints"] profiling = ["pprof"] @@ -60,6 +60,8 @@ metrics = { path = "../libs/metrics" } utils = { path = "../libs/utils" } remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +close_fds = "0.3.2" +walkdir = "2.3.2" [dev-dependencies] hex-literal = "0.3" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 92d35130d8..3ec1ec9243 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,8 +10,10 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{anyhow, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::{BufMut, BytesMut}; +use fail::fail_point; +use itertools::Itertools; use std::fmt::Write as FmtWrite; use std::io; use std::io::Write; @@ -20,7 +22,7 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::SlruKind; +use crate::reltag::{RelTag, SlruKind}; use crate::repository::Timeline; use crate::DatadirTimelineImpl; use postgres_ffi::xlog_utils::*; @@ -30,26 +32,37 @@ use utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a> { - ar: Builder<&'a mut dyn Write>, +pub struct Basebackup<'a, W> +where + W: Write, +{ + ar: Builder>, timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, + full_backup: bool, + finished: bool, } -// Create basebackup with non-rel data in it. Omit relational data. +// Create basebackup with non-rel data in it. +// Only include relational data if 'full_backup' is true. // // Currently we use empty lsn in two cases: // * During the basebackup right after timeline creation // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, W> Basebackup<'a, W> +where + W: Write, +{ pub fn new( - write: &'a mut dyn Write, + write: W, timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + prev_lsn: Option, + full_backup: bool, + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). 
We include prev_record_lsn in the @@ -84,20 +97,34 @@ impl<'a> Basebackup<'a> { (end_of_timeline.prev, end_of_timeline.last) }; + // Consolidate the derived and the provided prev_lsn values + let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + if backup_prev != Lsn(0) { + ensure!(backup_prev == provided_prev_lsn) + } + provided_prev_lsn + } else { + backup_prev + }; + info!( - "taking basebackup lsn={}, prev_lsn={}", - backup_lsn, backup_prev + "taking basebackup lsn={}, prev_lsn={} (full_backup={})", + backup_lsn, prev_lsn, full_backup ); Ok(Basebackup { - ar: Builder::new(write), + ar: Builder::new(AbortableWrite::new(write)), timeline, lsn: backup_lsn, - prev_record_lsn: backup_prev, + prev_record_lsn: prev_lsn, + full_backup, + finished: false, }) } - pub fn send_tarball(&mut self) -> anyhow::Result<()> { + pub fn send_tarball(mut self) -> anyhow::Result<()> { + // TODO include checksum + // Create pgdata subdirs structure for dir in pg_constants::PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(*dir)?; @@ -130,18 +157,62 @@ impl<'a> Basebackup<'a> { // Create tablespace directories for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { self.add_dbdir(spcnode, dbnode, has_relmap_file)?; + + // Gather and send relational files in each database if full backup is requested. + if self.full_backup { + for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + self.add_rel(rel)?; + } + } } for xid in self.timeline.list_twophase_files(self.lsn)? { self.add_twophase_file(xid)?; } + fail_point!("basebackup-before-control-file", |_| { + bail!("failpoint basebackup-before-control-file") + }); + // Generate pg_control and bootstrap WAL segment. 
self.add_pgcontrol_file()?; self.ar.finish()?; + self.finished = true; debug!("all tarred up!"); Ok(()) } + fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + let nblocks = self.timeline.get_rel_size(tag, self.lsn)?; + + // Function that adds relation segment data to archive + let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { + let file_name = tag.to_segfile_name(segment_index as u32); + let header = new_tar_header(&file_name, data.len() as u64)?; + self.ar.append(&header, data.as_slice())?; + Ok(()) + }; + + // If the relation is empty, create an empty file + if nblocks == 0 { + add_file(0, &vec![])?; + return Ok(()); + } + + // Add a file for each chunk of blocks (aka segment) + let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize); + for (seg, blocks) in chunks.into_iter().enumerate() { + let mut segment_data: Vec = vec![]; + for blknum in blocks { + let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?; + segment_data.extend_from_slice(&img[..]); + } + + add_file(seg, &segment_data)?; + } + + Ok(()) + } + // // Generate SLRU segment files from repository. // @@ -331,6 +402,19 @@ impl<'a> Basebackup<'a> { } } +impl<'a, W> Drop for Basebackup<'a, W> +where + W: Write, +{ + /// If the basebackup was not finished, prevent the Archive::drop() from + /// writing the end-of-archive marker. + fn drop(&mut self) { + if !self.finished { + self.ar.get_mut().abort(); + } + } +} + // // Create new tarball entry header // @@ -366,3 +450,49 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result
{ header.set_cksum(); Ok(header) } + +/// A wrapper that passes through all data to the underlying Write, +/// until abort() is called. +/// +/// tar::Builder has an annoying habit of finishing the archive with +/// a valid tar end-of-archive marker (two 512-byte sectors of zeros), +/// even if an error occurs and we don't finish building the archive. +/// We'd rather abort writing the tarball immediately than construct +/// a seemingly valid but incomplete archive. This wrapper allows us +/// to swallow the end-of-archive marker that Builder::drop() emits, +/// without writing it to the underlying sink. +/// +struct AbortableWrite { + w: W, + aborted: bool, +} + +impl AbortableWrite { + pub fn new(w: W) -> Self { + AbortableWrite { w, aborted: false } + } + + pub fn abort(&mut self) { + self.aborted = true; + } +} + +impl Write for AbortableWrite +where + W: Write, +{ + fn write(&mut self, data: &[u8]) -> io::Result { + if self.aborted { + Ok(data.len()) + } else { + self.w.write(data) + } + } + fn flush(&mut self) -> io::Result<()> { + if self.aborted { + Ok(()) + } else { + self.w.flush() + } + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ac90500b97..b539964414 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -104,7 +104,7 @@ fn main() -> anyhow::Result<()> { return Ok(()); } - let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith")); + let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".neon")); let workdir = workdir .canonicalize() .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?; @@ -263,6 +263,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<() // start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); + pageserver::tenant_tasks::init_tenant_task_pool()?; + // initialize authentication for incoming connections let auth = match &conf.auth_type { 
AuthType::Trust | AuthType::MD5 => None, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index a9215c0701..01b626e046 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,9 +5,9 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config}; +use remote_storage::RemoteStorageConfig; use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; + use std::path::{Path, PathBuf}; use std::str::FromStr; use std::time::Duration; @@ -16,7 +16,7 @@ use toml_edit::{Document, Item}; use url::Url; use utils::{ postgres_backend::AuthType, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; use crate::layered_repository::TIMELINES_SEGMENT_NAME; @@ -34,7 +34,7 @@ pub mod defaults { pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - pub const DEFAULT_SUPERUSER: &str = "zenith_admin"; + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -78,7 +78,7 @@ pub mod defaults { pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers // can safely distinguish different pageservers - pub id: ZNodeId, + pub id: NodeId, /// Example (default): 127.0.0.1:64000 pub listen_pg_addr: String, @@ -114,7 +114,7 @@ pub struct PageServerConf { pub default_tenant_conf: TenantConf, /// A prefix to add in etcd brokers before every key. - /// Can be used for isolating different pageserver groups withing the same etcd cluster. + /// Can be used for isolating different pageserver groups within the same etcd cluster. pub broker_etcd_prefix: String, /// Etcd broker endpoints to connect to. 
@@ -180,7 +180,7 @@ struct PageServerConfigBuilder { auth_validation_public_key_path: BuilderValue>, remote_storage_config: BuilderValue>, - id: BuilderValue, + id: BuilderValue, profiling: BuilderValue, broker_etcd_prefix: BuilderValue, @@ -276,7 +276,7 @@ impl PageServerConfigBuilder { self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) } - pub fn id(&mut self, node_id: ZNodeId) { + pub fn id(&mut self, node_id: NodeId) { self.id = BuilderValue::Set(node_id) } @@ -394,12 +394,12 @@ impl PageServerConf { )), "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?)) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?)) } "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } - "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)), + "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), "broker_endpoints" => builder.broker_endpoints( @@ -480,68 +480,25 @@ impl PageServerConf { if let Some(pitr_interval) = item.get("pitr_interval") { t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?); } + if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") { + t_conf.walreceiver_connect_timeout = Some(parse_toml_duration( + "walreceiver_connect_timeout", + walreceiver_connect_timeout, + )?); + } + if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") { + t_conf.lagging_wal_timeout = Some(parse_toml_duration( + "lagging_wal_timeout", + lagging_wal_timeout, + )?); + } + if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { + t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?); + } Ok(t_conf) } - /// subroutine of parse_config(), to parse the `[remote_storage]` 
table. - fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result { - let local_path = toml.get("local_path"); - let bucket_name = toml.get("bucket_name"); - let bucket_region = toml.get("bucket_region"); - - let max_concurrent_syncs = NonZeroUsize::new( - parse_optional_integer("max_concurrent_syncs", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS), - ) - .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?; - - let max_sync_errors = NonZeroU32::new( - parse_optional_integer("max_sync_errors", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS), - ) - .context("Failed to parse 'max_sync_errors' as a positive integer")?; - - let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), - ) - .context("Failed to parse 'concurrency_limit' as a positive integer")?; - - let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), - (_, Some(_), None) => { - bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") - } - (_, None, Some(_)) => { - bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") - } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - concurrency_limit, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => 
bail!("local_path and bucket_name are mutually exclusive"), - }; - - Ok(RemoteStorageConfig { - max_concurrent_syncs, - max_sync_errors, - storage, - }) - } - #[cfg(test)] pub fn test_repo_dir(test_name: &str) -> PathBuf { PathBuf::from(format!("../tmp_check/test_{test_name}")) @@ -550,14 +507,14 @@ impl PageServerConf { #[cfg(test)] pub fn dummy_conf(repo_dir: PathBuf) -> Self { PageServerConf { - id: ZNodeId(0), + id: NodeId(0), wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - superuser: "zenith_admin".to_string(), + superuser: "cloud_admin".to_string(), workdir: repo_dir, pg_distrib_dir: PathBuf::new(), auth_type: AuthType::Trust, @@ -592,23 +549,6 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } -fn parse_optional_integer(name: &str, item: &toml_edit::Item) -> anyhow::Result> -where - I: TryFrom, - E: std::error::Error + Send + Sync + 'static, -{ - let toml_integer = match item.get(name) { - Some(item) => item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?, - None => return Ok(None), - }; - - I::try_from(toml_integer) - .map(Some) - .with_context(|| format!("configure option {name} is too large")) -} - fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -651,8 +591,12 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { #[cfg(test)] mod tests { - use std::fs; + use std::{ + fs, + num::{NonZeroU32, NonZeroUsize}, + }; + use remote_storage::{RemoteStorageKind, S3Config}; use tempfile::{tempdir, TempDir}; use super::*; @@ -693,7 +637,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: 
defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -737,7 +681,7 @@ id = 10 assert_eq!( parsed_config, PageServerConf { - id: ZNodeId(10), + id: NodeId(10), listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs index e9aaa72416..c947cebcb6 100644 --- a/pageserver/src/http/models.rs +++ b/pageserver/src/http/models.rs @@ -1,8 +1,10 @@ +use std::num::NonZeroU64; + use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use utils::{ lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTimelineId}, }; #[serde_as] @@ -33,6 +35,9 @@ pub struct TenantCreateRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } #[serde_as] @@ -42,7 +47,7 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId #[derive(Serialize)] pub struct StatusResponse { - pub id: ZNodeId, + pub id: NodeId, } impl TenantCreateRequest { @@ -68,6 +73,9 @@ pub struct TenantConfigRequest { pub gc_period: Option, pub image_creation_threshold: Option, pub pitr_interval: Option, + pub walreceiver_connect_timeout: Option, + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfigRequest { @@ -82,6 +90,21 @@ impl TenantConfigRequest { gc_period: None, image_creation_threshold: None, pitr_interval: None, + walreceiver_connect_timeout: None, + lagging_wal_timeout: None, + max_lsn_wal_lag: None, } } } + +/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. +/// We keep one WAL receiver active per timeline. 
+#[serde_as] +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct WalReceiverEntry { + pub wal_producer_connstr: Option, + #[serde_as(as = "Option")] + pub last_received_msg_lsn: Option, + /// the timestamp (in microseconds) of the last received message + pub last_received_msg_ts: Option, +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 55f7b3c5a7..2775a27e0f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -22,6 +22,49 @@ paths: properties: id: type: integer + + /v1/tenant/{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + get: + description: Get tenant status + responses: + "200": + description: Currently returns the flag whether the tenant has inprogress timeline downloads + content: + application/json: + schema: + $ref: "#/components/schemas/TenantInfo" + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id @@ -70,6 +113,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -84,13 +128,14 @@ paths: schema: type: string format: hex - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental get: description: Get info about the timeline + parameters: + - 
name: include-non-incremental-logical-size + in: query + schema: + type: string + description: Controls calculation of current_logical_size_non_incremental responses: "200": description: TimelineInfo @@ -122,6 +167,35 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + delete: + description: "Attempts to delete specified timeline. On 500 errors should be retried" + responses: + "200": + description: Ok + "400": + description: Error when no tenant id found in path or no timeline id + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver: parameters: @@ -171,7 +245,7 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/attach: + /v1/tenant/{tenant_id}/attach: parameters: - name: tenant_id in: path @@ -179,19 +253,13 @@ paths: schema: type: string format: hex - - name: timeline_id - in: path - required: true - schema: - type: string - format: hex post: - description: Attach remote timeline + description: Schedules attach operation to happen in the background for given tenant responses: - "200": - description: Timeline attaching scheduled + "202": + description: Tenant attaching scheduled "400": - description: Error when no tenant id found in path or no timeline id + description: Error when no tenant id found in path parameters content: application/json: schema: @@ -215,7 +283,7 @@ paths: schema: $ref: "#/components/schemas/NotFoundError" "409": - description: Timeline download is already in progress + description: Tenant download is already 
in progress content: application/json: schema: @@ -227,7 +295,6 @@ paths: schema: $ref: "#/components/schemas/Error" - /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach: parameters: - name: tenant_id @@ -243,10 +310,11 @@ paths: type: string format: hex post: - description: Detach local timeline + description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead + deprecated: true responses: "200": - description: Timeline detached + description: Ok "400": description: Error when no tenant id found in path or no timeline id content: @@ -272,6 +340,43 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/detach: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Detach local tenant + responses: + "200": + description: Tenant detached + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/: parameters: @@ -467,12 +572,13 @@ components: type: object required: - id - - state properties: id: type: string state: type: string + has_in_progress_downloads: + type: boolean TenantCreateInfo: type: object properties: @@ -567,6 +673,7 @@ components: type: integer current_logical_size_non_incremental: type: integer + WalReceiverEntry: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb650a34ed..236415cf58 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ 
use crate::repository::Repository; use crate::storage_sync; use crate::storage_sync::index::{RemoteIndex, RemoteTimeline}; use crate::tenant_config::TenantConfOpt; +use crate::tenant_mgr::TenantInfo; use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo}; use crate::{config::PageServerConf, tenant_mgr, timelines}; use utils::{ @@ -209,9 +210,9 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { +// TODO makes sense to provide tenant config right away the same way as it handled in tenant_create +async fn tenant_attach_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; - info!( - "Handling timeline {} attach for tenant: {}", - timeline_id, tenant_id, - ); + info!("Handling tenant attach {}", tenant_id,); tokio::task::spawn_blocking(move || { - if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() { - // TODO: maybe answer with 309 Not Modified here? 
- anyhow::bail!("Timeline is already present locally") + if tenant_mgr::get_tenant_state(tenant_id).is_some() { + anyhow::bail!("Tenant is already present locally") }; Ok(()) }) .await .map_err(ApiError::from_err)??; - let sync_id = ZTenantTimelineId { - tenant_id, - timeline_id, - }; let state = get_state(&request); let remote_index = &state.remote_index; let mut index_accessor = remote_index.write().await; - if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) { - if remote_timeline.awaits_download { + if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) { + if tenant_entry.has_in_progress_downloads() { return Err(ApiError::Conflict( - "Timeline download is already in progress".to_string(), + "Tenant download is already in progress".to_string(), )); } - remote_timeline.awaits_download = true; - storage_sync::schedule_layer_download(tenant_id, timeline_id); - return json_response(StatusCode::ACCEPTED, ()); - } else { - // no timeline in the index, release the lock to make the potentially lengthy download opetation - drop(index_accessor); - } - - let new_timeline = match try_download_index_part_data(state, sync_id).await { - Ok(Some(mut new_timeline)) => { - tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) - .await - .context("Failed to create new timeline directory")?; - new_timeline.awaits_download = true; - new_timeline + for (timeline_id, remote_timeline) in tenant_entry.iter_mut() { + storage_sync::schedule_layer_download(tenant_id, *timeline_id); + remote_timeline.awaits_download = true; } - Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())), + return json_response(StatusCode::ACCEPTED, ()); + } + // no tenant in the index, release the lock to make the potentially lengthy download operation + drop(index_accessor); + + // download index parts for every tenant timeline + let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await { +
Ok(Some(remote_timelines)) => remote_timelines, + Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())), Err(e) => { - error!("Failed to retrieve remote timeline data: {:?}", e); + error!("Failed to retrieve remote tenant data: {:?}", e); return Err(ApiError::NotFound( - "Failed to retrieve remote timeline".to_string(), + "Failed to retrieve remote tenant".to_string(), )); } }; + // recheck that download is not in progress because + // we've released the lock to avoid holding it during the download let mut index_accessor = remote_index.write().await; - match index_accessor.timeline_entry_mut(&sync_id) { - Some(remote_timeline) => { - if remote_timeline.awaits_download { + let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) { + Some(tenant_entry) => { + if tenant_entry.has_in_progress_downloads() { return Err(ApiError::Conflict( - "Timeline download is already in progress".to_string(), + "Tenant download is already in progress".to_string(), )); } - remote_timeline.awaits_download = true; + tenant_entry } - None => index_accessor.add_timeline_entry(sync_id, new_timeline), + None => index_accessor.add_tenant_entry(tenant_id), + }; + + // populate remote index with the data from index part and create directories on the local filesystem + for (timeline_id, mut remote_timeline) in remote_timelines { + tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id)) + .await + .context("Failed to create new timeline directory")?; + + remote_timeline.awaits_download = true; + tenant_entry.insert(timeline_id, remote_timeline); + // schedule actual download + storage_sync::schedule_layer_download(tenant_id, timeline_id); } - storage_sync::schedule_layer_download(tenant_id, timeline_id); + json_response(StatusCode::ACCEPTED, ()) } -async fn try_download_index_part_data( +/// Note: is expensive from s3 access perspective, +/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts` +async fn 
gather_tenant_timelines_index_parts( state: &State, - sync_id: ZTenantTimelineId, -) -> anyhow::Result> { - let index_part = match state.remote_storage.as_ref() { + tenant_id: ZTenantId, +) -> anyhow::Result>> { + let index_parts = match state.remote_storage.as_ref() { Some(GenericRemoteStorage::Local(local_storage)) => { - storage_sync::download_index_part(state.conf, local_storage, sync_id).await + storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id) + .await } + // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones + // because it is a different instance. We can move this limit to some global static + // or use one instance everywhere. Some(GenericRemoteStorage::S3(s3_storage)) => { - storage_sync::download_index_part(state.conf, s3_storage, sync_id).await + storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id) + .await } None => return Ok(None), } - .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?; + .with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?; - let timeline_path = state - .conf - .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - RemoteTimeline::from_index_part(&timeline_path, index_part) - .map(Some) - .with_context(|| { - format!("Failed to convert index part into remote timeline for timeline {sync_id}") - }) + let mut remote_timelines = Vec::with_capacity(index_parts.len()); + for (timeline_id, index_part) in index_parts { + let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) + .with_context(|| { + format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}") + })?; + remote_timelines.push((timeline_id, remote_timeline)); + } + Ok(Some(remote_timelines)) } -async fn timeline_detach_handler(request: Request) -> Result, ApiError> 
{ +async fn timeline_delete_handler(request: Request) -> Result, ApiError> { let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); tokio::task::spawn_blocking(move || { - let _enter = - info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id) - .entered(); - let state = get_state(&request); - tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id) + let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); + tenant_mgr::delete_timeline(tenant_id, timeline_id) }) .await .map_err(ApiError::from_err)??; + let mut remote_index = state.remote_index.write().await; + remote_index.remove_timeline_entry(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + + json_response(StatusCode::OK, ()) +} + +async fn tenant_detach_handler(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + let conf = state.conf; + tokio::task::spawn_blocking(move || { + let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered(); + tenant_mgr::detach_tenant(conf, tenant_id) + }) + .await + .map_err(ApiError::from_err)??; + + let mut remote_index = state.remote_index.write().await; + remote_index.remove_tenant_entry(&tenant_id); + json_response(StatusCode::OK, ()) } @@ -372,9 +400,13 @@ async fn tenant_list_handler(request: Request) -> Result, A // check for management permission check_permission(&request, None)?; + let state = get_state(&request); + // clone to avoid holding the lock while awaiting for blocking task + let remote_index = state.remote_index.read().await.clone(); + let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("tenant_list").entered(); - 
crate::tenant_mgr::list_tenants(&remote_index) }) .await .map_err(ApiError::from_err)?; @@ -382,6 +414,34 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } +async fn tenant_status(request: Request) -> Result, ApiError> { + let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + // if tenant is in progress of downloading it can be absent in global tenant map + let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id)) + .await + .map_err(ApiError::from_err)?; + + let state = get_state(&request); + let remote_index = &state.remote_index; + + let index_accessor = remote_index.read().await; + let has_in_progress_downloads = index_accessor + .tenant_entry(&tenant_id) + .ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))? + .has_in_progress_downloads(); + + json_response( + StatusCode::OK, + TenantInfo { + id: tenant_id, + state: tenant_state, + has_in_progress_downloads: Some(has_in_progress_downloads), + }, + ) +} + async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { // check for management permission check_permission(&request, None)?; @@ -402,6 +462,19 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result( ) -> Result<()> { let mut pg_control: Option = None; + // TODO this should be start_lsn, which is not necessarily equal to end_lsn (aka lsn) + // Then fishing out pg_control would be unnecessary let mut modification = tline.begin_modification(lsn); modification.init_empty()?; - // Scan 'global' - let mut relfiles: Vec = Vec::new(); - for direntry in fs::read_dir(path.join("global"))?
{ - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, + // Import all but pg_wal + let all_but_wal = WalkDir::new(path) + .into_iter() + .filter_entry(|entry| !entry.path().ends_with("pg_wal")); + for entry in all_but_wal { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + if metadata.is_file() { + let absolute_path = entry.path(); + let relative_path = absolute_path.strip_prefix(path)?; - Some("pg_control") => { - pg_control = Some(import_control_file(&mut modification, &direntry.path())?); + let file = File::open(absolute_path)?; + let len = metadata.len() as usize; + if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? { + pg_control = Some(control_file); } - Some("pg_filenode.map") => { - import_relmap_file( - &mut modification, - pg_constants::GLOBALTABLESPACE_OID, - 0, - &direntry.path(), - )?; - } - - // Load any relation files into the page server (but only after the other files) - _ => relfiles.push(direntry.path()), + modification.flush()?; } } - for relfile in relfiles { - import_relfile( - &mut modification, - &relfile, - pg_constants::GLOBALTABLESPACE_OID, - 0, - )?; - } - - // Scan 'base'. It contains database dirs, the database OID is the filename. - // E.g. 'base/12345', where 12345 is the database OID. - for direntry in fs::read_dir(path.join("base"))? { - let direntry = direntry?; - - //skip all temporary files - if direntry.file_name().to_string_lossy() == "pgsql_tmp" { - continue; - } - - let dboid = direntry.file_name().to_string_lossy().parse::()?; - - let mut relfiles: Vec = Vec::new(); - for direntry in fs::read_dir(direntry.path())? 
{ - let direntry = direntry?; - match direntry.file_name().to_str() { - None => continue, - - Some("PG_VERSION") => { - //modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; - } - Some("pg_filenode.map") => import_relmap_file( - &mut modification, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - &direntry.path(), - )?, - - // Load any relation files into the page server - _ => relfiles.push(direntry.path()), - } - } - for relfile in relfiles { - import_relfile( - &mut modification, - &relfile, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - )?; - } - } - for entry in fs::read_dir(path.join("pg_xact"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { - let entry = entry?; - import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?; - } - for entry in fs::read_dir(path.join("pg_twophase"))? { - let entry = entry?; - let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?; - import_twophase_file(&mut modification, xid, &entry.path())?; - } - // TODO: Scan pg_tblspc // We're done importing all the data files. modification.commit()?; @@ -158,31 +89,30 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_relfile( +fn import_rel( modification: &mut DatadirModification, path: &Path, spcoid: Oid, dboid: Oid, + mut reader: Reader, + len: usize, ) -> anyhow::Result<()> { // Does it look like a relation file? 
trace!("importing rel file {}", path.display()); - let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy()) - .map_err(|e| { - warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); - e - })?; + let filename = &path + .file_name() + .expect("missing rel filename") + .to_string_lossy(); + let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| { + warn!("unrecognized file in postgres datadir: {:?} ({})", path, e); + e + })?; - let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let len = file.metadata().unwrap().len(); - ensure!(len % pg_constants::BLCKSZ as u64 == 0); - let nblocks = len / pg_constants::BLCKSZ as u64; - - if segno != 0 { - todo!(); - } + ensure!(len % pg_constants::BLCKSZ as usize == 0); + let nblocks = len / pg_constants::BLCKSZ as usize; let rel = RelTag { spcnode: spcoid, @@ -190,11 +120,22 @@ fn import_relfile( relnode, forknum, }; - modification.put_rel_creation(rel, nblocks as u32)?; let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + + // Call put_rel_creation for every segment of the relation, + // because there is no guarantee about the order in which we are processing segments. + // ignore "relation already exists" error + if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) { + if e.to_string().contains("already exists") { + debug!("relation {} already exists. we must be extending it", rel); + } else { + return Err(e); + } + } + loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; @@ -204,7 +145,9 @@ fn import_relfile( Err(err) => match err.kind() { std::io::ErrorKind::UnexpectedEof => { // reached EOF. That's expected. 
- ensure!(blknum == nblocks as u32, "unexpected EOF"); + let relative_blknum = + blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); + ensure!(relative_blknum == nblocks as u32, "unexpected EOF"); break; } _ => { @@ -215,96 +158,43 @@ fn import_relfile( blknum += 1; } + // Update relation size + // + // If we process rel segments out of order, + // put_rel_extend will skip the update. + modification.put_rel_extend(rel, blknum)?; + Ok(()) } -/// Import a relmapper (pg_filenode.map) file into the repository -fn import_relmap_file( - modification: &mut DatadirModification, - spcnode: Oid, - dbnode: Oid, - path: &Path, -) -> Result<()> { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing relmap file {}", path.display()); - - modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?; - Ok(()) -} - -/// Import a twophase state file (pg_twophase/) into the repository -fn import_twophase_file( - modification: &mut DatadirModification, - xid: TransactionId, - path: &Path, -) -> Result<()> { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing non-rel file {}", path.display()); - - modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?; - Ok(()) -} - -/// -/// Import pg_control file into the repository. -/// -/// The control file is imported as is, but we also extract the checkpoint record -/// from it and store it separated. 
-fn import_control_file( - modification: &mut DatadirModification, - path: &Path, -) -> Result { - let mut file = File::open(path)?; - let mut buffer = Vec::new(); - // read the whole file - file.read_to_end(&mut buffer)?; - - trace!("importing control file {}", path.display()); - - // Import it as ControlFile - modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?; - - // Extract the checkpoint record and import it separately. - let pg_control = ControlFileData::decode(&buffer)?; - let checkpoint_bytes = pg_control.checkPointCopy.encode()?; - modification.put_checkpoint(checkpoint_bytes)?; - - Ok(pg_control) -} - -/// /// Import an SLRU segment file /// -fn import_slru_file( +fn import_slru( modification: &mut DatadirModification, slru: SlruKind, path: &Path, + mut reader: Reader, + len: usize, ) -> Result<()> { trace!("importing slru file {}", path.display()); - let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; - let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?; + let filename = &path + .file_name() + .expect("missing slru filename") + .to_string_lossy(); + let segno = u32::from_str_radix(filename, 16)?; - let len = file.metadata().unwrap().len(); - ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ - let nblocks = len / pg_constants::BLCKSZ as u64; + ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / pg_constants::BLCKSZ as usize; - ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64); + ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification.put_slru_segment_creation(slru, segno, nblocks as u32)?; let mut rpageno = 0; loop { - let r = file.read_exact(&mut buf); + let r = reader.read_exact(&mut buf); match r { Ok(_) => { modification.put_slru_page_image( @@ -396,10 +286,272 @@ fn import_wal( } if last_lsn != startpoint { - debug!("reached 
end of WAL at {}", last_lsn); + info!("reached end of WAL at {}", last_lsn); } else { info!("no WAL to import at {}", last_lsn); } Ok(()) } + +pub fn import_basebackup_from_tar( + tline: &mut DatadirTimeline, + reader: Reader, + base_lsn: Lsn, +) -> Result<()> { + info!("importing base at {}", base_lsn); + let mut modification = tline.begin_modification(base_lsn); + modification.init_empty()?; + + let mut pg_control: Option = None; + + // Import base + for base_tar_entry in tar::Archive::new(reader).entries()? { + let entry = base_tar_entry?; + let header = entry.header(); + let len = header.entry_size()? as usize; + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? { + // We found the pg_control file. + pg_control = Some(res); + } + modification.flush()?; + } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + } + _ => { + panic!("tar::EntryType::?? 
{}", file_path.display()); + } + } + } + + // sanity check: ensure that pg_control is loaded + let _pg_control = pg_control.context("pg_control file not found")?; + + modification.commit()?; + Ok(()) +} + +pub fn import_wal_from_tar( + tline: &mut DatadirTimeline, + reader: Reader, + start_lsn: Lsn, + end_lsn: Lsn, +) -> Result<()> { + // Set up walingest mutable state + let mut waldecoder = WalStreamDecoder::new(start_lsn); + let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE); + let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE); + let mut last_lsn = start_lsn; + let mut walingest = WalIngest::new(tline, start_lsn)?; + + // Ingest wal until end_lsn + info!("importing wal until {}", end_lsn); + let mut pg_wal_tar = tar::Archive::new(reader); + let mut pg_wal_entries_iter = pg_wal_tar.entries()?; + while last_lsn <= end_lsn { + let bytes = { + let entry = pg_wal_entries_iter.next().expect("expected more wal")?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + + match header.entry_type() { + tar::EntryType::Regular => { + // FIXME: assume postgresql tli 1 for now + let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE); + let file_name = file_path + .file_name() + .expect("missing wal filename") + .to_string_lossy(); + ensure!(expected_filename == file_name); + + debug!("processing wal file {:?}", file_path); + read_all_bytes(entry)? + } + tar::EntryType::Directory => { + debug!("directory {:?}", file_path); + continue; + } + _ => { + panic!("tar::EntryType::?? {}", file_path.display()); + } + } + }; + + waldecoder.feed_bytes(&bytes[offset..]); + + while last_lsn <= end_lsn { + if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ + walingest.ingest_record(tline, recdata, lsn)?; + last_lsn = lsn; + + debug!("imported record at {} (end {})", lsn, end_lsn); + } + } + + debug!("imported records up to {}", last_lsn); + segno += 1; + offset = 0; + } + + if last_lsn != start_lsn { + info!("reached end of WAL at {}", last_lsn); + } else { + info!("there was no WAL to import at {}", last_lsn); + } + + // Log any extra unused files + for e in &mut pg_wal_entries_iter { + let entry = e?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + info!("skipping {:?}", file_path); + } + + Ok(()) +} + +pub fn import_file( + modification: &mut DatadirModification, + file_path: &Path, + reader: Reader, + len: usize, +) -> Result> { + debug!("looking at {:?}", file_path); + + if file_path.starts_with("global") { + let spcnode = pg_constants::GLOBALTABLESPACE_OID; + let dbnode = 0; + + match file_path + .file_name() + .expect("missing filename") + .to_string_lossy() + .as_ref() + { + "pg_control" => { + let bytes = read_all_bytes(reader)?; + + // Extract the checkpoint record and import it separately. 
+ let pg_control = ControlFileData::decode(&bytes[..])?; + let checkpoint_bytes = pg_control.checkPointCopy.encode()?; + modification.put_checkpoint(checkpoint_bytes)?; + debug!("imported control file"); + + // Import it as ControlFile + modification.put_control_file(bytes)?; + return Ok(Some(pg_control)); + } + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("base") { + let spcnode = pg_constants::DEFAULTTABLESPACE_OID; + let dbnode: u32 = file_path + .iter() + .nth(1) + .expect("invalid file path, expected dbnode") + .to_string_lossy() + .parse()?; + + match file_path + .file_name() + .expect("missing base filename") + .to_string_lossy() + .as_ref() + { + "pg_filenode.map" => { + let bytes = read_all_bytes(reader)?; + modification.put_relmap_file(spcnode, dbnode, bytes)?; + debug!("imported relmap file") + } + "PG_VERSION" => { + debug!("ignored"); + } + _ => { + import_rel(modification, file_path, spcnode, dbnode, reader, len)?; + debug!("imported rel creation"); + } + } + } else if file_path.starts_with("pg_xact") { + let slru = SlruKind::Clog; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported clog slru"); + } else if file_path.starts_with("pg_multixact/offsets") { + let slru = SlruKind::MultiXactOffsets; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact offsets slru"); + } else if file_path.starts_with("pg_multixact/members") { + let slru = SlruKind::MultiXactMembers; + + import_slru(modification, slru, file_path, reader, len)?; + debug!("imported multixact members slru"); + } else if file_path.starts_with("pg_twophase") { + let file_name = &file_path + .file_name() + .expect("missing 
twophase filename") + .to_string_lossy(); + let xid = u32::from_str_radix(file_name, 16)?; + + let bytes = read_all_bytes(reader)?; + modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; + debug!("imported twophase file"); + } else if file_path.starts_with("pg_wal") { + debug!("found wal file in base section. ignore it"); + } else if file_path.starts_with("zenith.signal") { + // Parse zenith signal file to set correct previous LSN + let bytes = read_all_bytes(reader)?; + // zenith.signal format is "PREV LSN: prev_lsn" + // TODO write serialization and deserialization in the same place. + let zenith_signal = std::str::from_utf8(&bytes)?.trim(); + let prev_lsn = match zenith_signal { + "PREV LSN: none" => Lsn(0), + "PREV LSN: invalid" => Lsn(0), + other => { + let split = other.split(':').collect::>(); + split[1] + .trim() + .parse::() + .context("can't parse zenith.signal")? + } + }; + + // zenith.signal is not necessarily the last file, that we handle + // but it is ok to call `finish_write()`, because final `modification.commit()` + // will update lsn once more to the final one. + let writer = modification.tline.tline.writer(); + writer.finish_write(prev_lsn); + + debug!("imported zenith signal {}", prev_lsn); + } else if file_path.starts_with("pg_tblspc") { + // TODO Backups exported from neon won't have pg_tblspc, but we will need + // this to import arbitrary postgres databases. 
+ bail!("Importing pg_tblspc is not implemented"); + } else { + debug!("ignored"); + } + + Ok(None) +} + +fn read_all_bytes(mut reader: Reader) -> Result { + let mut buf: Vec = vec![]; + reader.read_to_end(&mut buf)?; + Ok(Bytes::copy_from_slice(&buf[..])) +} diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index f6f0d7b7cf..da213704f3 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -15,7 +15,7 @@ pub struct KeySpace { impl KeySpace { /// /// Partition a key space into roughly chunks of roughly 'target_size' bytes - /// in each patition. + /// in each partition. /// pub fn partition(&self, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a83907430e..cead2e9222 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -4,7 +4,7 @@ //! The functions here are responsible for locating the correct layer for the //! get/put call, tracing timeline branching history as needed. //! -//! The files are stored in the .zenith/tenants//timelines/ +//! The files are stored in the .neon/tenants//timelines/ //! directory. See layered_repository/README for how the files are managed. //! In addition to the layer files, there is a metadata file in the same //! 
directory that contains information about the timeline, in particular its @@ -25,6 +25,7 @@ use std::collections::{BTreeSet, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; +use std::num::NonZeroU64; use std::ops::{Bound::Included, Deref, Range}; use std::path::{Path, PathBuf}; use std::sync::atomic::{self, AtomicBool}; @@ -33,13 +34,11 @@ use std::time::{Duration, Instant, SystemTime}; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; -use crate::keyspace::KeySpace; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::storage_sync::index::RemoteIndex; use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::repository::{ - GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter, -}; +use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter}; use crate::repository::{Key, Value}; use crate::tenant_mgr; use crate::thread_mgr; @@ -147,7 +146,7 @@ lazy_static! { .expect("failed to define a metric"); } -/// Parts of the `.zenith/tenants//timelines/` directory prefix. +/// Parts of the `.neon/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// @@ -157,6 +156,18 @@ pub struct LayeredRepository { // Global pageserver config parameters pub conf: &'static PageServerConf, + // Allows us to gracefully cancel operations that edit the directory + // that backs this layered repository. Usage: + // + // Use `let _guard = file_lock.try_read()` while writing any files. + // Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish. + // + // TODO try_read this lock during checkpoint as well to prevent race + // between checkpoint and detach/delete. + // TODO try_read this lock for all gc/compaction operations, not just + // ones scheduled by the tenant task manager. 
+ pub file_lock: RwLock<()>, + // Overridden tenant-specific config parameters. // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. @@ -219,43 +230,52 @@ impl Repository for LayeredRepository { fn create_empty_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); + let vacant_timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(_) => bail!("Timeline already exists"), + Entry::Vacant(vacant_entry) => vacant_entry, + }; + + let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id); + if timeline_path.exists() { + bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.") + } // Create the timeline directory, and write initial metadata to file. - crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?; + crashsafe_dir::create_dir_all(timeline_path)?; let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn); - Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?; + Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?; let timeline = LayeredTimeline::new( self.conf, Arc::clone(&self.tenant_conf), metadata, None, - timelineid, + timeline_id, self.tenant_id, Arc::clone(&self.walredo_mgr), self.upload_layers, ); timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn); + // Insert if not exists let timeline = Arc::new(timeline); - let r = timelines.insert( - timelineid, - LayeredTimelineEntry::Loaded(Arc::clone(&timeline)), - ); - ensure!( - r.is_none(), - "assertion failure, inserted duplicate timeline" - ); + vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline))); + Ok(timeline) } /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> 
{ + fn branch_timeline( + &self, + src: ZTimelineId, + dst: ZTimelineId, + start_lsn: Option, + ) -> Result<()> { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. @@ -268,6 +288,14 @@ impl Repository for LayeredRepository { .context("failed to load timeline for branching")? .ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?; let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN + let start_lsn = start_lsn.unwrap_or_else(|| { + let lsn = src_timeline.get_last_record_lsn(); + info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + lsn + }); + src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context("invalid branch start lsn")?; @@ -314,19 +342,19 @@ impl Repository for LayeredRepository { /// metrics collection. 
fn gc_iteration( &self, - target_timelineid: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, ) -> Result { - let timeline_str = target_timelineid + let timeline_str = target_timeline_id .map(|x| x.to_string()) .unwrap_or_else(|| "-".to_string()); STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc) + self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) }) } @@ -393,50 +421,60 @@ impl Repository for LayeredRepository { Ok(()) } - fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> { + // in order to be retriable detach needs to be idempotent + // (or at least to a point that each time the detach is called it can make progress) let mut timelines = self.timelines.lock().unwrap(); - // check no child timelines, because detach will remove files, which will brake child branches - // FIXME this can still be violated because we do not guarantee - // that all ancestors are downloaded/attached to the same pageserver - let num_children = timelines + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children_exist = timelines .iter() - .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)) - .count(); + .any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id)); ensure!( - num_children == 0, + !children_exist, "Cannot detach timeline which has child timelines" ); + let timeline_entry = match timelines.entry(timeline_id) { + Entry::Occupied(e) => e, + Entry::Vacant(_) => bail!("timeline not found"), + }; + + // try to acquire gc and compaction locks to prevent errors from missing files + let _gc_guard = self + .gc_cs + .try_lock() 
+ .map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?; + + let compaction_guard = timeline_entry.get().compaction_guard()?; + + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + info!("detach removed files"); + + drop(compaction_guard); + timeline_entry.remove(); - ensure!( - timelines.remove(&timeline_id).is_some(), - "Cannot detach timeline {timeline_id} that is not available locally" - ); Ok(()) } - fn apply_timeline_remote_sync_status_update( - &self, - timeline_id: ZTimelineId, - timeline_sync_status_update: TimelineSyncStatusUpdate, - ) -> Result<()> { - debug!( - "apply_timeline_remote_sync_status_update timeline_id: {} update: {:?}", - timeline_id, timeline_sync_status_update - ); - match timeline_sync_status_update { - TimelineSyncStatusUpdate::Downloaded => { - match self.timelines.lock().unwrap().entry(timeline_id) { - Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."), - Entry::Vacant(entry) => { - // we need to get metadata of a timeline, another option is to pass it along with Downloaded status - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; - // finally we make newly downloaded timeline visible to repository - entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) - }, - }; - } - } + fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> { + debug!("attach timeline_id: {}", timeline_id,); + match self.timelines.lock().unwrap().entry(timeline_id) { + Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. 
This is a bug."), + Entry::Vacant(entry) => { + // we need to get metadata of a timeline, another option is to pass it along with Downloaded status + let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?; + // finally we make newly downloaded timeline visible to repository + entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, }) + }, + }; Ok(()) } @@ -486,6 +524,18 @@ impl LayeredTimelineEntry { } } } + + fn compaction_guard(&self) -> Result>, anyhow::Error> { + match self { + LayeredTimelineEntry::Loaded(timeline) => timeline + .compaction_cs + .try_lock() + .map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}")) + .map(Some), + + LayeredTimelineEntry::Unloaded { .. } => Ok(None), + } + } } impl From for RepositoryTimeline { @@ -557,6 +607,27 @@ impl LayeredRepository { .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } + pub fn get_wal_receiver_connect_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout) + } + + pub fn get_lagging_wal_timeout(&self) -> Duration { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout) + } + + pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag) + } + pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> { let mut tenant_conf = self.tenant_conf.write().unwrap(); @@ -663,6 +734,7 @@ impl LayeredRepository { ) -> LayeredRepository { LayeredRepository { tenant_id, + file_lock: RwLock::new(()), conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), @@ -800,13 +872,13 @@ impl 
LayeredRepository { // we do. fn gc_iteration_internal( &self, - target_timelineid: Option, + target_timeline_id: Option, horizon: u64, pitr: Duration, checkpoint_before_gc: bool, ) -> Result { let _span_guard = - info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid) + info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id) .entered(); let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -820,15 +892,21 @@ impl LayeredRepository { let mut timeline_ids = Vec::new(); let mut timelines = self.timelines.lock().unwrap(); + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; + for (timeline_id, timeline_entry) in timelines.iter() { timeline_ids.push(*timeline_id); - // This is unresolved question for now, how to do gc in presense of remote timelines + // This is unresolved question for now, how to do gc in presence of remote timelines // especially when this is combined with branching. // Somewhat related: https://github.com/zenithdb/zenith/issues/999 if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() { // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timelineid) = target_timelineid { + if let Some(timelineid) = target_timeline_id { if ancestor_timeline_id == &timelineid { all_branchpoints .insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn())); @@ -843,7 +921,7 @@ impl LayeredRepository { // Ok, we now know all the branch points. // Perform GC for each timeline. - for timelineid in timeline_ids.into_iter() { + for timeline_id in timeline_ids.into_iter() { if thread_mgr::is_shutdown_requested() { // We were requested to shut down. Stop and return with the progress we // made. @@ -852,12 +930,12 @@ impl LayeredRepository { // Timeline is known to be local and loaded. 
let timeline = self - .get_timeline_load_internal(timelineid, &mut *timelines)? + .get_timeline_load_internal(timeline_id, &mut *timelines)? .expect("checked above that timeline is local and loaded"); // If target_timeline is specified, only GC it - if let Some(target_timelineid) = target_timelineid { - if timelineid != target_timelineid { + if let Some(target_timelineid) = target_timeline_id { + if timeline_id != target_timelineid { continue; } } @@ -866,8 +944,8 @@ impl LayeredRepository { drop(timelines); let branchpoints: Vec = all_branchpoints .range(( - Included((timelineid, Lsn(0))), - Included((timelineid, Lsn(u64::MAX))), + Included((timeline_id, Lsn(0))), + Included((timeline_id, Lsn(u64::MAX))), )) .map(|&x| x.1) .collect(); @@ -877,7 +955,7 @@ impl LayeredRepository { // used in tests, so we want as deterministic results as possible. if checkpoint_before_gc { timeline.checkpoint(CheckpointConfig::Forced)?; - info!("timeline {} checkpoint_before_gc done", timelineid); + info!("timeline {} checkpoint_before_gc done", timeline_id); } timeline.update_gc_info(branchpoints, cutoff, pitr); let result = timeline.gc()?; @@ -1230,7 +1308,7 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), - last_freeze_at: AtomicLsn::new(0), + last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), @@ -1562,7 +1640,7 @@ impl LayeredTimeline { Ok(layer) } - fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn)?; layer.put_value(key, lsn, val)?; @@ -1690,26 +1768,28 @@ impl LayeredTimeline { /// Flush one frozen in-memory layer to disk, as a new delta layer. 
fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - let new_delta = frozen_layer.write_to_disk()?; - let new_delta_path = new_delta.path(); + // As a special case, when we have just imported an image into the repository, + // instead of writing out a L0 delta layer, we directly write out image layer + // files instead. This is possible as long as *all* the data imported into the + // repository have the same LSN. + let lsn_range = frozen_layer.get_lsn_range(); + let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn + && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) + { + let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?; + let (partitioning, _lsn) = + pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?; + self.create_image_layers(&partitioning, self.initdb_lsn, true)? + } else { + // normal case, write out a L0 delta layer file. + let delta_path = self.create_delta_layer(&frozen_layer)?; + HashSet::from([delta_path]) + }; - // Sync the new layer to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, it might be better to first write them all, and then fsync - // them all in parallel. - par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - fail_point!("checkpoint-before-sync"); + fail_point!("flush-frozen-before-sync"); - fail_point!("flush-frozen"); - - // Finally, replace the frozen in-memory layer with the new on-disk layer + // The new on-disk layers are now in the layer map. We can remove the + // in-memory layer from the map now. { let mut layers = self.layers.write().unwrap(); let l = layers.frozen_layers.pop_front(); @@ -1719,19 +1799,27 @@ impl LayeredTimeline { // layer to disk at the same time, that would not work. 
assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - // Add the new delta layer to the LayerMap - layers.insert_historic(Arc::new(new_delta)); - // release lock on 'layers' } + fail_point!("checkpoint-after-sync"); + // Update the metadata file, with new 'disk_consistent_lsn' // // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing // *all* the layers, to avoid fsyncing the file multiple times. - let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); - fail_point!("checkpoint-after-sync"); + let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); + self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?; + Ok(()) + } + + /// Update metadata file + fn update_disk_consistent_lsn( + &self, + disk_consistent_lsn: Lsn, + layer_paths_to_upload: HashSet, + ) -> Result<()> { // If we were able to advance 'disk_consistent_lsn', save it the metadata file. // After crash, we will restart WAL streaming and processing from that point. let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); @@ -1781,14 +1869,11 @@ impl LayeredTimeline { false, )?; - NUM_PERSISTENT_FILES_CREATED.inc_by(1); - PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); - if self.upload_layers.load(atomic::Ordering::Relaxed) { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, - HashSet::from([new_delta_path]), + layer_paths_to_upload, Some(metadata), ); } @@ -1800,6 +1885,37 @@ impl LayeredTimeline { Ok(()) } + // Write out the given frozen in-memory layer as a new L0 delta file + fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result { + // Write it out + let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync it to disk. 
+ // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. + par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timeline_id, &self.tenant_id), + ])?; + + // Add it to the layer map + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(Arc::new(new_delta)); + } + + NUM_PERSISTENT_FILES_CREATED.inc_by(1); + PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len()); + + Ok(new_delta_path) + } + pub fn compact(&self) -> Result<()> { // // High level strategy for compaction / image creation: @@ -1831,7 +1947,7 @@ impl LayeredTimeline { // collect any page versions that are no longer needed because // of the new image layers we created in step 2. // - // TODO: This hight level strategy hasn't been implemented yet. + // TODO: This high level strategy hasn't been implemented yet. // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. @@ -1843,29 +1959,23 @@ impl LayeredTimeline { if let Ok(pgdir) = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id) { + // 2. Create new image layers for partitions that have been modified + // "enough". let (partitioning, lsn) = pgdir.repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), )?; - let timer = self.create_images_time_histo.start_timer(); - // 2. Create new image layers for partitions that have been modified - // "enough". - let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len()); - for part in partitioning.parts.iter() { - if self.time_for_new_image_layer(part, lsn)? 
{ - let new_path = self.create_image_layer(part, lsn)?; - layer_paths_to_upload.insert(new_path); - } - } - if self.upload_layers.load(atomic::Ordering::Relaxed) { + let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + if !layer_paths_to_upload.is_empty() + && self.upload_layers.load(atomic::Ordering::Relaxed) + { storage_sync::schedule_layer_upload( self.tenant_id, self.timeline_id, - layer_paths_to_upload, + HashSet::from_iter(layer_paths_to_upload), None, ); } - timer.stop_and_record(); // 3. Compact let timer = self.compact_time_histo.start_timer(); @@ -1890,15 +2000,28 @@ impl LayeredTimeline { } else { Lsn(0) }; + // Let's consider an example: + // + // delta layer with LSN range 71-81 + // delta layer with LSN range 81-91 + // delta layer with LSN range 91-101 + // image layer at LSN 100 + // + // If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer, + // there's no need to create a new one. We check this case explicitly, to avoid passing + // a bogus range to count_deltas below, with start > end. It's even possible that there + // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed + // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
+ if img_lsn < lsn { + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; - - debug!( - "range {}-{}, has {} deltas on this timeline", - img_range.start, img_range.end, num_deltas - ); - if num_deltas >= self.get_image_creation_threshold() { - return Ok(true); + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); + if num_deltas >= self.get_image_creation_threshold() { + return Ok(true); + } } } } @@ -1906,21 +2029,40 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - let mut image_layer_writer = - ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?; + fn create_image_layers( + &self, + partitioning: &KeyPartitioning, + lsn: Lsn, + force: bool, + ) -> Result> { + let timer = self.create_images_time_histo.start_timer(); + let mut image_layers: Vec = Vec::new(); + let mut layer_paths_to_upload = HashSet::new(); + for partition in partitioning.parts.iter() { + if force || self.time_for_new_image_layer(partition, lsn)? 
{ + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + &img_range, + lsn, + )?; - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - let img = self.get(key, lsn)?; - image_layer_writer.put_image(key, &img)?; - key = key.next(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + layer_paths_to_upload.insert(image_layer.path()); + image_layers.push(image_layer); } } - let image_layer = image_layer_writer.finish()?; // Sync the new layer to disk before adding it to the layer map, to make sure // we don't garbage collect something based on the new layer, before it has @@ -1931,19 +2073,18 @@ impl LayeredTimeline { // // Compaction creates multiple image layers. It would be better to create them all // and fsync them all in parallel. - par_fsync::par_fsync(&[ - image_layer.path(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; - - // FIXME: Do we need to do something to upload it to remote storage here? 
+ let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone()); + all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); + par_fsync::par_fsync(&all_paths)?; let mut layers = self.layers.write().unwrap(); - let new_path = image_layer.path(); - layers.insert_historic(Arc::new(image_layer)); + for l in image_layers { + layers.insert_historic(Arc::new(l)); + } drop(layers); + timer.stop_and_record(); - Ok(new_path) + Ok(layer_paths_to_upload) } /// @@ -2190,6 +2331,9 @@ impl LayeredTimeline { LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); } + LsnForTimestamp::NoData(lsn) => { + debug!("nodata({})", lsn); + } } debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) } @@ -2268,7 +2412,7 @@ impl LayeredTimeline { } // 3. Is it needed by a child branch? - // NOTE With that wee would keep data that + // NOTE With that we would keep data that // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. 
@@ -2463,7 +2607,7 @@ impl Deref for LayeredTimelineWriter<'_> { } impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> { + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> { self.tl.put_value(key, lsn, value) } @@ -2518,7 +2662,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { bail!("couldn't find an unused backup number for {:?}", path) } -fn load_metadata( +pub fn load_metadata( conf: &'static PageServerConf, timeline_id: ZTimelineId, tenant_id: ZTenantId, @@ -2605,7 +2749,7 @@ pub mod tests { let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -2613,7 +2757,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -2621,7 +2765,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; + writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -2629,7 +2773,7 @@ pub mod tests { tline.compact()?; let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; + writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -2667,7 +2811,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); drop(writer); @@ -2713,7 
+2857,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); updated[blknum] = lsn; @@ -2731,7 +2875,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); drop(writer); @@ -2783,7 +2927,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.finish_write(lsn); updated[blknum] = lsn; @@ -2795,7 +2939,7 @@ pub mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, lsn)?; + repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = repo.get_timeline_load(new_tline_id)?; tline_id = new_tline_id; @@ -2807,7 +2951,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; println!("updating {} at {}", blknum, lsn); writer.finish_write(lsn); @@ -2854,7 +2998,7 @@ pub mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = ZTimelineId::generate(); - repo.branch_timeline(tline_id, new_tline_id, lsn)?; + repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?; tline = repo.get_timeline_load(new_tline_id)?; tline_id = new_tline_id; @@ -2866,7 +3010,7 @@ pub mod tests { writer.put( test_key, lsn, - Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), )?; println!("updating [{}][{}] at {}", idx, blknum, lsn); writer.finish_write(lsn); diff --git a/pageserver/src/layered_repository/blob_io.rs 
b/pageserver/src/layered_repository/blob_io.rs index 3aeeb2b2c8..a4c6186056 100644 --- a/pageserver/src/layered_repository/blob_io.rs +++ b/pageserver/src/layered_repository/blob_io.rs @@ -34,7 +34,7 @@ pub trait BlobCursor { ) -> Result<(), std::io::Error>; } -impl<'a, R> BlobCursor for BlockCursor +impl BlobCursor for BlockCursor where R: BlockReader, { diff --git a/pageserver/src/layered_repository/disk_btree.rs b/pageserver/src/layered_repository/disk_btree.rs index 0c9ad75048..dc8d7a2ad3 100644 --- a/pageserver/src/layered_repository/disk_btree.rs +++ b/pageserver/src/layered_repository/disk_btree.rs @@ -7,7 +7,7 @@ //! - Fixed-width keys //! - Fixed-width values (VALUE_SZ) //! - The tree is created in a bulk operation. Insert/deletion after creation -//! is not suppported +//! is not supported //! - page-oriented //! //! TODO: @@ -498,8 +498,8 @@ where return Ok(()); } - // It did not fit. Try to compress, and it it succeeds to make some room - // on the node, try appending to it again. + // It did not fit. Try to compress, and if it succeeds to make + // some room on the node, try appending to it again. 
#[allow(clippy::collapsible_if)] if last.compress() { if last.push(key, value) { diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 905023ecf9..bb24553afd 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -445,7 +445,10 @@ impl ImageLayerWriter { }, ); info!("new image layer {}", path.display()); - let mut file = VirtualFile::create(&path)?; + let mut file = VirtualFile::open_with_options( + &path, + std::fs::OpenOptions::new().write(true).create_new(true), + )?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64))?; let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64); diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index bffb946f7e..87e6877520 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -267,13 +267,13 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree - pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); - let off = inner.file.write_blob(&Value::ser(&val)?)?; + let off = inner.file.write_blob(&Value::ser(val)?)?; let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index fdce0e5c5f..c9c00d75e2 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,7 +13,7 @@ pub mod repository; pub mod storage_sync; pub mod tenant_config; pub mod tenant_mgr; -pub mod tenant_threads; +pub mod tenant_tasks; pub mod thread_mgr; pub mod timelines; pub mod virtual_file; @@ -24,7 +24,6 @@ pub mod walredo; use lazy_static::lazy_static; use tracing::info; -use utils::postgres_backend; use crate::thread_mgr::ThreadKind; use metrics::{register_int_gauge_vec, IntGaugeVec}; @@ -73,7 +72,6 @@ pub fn shutdown_pageserver(exit_code: i32) { thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None); // Shut down any page service threads. - postgres_backend::set_pgbackend_shutdown_requested(); thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None); // Shut down all the tenants. This flushes everything to disk and kills diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 0c179b95c5..716df0f749 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -20,7 +20,7 @@ //! assign a buffer for a page, you must hold the mapping lock and the lock on //! the slot at the same time. //! -//! Whenever you need to hold both locks simultenously, the slot lock must be +//! Whenever you need to hold both locks simultaneously, the slot lock must be //! 
acquired first. This consistent ordering avoids deadlocks. To look up a page //! in the cache, you would first look up the mapping, while holding the mapping //! lock, and then lock the slot. You must release the mapping lock in between, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 03264c9782..078edc5c9f 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,14 +7,13 @@ // *status* -- show actual info about this pageserver, // *pagestream* -- enter mode where smgr and pageserver talk with their // custom protocol. -// *callmemaybe $url* -- ask pageserver to start walreceiver on $url // use anyhow::{bail, ensure, Context, Result}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use lazy_static::lazy_static; use regex::Regex; -use std::io; +use std::io::{self, Read}; use std::net::TcpListener; use std::str; use std::str::FromStr; @@ -30,6 +29,8 @@ use utils::{ use crate::basebackup; use crate::config::{PageServerConf, ProfilingConfig}; +use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar}; +use crate::layered_repository::LayeredRepository; use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp}; use crate::profiling::profpoint_start; use crate::reltag::RelTag; @@ -38,7 +39,6 @@ use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::walreceiver; use crate::CheckpointConfig; use metrics::{register_histogram_vec, HistogramVec}; use postgres_ffi::xlog_utils::to_pg_timestamp; @@ -202,6 +202,96 @@ impl PagestreamBeMessage { } } +/// Implements Read for the server side of CopyIn +struct CopyInReader<'a> { + pgb: &'a mut PostgresBackend, + + /// Overflow buffer for bytes sent in CopyData messages + /// that the reader (caller of read) hasn't asked for yet. + /// TODO use BytesMut? + buf: Vec, + + /// Bytes before `buf_begin` are considered as dropped. + /// This allows us to implement O(1) pop_front on Vec. 
+ /// The Vec won't grow large because we only add to it + /// when it's empty. + buf_begin: usize, +} + +impl<'a> CopyInReader<'a> { + // NOTE: pgb should be in copy in state already + fn new(pgb: &'a mut PostgresBackend) -> Self { + Self { + pgb, + buf: Vec::<_>::new(), + buf_begin: 0, + } + } +} + +impl<'a> Drop for CopyInReader<'a> { + fn drop(&mut self) { + // Finalize copy protocol so that self.pgb can be reused + // TODO instead, maybe take ownership of pgb and give it back at the end + let mut buf: Vec = vec![]; + let _ = self.read_to_end(&mut buf); + } +} + +impl<'a> Read for CopyInReader<'a> { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + while !thread_mgr::is_shutdown_requested() { + // Return from buffer if nonempty + if self.buf_begin < self.buf.len() { + let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin); + buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]); + self.buf_begin += bytes_to_read; + return Ok(bytes_to_read); + } + + // Delete garbage + self.buf.clear(); + self.buf_begin = 0; + + // Wait for client to send CopyData bytes + match self.pgb.read_message() { + Ok(Some(message)) => { + let copy_data_bytes = match message { + FeMessage::CopyData(bytes) => bytes, + FeMessage::CopyDone => return Ok(0), + FeMessage::Sync => continue, + m => { + let msg = format!("unexpected message {:?}", m); + self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?; + return Err(io::Error::new(io::ErrorKind::Other, msg)); + } + }; + + // Return as much as we can, saving the rest in self.buf + let mut reader = copy_data_bytes.reader(); + let bytes_read = reader.read(buf)?; + reader.read_to_end(&mut self.buf)?; + return Ok(bytes_read); + } + Ok(None) => { + let msg = "client closed connection"; + self.pgb.write_message(&BeMessage::ErrorResponse(msg))?; + return Err(io::Error::new(io::ErrorKind::Other, msg)); + } + Err(e) => { + if !is_socket_read_timed_out(&e) { + return 
Err(io::Error::new(io::ErrorKind::Other, e)); + } + } + } + } + + // Shutting down + let msg = "Importer thread was shut down"; + Err(io::Error::new(io::ErrorKind::Other, msg)) + } +} + /////////////////////////////////////////////////////////////////////////////// /// @@ -305,7 +395,29 @@ fn page_service_conn_main( let mut conn_handler = PageServerHandler::new(conf, auth); let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?; - pgbackend.run(&mut conn_handler) + match pgbackend.run(&mut conn_handler) { + Ok(()) => { + // we've been requested to shut down + Ok(()) + } + Err(err) => { + let root_cause_io_err_kind = err + .root_cause() + .downcast_ref::() + .map(|e| e.kind()); + + // `ConnectionReset` error happens when the Postgres client closes the connection. + // As this disconnection happens quite often and is expected, + // we decided to downgrade the logging level to `INFO`. + // See: https://github.com/neondatabase/neon/issues/1683. + if root_cause_io_err_kind == Some(io::ErrorKind::ConnectionReset) { + info!("Postgres client disconnected"); + Ok(()) + } else { + Err(err) + } + } + } } #[derive(Debug)] @@ -350,6 +462,10 @@ impl PageServerHandler { ) -> anyhow::Result<()> { let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered(); + // NOTE: pagerequests handler exits when connection is closed, + // so there is no need to reset the association + thread_mgr::associate_with(Some(tenantid), Some(timelineid)); + // Check that the timeline exists let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) .context("Cannot load local timeline")?; @@ -423,6 +539,98 @@ impl PageServerHandler { Ok(()) } + fn handle_import_basebackup( + &self, + pgb: &mut PostgresBackend, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + base_lsn: Lsn, + _end_lsn: Lsn, + ) -> anyhow::Result<()> { + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + let _enter = + info_span!("import basebackup", 
timeline = %timeline_id, tenant = %tenant_id).entered(); + + // Create empty timeline + info!("creating new timeline"); + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?; + let repartition_distance = repo.get_checkpoint_distance(); + let mut datadir_timeline = + DatadirTimeline::::new(timeline, repartition_distance); + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + pgb.write_message(&BeMessage::CopyInResponse)?; + let reader = CopyInReader::new(pgb); + import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
+ + // Flush data to disk, then upload to s3 + info!("flushing layers"); + datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + + info!("done"); + Ok(()) + } + + fn handle_import_wal( + &self, + pgb: &mut PostgresBackend, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + ) -> anyhow::Result<()> { + thread_mgr::associate_with(Some(tenant_id), Some(timeline_id)); + let _enter = + info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered(); + + let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; + let timeline = repo.get_timeline_load(timeline_id)?; + ensure!(timeline.get_last_record_lsn() == start_lsn); + + let repartition_distance = repo.get_checkpoint_distance(); + let mut datadir_timeline = + DatadirTimeline::::new(timeline, repartition_distance); + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + pgb.write_message(&BeMessage::CopyInResponse)?; + let reader = CopyInReader::new(pgb); + import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?; + + // TODO Does it make sense to overshoot? + ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn); + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?; + + info!("done"); + Ok(()) + } + /// Helper function to handle the LSN from client request. 
/// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -525,17 +733,10 @@ impl PageServerHandler { let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - let mut total_blocks: i64 = 0; + let total_blocks = + timeline.get_db_size(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?; - for rel in all_rels { - if rel.forknum == 0 { - let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0); - total_blocks += n_blocks as i64; - } - } - - let db_size = total_blocks * pg_constants::BLCKSZ as i64; + let db_size = total_blocks as i64 * pg_constants::BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { db_size, @@ -571,7 +772,9 @@ impl PageServerHandler { pgb: &mut PostgresBackend, timelineid: ZTimelineId, lsn: Option, + prev_lsn: Option, tenantid: ZTenantId, + full_backup: bool, ) -> anyhow::Result<()> { let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty); let _enter = span.enter(); @@ -593,7 +796,9 @@ impl PageServerHandler { /* Send a tarball of the latest layer on the timeline */ { let mut writer = CopyDataSink { pgb }; - let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?; + + let basebackup = + basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; span.record("lsn", &basebackup.lsn.to_string().as_str()); basebackup.send_tarball()?; } @@ -611,7 +816,7 @@ impl PageServerHandler { return Ok(()); } // auth is some, just checked above, when auth is some - // then claims are always present because of checks during connetion init + // then claims are always present because of checks during connection init // so this expect won't trigger let claims = self .claims @@ -651,6 +856,10 @@ impl postgres_backend::Handler for 
PageServerHandler { Ok(()) } + fn is_shutdown_requested(&self) -> bool { + thread_mgr::is_shutdown_requested() + } + fn process_query( &mut self, pgb: &mut PostgresBackend, @@ -692,32 +901,119 @@ impl postgres_backend::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?; + self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("callmemaybe ") { - // callmemaybe - // TODO lazy static - let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap(); - let caps = re - .captures(query_string) - .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?; + } + // return pair of prev_lsn and last_lsn + else if query_string.starts_with("get_last_record_rlsn ") { + let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); + let params = params_raw.split_whitespace().collect::>(); - let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?; - let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?; - let connstr = caps.get(3).unwrap().as_str().to_owned(); + ensure!( + params.len() == 2, + "invalid param number for get_last_record_rlsn command" + ); + + let tenantid = ZTenantId::from_str(params[0])?; + let timelineid = ZTimelineId::from_str(params[1])?; + + self.check_permission(Some(tenantid))?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) + .context("Cannot load local timeline")?; + + let end_of_timeline = timeline.tline.get_last_record_rlsn(); + + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? 
+ .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; + } + // same as basebackup, but result includes relational data as well + else if query_string.starts_with("fullbackup ") { + let (_, params_raw) = query_string.split_at("fullbackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + + ensure!( + params.len() >= 2, + "invalid param number for fullbackup command" + ); + + let tenantid = ZTenantId::from_str(params[0])?; + let timelineid = ZTimelineId::from_str(params[1])?; + + // The caller is responsible for providing correct lsn and prev_lsn. + let lsn = if params.len() > 2 { + Some(Lsn::from_str(params[2])?) + } else { + None + }; + let prev_lsn = if params.len() > 3 { + Some(Lsn::from_str(params[3])?) + } else { + None + }; self.check_permission(Some(tenantid))?; - let _enter = - info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered(); - // Check that the timeline exists - tenant_mgr::get_local_timeline_with_load(tenantid, timelineid) - .context("Cannot load local timeline")?; - - walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?; - + self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + } else if query_string.starts_with("import basebackup ") { + // Import the `base` section (everything but the wal) of a basebackup. + // Assumes the tenant already exists on this pageserver. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + // + // Example import command: + // 1. Get start/end LSN from backup_manifest file + // 2. 
Run: + // cat my_backup/base.tar | psql -h $PAGESERVER \ + // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN" + let (_, params_raw) = query_string.split_at("import basebackup ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 4); + let tenant = ZTenantId::from_str(params[0])?; + let timeline = ZTimelineId::from_str(params[1])?; + let base_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + + self.check_permission(Some(tenant))?; + + match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? + } + }; + } else if query_string.starts_with("import wal ") { + // Import the `pg_wal` section of a basebackup. + // + // Files are scheduled to be persisted to remote storage, and the + // caller should poll the http api to check when that is done. + let (_, params_raw) = query_string.split_at("import wal ".len()); + let params = params_raw.split_whitespace().collect::>(); + ensure!(params.len() == 4); + let tenant = ZTenantId::from_str(params[0])?; + let timeline = ZTimelineId::from_str(params[1])?; + let start_lsn = Lsn::from_str(params[2])?; + let end_lsn = Lsn::from_str(params[3])?; + + self.check_permission(Some(tenant))?; + + match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))? 
+ } + }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect @@ -805,7 +1101,6 @@ impl postgres_backend::Handler for PageServerHandler { .map(|h| h.as_str().parse()) .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?; - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; // Use tenant's pitr setting let pitr = repo.get_pitr_interval(); let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?; @@ -898,6 +1193,7 @@ impl postgres_backend::Handler for PageServerHandler { LsnForTimestamp::Present(lsn) => format!("{}", lsn), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), + LsnForTimestamp::NoData(_lsn) => "nodata".into(), }; pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c052aa3d69..f696c1f411 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -51,6 +51,7 @@ pub enum LsnForTimestamp { Present(Lsn), Future(Lsn), Past(Lsn), + NoData(Lsn), } impl DatadirTimeline { @@ -123,6 +124,19 @@ impl DatadirTimeline { self.tline.get(key, lsn) } + // Get size of a database in blocks + pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + let mut total_blocks = 0; + + let rels = self.list_rels(spcnode, dbnode, lsn)?; + + for rel in rels { + let n_blocks = self.get_rel_size(rel, lsn)?; + total_blocks += n_blocks as usize; + } + Ok(total_blocks) + } + /// Get size of a relation file pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { ensure!(tag.relnode != 0, "invalid relnode"); @@ -250,7 +264,7 @@ impl DatadirTimeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. 
- bail!("no commit timestamps found"); + Ok(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request @@ -521,7 +535,7 @@ pub struct DatadirModification<'a, R: Repository> { lsn: Lsn, - // The modifications are not applied directly to the underyling key-value store. + // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_updates: HashMap, @@ -667,6 +681,10 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + let req_lsn = self.tline.get_last_record_lsn(); + + let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?; + // Remove entry from dbdir let buf = self.get(DBDIR_KEY)?; let mut dir = DbDirectory::des(&buf)?; @@ -680,7 +698,8 @@ impl<'a, R: Repository> DatadirModification<'a, R> { ); } - // FIXME: update pending_nblocks + // Update logical database size. + self.pending_nblocks -= total_blocks as isize; // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); @@ -749,6 +768,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { } /// Extend relation + /// If new size is smaller, do nothing. pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { ensure!(rel.relnode != 0, "invalid relnode"); @@ -756,10 +776,13 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let size_key = rel_size_to_key(rel); let old_size = self.get(size_key)?.get_u32_le(); - let buf = nblocks.to_le_bytes(); - self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + // only extend relation here. 
never decrease the size + if nblocks > old_size { + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - self.pending_nblocks += nblocks as isize - old_size as isize; + self.pending_nblocks += nblocks as isize - old_size as isize; + } Ok(()) } @@ -879,6 +902,57 @@ impl<'a, R: Repository> DatadirModification<'a, R> { Ok(()) } + /// + /// Flush changes accumulated so far to the underlying repository. + /// + /// Usually, changes made in DatadirModification are atomic, but this allows + /// you to flush them to the underlying repository before the final `commit`. + /// That allows to free up the memory used to hold the pending changes. + /// + /// Currently only used during bulk import of a data directory. In that + /// context, breaking the atomicity is OK. If the import is interrupted, the + /// whole import fails and the timeline will be deleted anyway. + /// (Or to be precise, it will be left behind for debugging purposes and + /// ignored, see https://github.com/neondatabase/neon/pull/1809) + /// + /// Note: A consequence of flushing the pending operations is that they + /// won't be visible to subsequent operations until `commit`. The function + /// retains all the metadata, but data pages are flushed. That's again OK + /// for bulk import, where you are just loading data pages and won't try to + /// modify the same pages twice. + pub fn flush(&mut self) -> Result<()> { + // Unless we have accumulated a decent amount of changes, it's not worth it + // to scan through the pending_updates list. + let pending_nblocks = self.pending_nblocks; + if pending_nblocks < 10000 { + return Ok(()); + } + + let writer = self.tline.tline.writer(); + + // Flush relation and SLRU data blocks, keep metadata. 
+ let mut result: Result<()> = Ok(()); + self.pending_updates.retain(|&key, value| { + if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { + result = writer.put(key, self.lsn, value); + false + } else { + true + } + }); + result?; + + if pending_nblocks != 0 { + self.tline.current_logical_size.fetch_add( + pending_nblocks * pg_constants::BLCKSZ as isize, + Ordering::SeqCst, + ); + self.pending_nblocks = 0; + } + + Ok(()) + } + /// /// Finish this atomic update, writing all the updated keys to the /// underlying timeline. @@ -889,7 +963,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> { let pending_nblocks = self.pending_nblocks; for (key, value) in self.pending_updates { - writer.put(key, self.lsn, value)?; + writer.put(key, self.lsn, &value)?; } for key_range in self.pending_deletions { writer.delete(key_range.clone(), self.lsn)?; @@ -1294,6 +1368,10 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { }) } +fn is_rel_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 +} + pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { @@ -1312,6 +1390,12 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { }) } +fn is_slru_block_key(key: Key) -> bool { + key.field1 == 0x01 // SLRU-related + && key.field3 == 0x00000001 // but not SlruDir + && key.field6 != 0xffffffff // and not SlruSegSize +} + // //-- Tests that should work the same with any Repository/Timeline implementation. 
// diff --git a/pageserver/src/profiling.rs b/pageserver/src/profiling.rs index 84132659d6..ad896cfa30 100644 --- a/pageserver/src/profiling.rs +++ b/pageserver/src/profiling.rs @@ -81,6 +81,12 @@ mod profiling_impl { pub struct DummyProfilerGuard; + impl Drop for DummyProfilerGuard { + fn drop(&mut self) { + // do nothing, this exists to calm Clippy down + } + } + pub fn profpoint_start( _conf: &PageServerConf, _point: ProfilingConfig, diff --git a/pageserver/src/reltag.rs b/pageserver/src/reltag.rs index 18e26cc37a..fadd41f547 100644 --- a/pageserver/src/reltag.rs +++ b/pageserver/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::Oid; +use postgres_ffi::{pg_constants, Oid}; /// /// Relation data file segment id throughout the Postgres cluster. @@ -75,6 +75,30 @@ impl fmt::Display for RelTag { } } +impl RelTag { + pub fn to_segfile_name(&self, segno: u32) -> String { + let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID { + "global/".to_string() + } else { + format!("base/{}/", self.dbnode) + }; + + name += &self.relnode.to_string(); + + if let Some(fork_name) = forknumber_to_name(self.forknum) { + name += "_"; + name += fork_name; + } + + if segno != 0 { + name += "."; + name += &segno.to_string(); + } + + name + } +} + /// /// Non-relation transaction status files (clog (a.k.a. pg_xact) and /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, diff --git a/pageserver/src/remote_storage/storage_sync/delete.rs b/pageserver/src/remote_storage/storage_sync/delete.rs deleted file mode 100644 index 00e7c85e35..0000000000 --- a/pageserver/src/remote_storage/storage_sync/delete.rs +++ /dev/null @@ -1,223 +0,0 @@ -//! Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. 
- -use anyhow::Context; -use futures::stream::{FuturesUnordered, StreamExt}; -use tracing::{debug, error, info}; -use utils::zid::ZTenantTimelineId; - -use crate::remote_storage::{ - storage_sync::{SyncQueue, SyncTask}, - RemoteStorage, -}; - -use super::{LayersDeletion, SyncData}; - -/// Attempts to remove the timleline layers from the remote storage. -/// If the task had not adjusted the metadata before, the deletion will fail. -pub(super) async fn delete_timeline_layers<'a, P, S>( - storage: &'a S, - sync_queue: &SyncQueue, - sync_id: ZTenantTimelineId, - mut delete_data: SyncData, -) -> bool -where - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - if !delete_data.data.deletion_registered { - error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing"); - delete_data.retries += 1; - sync_queue.push(sync_id, SyncTask::Delete(delete_data)); - return false; - } - - if delete_data.data.layers_to_delete.is_empty() { - info!("No layers to delete, skipping"); - return true; - } - - let layers_to_delete = delete_data - .data - .layers_to_delete - .drain() - .collect::>(); - debug!("Layers to delete: {layers_to_delete:?}"); - info!("Deleting {} timeline layers", layers_to_delete.len()); - - let mut delete_tasks = layers_to_delete - .into_iter() - .map(|local_layer_path| async { - let storage_path = match storage.storage_path(&local_layer_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - }) { - Ok(path) => path, - Err(e) => return Err((e, local_layer_path)), - }; - - match storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) - }) { - Ok(()) => Ok(local_layer_path), - Err(e) => Err((e, local_layer_path)), - } - }) - .collect::>(); - - let mut errored = false; - while let Some(deletion_result) = delete_tasks.next().await { 
- match deletion_result { - Ok(local_layer_path) => { - debug!( - "Successfully deleted layer {} for timeline {sync_id}", - local_layer_path.display() - ); - delete_data.data.deleted_layers.insert(local_layer_path); - } - Err((e, local_layer_path)) => { - errored = true; - error!( - "Failed to delete layer {} for timeline {sync_id}: {e:?}", - local_layer_path.display() - ); - delete_data.data.layers_to_delete.insert(local_layer_path); - } - } - } - - if errored { - debug!("Reenqueuing failed delete task for timeline {sync_id}"); - delete_data.retries += 1; - sync_queue.push(sync_id, SyncTask::Delete(delete_data)); - } - errored -} - -#[cfg(test)] -mod tests { - use std::{collections::HashSet, num::NonZeroUsize}; - - use itertools::Itertools; - use tempfile::tempdir; - use tokio::fs; - use utils::lsn::Lsn; - - use crate::{ - remote_storage::{ - storage_sync::test_utils::{create_local_timeline, dummy_metadata}, - LocalFs, - }, - repository::repo_harness::{RepoHarness, TIMELINE_ID}, - }; - - use super::*; - - #[tokio::test] - async fn delete_timeline_negative() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline_negative")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; - - let deleted = delete_timeline_layers( - &storage, - &sync_queue, - sync_id, - SyncData { - retries: 1, - data: LayersDeletion { - deleted_layers: HashSet::new(), - layers_to_delete: HashSet::new(), - deletion_registered: false, - }, - }, - ) - .await; - - assert!( - !deleted, - "Should not start the deletion for task with delete metadata unregistered" - ); - - Ok(()) - } - - #[tokio::test] - async fn delete_timeline() -> anyhow::Result<()> { - let harness = RepoHarness::create("delete_timeline")?; - let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap()); - - let sync_id = 
ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID); - let layer_files = ["a", "b", "c", "d"]; - let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?; - let current_retries = 3; - let metadata = dummy_metadata(Lsn(0x30)); - let local_timeline_path = harness.timeline_path(&TIMELINE_ID); - let timeline_upload = - create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?; - for local_path in timeline_upload.layers_to_upload { - let remote_path = storage.storage_path(&local_path)?; - let remote_parent_dir = remote_path.parent().unwrap(); - if !remote_parent_dir.exists() { - fs::create_dir_all(&remote_parent_dir).await?; - } - fs::copy(&local_path, &remote_path).await?; - } - assert_eq!( - storage - .list() - .await? - .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) - .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) - .sorted() - .collect::>(), - layer_files - .iter() - .map(|layer_str| layer_str.to_string()) - .sorted() - .collect::>(), - "Expect to have all layer files remotely before deletion" - ); - - let deleted = delete_timeline_layers( - &storage, - &sync_queue, - sync_id, - SyncData { - retries: current_retries, - data: LayersDeletion { - deleted_layers: HashSet::new(), - layers_to_delete: HashSet::from([ - local_timeline_path.join("a"), - local_timeline_path.join("c"), - local_timeline_path.join("something_different"), - ]), - deletion_registered: true, - }, - }, - ) - .await; - assert!(deleted, "Should be able to delete timeline files"); - - assert_eq!( - storage - .list() - .await? 
- .into_iter() - .map(|remote_path| storage.local_path(&remote_path).unwrap()) - .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) }) - .sorted() - .collect::>(), - vec!["b".to_string(), "d".to_string()], - "Expect to have only non-deleted files remotely" - ); - - Ok(()) - } -} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d25dc8914d..359c704e81 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -7,7 +7,6 @@ use byteorder::{ByteOrder, BE}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::fmt; -use std::fmt::Display; use std::ops::{AddAssign, Range}; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; @@ -19,7 +18,7 @@ use utils::{ #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)] /// Key used in the Repository kv-store. /// -/// The Repository treates this as an opaque struct, but see the code in pgdatadir_mapping.rs +/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs /// for what we actually store in these fields. pub struct Key { pub field1: u8, @@ -182,35 +181,18 @@ impl Value { } } -#[derive(Clone, Copy, Debug)] -pub enum TimelineSyncStatusUpdate { - Downloaded, -} - -impl Display for TimelineSyncStatusUpdate { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let s = match self { - TimelineSyncStatusUpdate::Downloaded => "Downloaded", - }; - f.write_str(s) - } -} /// -/// A repository corresponds to one .zenith directory. One repository holds multiple +/// A repository corresponds to one .neon directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { type Timeline: Timeline; /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization. 
/// See [`crate::remote_storage`] for more details about the synchronization. - fn apply_timeline_remote_sync_status_update( - &self, - timeline_id: ZTimelineId, - timeline_sync_status_update: TimelineSyncStatusUpdate, - ) -> Result<()>; + fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; /// Get Timeline handle for given zenith timeline ID. - /// This function is idempotent. It doesnt change internal state in any way. + /// This function is idempotent. It doesn't change internal state in any way. fn get_timeline(&self, timelineid: ZTimelineId) -> Option>; /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded. @@ -224,12 +206,17 @@ pub trait Repository: Send + Sync { /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. fn create_empty_timeline( &self, - timelineid: ZTimelineId, + timeline_id: ZTimelineId, initdb_lsn: Lsn, ) -> Result>; /// Branch a timeline - fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; + fn branch_timeline( + &self, + src: ZTimelineId, + dst: ZTimelineId, + start_lsn: Option, + ) -> Result<()>; /// Flush all data to disk. /// @@ -242,7 +229,7 @@ pub trait Repository: Send + Sync { /// /// 'timelineid' specifies the timeline to GC, or None for all. /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before CG + /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC /// to make tests more deterministic. /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? fn gc_iteration( @@ -259,10 +246,10 @@ pub trait Repository: Send + Sync { /// api's 'compact' command. fn compaction_iteration(&self) -> Result<()>; - /// detaches timeline-related in-memory data. 
- fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; + /// removes timeline-related in-memory data + fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>; - // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. + /// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn. fn get_remote_index(&self) -> &RemoteIndex; } @@ -345,11 +332,11 @@ pub trait Timeline: Send + Sync { /// Look up given page version. /// - /// NOTE: It is considerd an error to 'get' a key that doesn't exist. The abstraction + /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction /// above this needs to store suitable metadata to track what data exists with /// what keys, in separate metadata entries. If a non-existent key is requested, - /// the Repository implementation may incorrectly return a value from an ancestore - /// branch, for exampel, or waste a lot of cycles chasing the non-existing key. + /// the Repository implementation may incorrectly return a value from an ancestor + /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// fn get(&self, key: Key, lsn: Lsn) -> Result; @@ -406,7 +393,7 @@ pub trait TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. 
- fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>; + fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>; fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; @@ -469,6 +456,9 @@ pub mod repo_harness { gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), pitr_interval: Some(tenant_conf.pitr_interval), + walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout), + lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout), + max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), } } } @@ -533,7 +523,7 @@ pub mod repo_harness { TenantConfOpt::from(self.tenant_conf), walredo_mgr, self.tenant_id, - RemoteIndex::empty(), + RemoteIndex::default(), false, ); // populate repo with locally available timelines @@ -549,10 +539,7 @@ pub mod repo_harness { .parse() .unwrap(); - repo.apply_timeline_remote_sync_status_update( - timeline_id, - TimelineSyncStatusUpdate::Downloaded, - )?; + repo.attach_timeline(timeline_id)?; } Ok(repo) @@ -616,12 +603,12 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); let writer = tline.writer(); - writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -632,6 +619,19 @@ mod tests { Ok(()) } + #[test] + fn no_duplicate_timelines() -> Result<()> { + let repo = RepoHarness::create("no_duplicate_timelines")?.load(); + let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) { + Ok(_) => panic!("duplicate timeline creation should fail"), + Err(e) => assert_eq!(e.to_string(), "Timeline already 
exists"), + } + + Ok(()) + } + /// Convenience function to create a page image with given string as the only content pub fn test_value(s: &str) -> Value { let mut buf = BytesMut::new(); @@ -655,24 +655,24 @@ mod tests { let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); // Insert a value on the timeline - writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?; - writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?; + writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; writer.finish_write(Lsn(0x20)); - writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?; + writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; writer.finish_write(Lsn(0x30)); - writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?; + writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; writer.finish_write(Lsn(0x40)); //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); let new_writer = newtline.writer(); - new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?; + new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; new_writer.finish_write(Lsn(0x40)); // Check page contents on both branches @@ -703,14 +703,14 @@ mod tests { writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; @@ -721,14 +721,14 @@ mod tests 
{ writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, - Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; writer.finish_write(lsn); } @@ -749,7 +749,7 @@ mod tests { repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -770,7 +770,7 @@ mod tests { repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -815,7 +815,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ -831,7 +831,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) .expect("Should have a local timeline"); @@ 
-889,7 +889,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20))?; tline.checkpoint(CheckpointConfig::Forced)?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; let newtline = repo .get_timeline_load(NEW_TIMELINE_ID) diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index bbebcd1f36..ac5fb0bc8c 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -69,7 +69,7 @@ //! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexPart`], containing the list of remote files. //! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download. //! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`], -//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files. +//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrieve its shard contents, if needed, same as any layer files. //! //! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed. //! Bulk index data download happens only initially, on pageserver startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only, @@ -96,7 +96,7 @@ //! timeline uploads and downloads can happen concurrently, in no particular order due to incremental nature of the timeline layers. //! Deletion happens only after a successful upload only, otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors. //! 
Upload and download update the remote data (inmemory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task -//! does otherwise: it requires to have the remote data updated first succesfully: blob files will be invisible to pageserver this way. +//! does otherwise: it requires to have the remote data updated first successfully: blob files will be invisible to pageserver this way. //! //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines, //! present locally. @@ -178,20 +178,20 @@ use crate::{ metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}, LayeredRepository, }, - repository::TimelineSyncStatusUpdate, storage_sync::{self, index::RemoteIndex}, - tenant_mgr::apply_timeline_sync_status_updates, + tenant_mgr::attach_downloaded_tenants, thread_mgr, thread_mgr::ThreadKind, }; use metrics::{ - register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter, - IntGauge, + register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge, + HistogramVec, IntCounter, IntCounterVec, IntGauge, }; use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; -pub use self::download::download_index_part; +use self::download::download_index_parts; +pub use self::download::gather_tenant_timelines_index_parts; pub use self::download::TEMP_DOWNLOAD_EXTENSION; lazy_static! { @@ -208,14 +208,17 @@ lazy_static! { static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. 
\ - Grouped by `operation_kind` (upload|download) and `status` (success|failure)", - &["operation_kind", "status"], - vec![ - 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0, 6.0, 7.0, - 8.0, 9.0, 10.0, 12.5, 15.0, 17.5, 20.0 - ] + Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)", + &["tenant_id", "timeline_id", "operation_kind", "status"], + vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0] ) .expect("failed to register pageserver image sync time histogram vec"); + static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!( + "pageserver_remote_storage_remote_index_uploads_total", + "Number of remote index uploads", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver remote index upload vec"); } static SYNC_QUEUE: OnceCell = OnceCell::new(); @@ -298,7 +301,7 @@ pub fn start_local_timeline_sync( } Ok(SyncStartupData { local_timeline_init_statuses, - remote_index: RemoteIndex::empty(), + remote_index: RemoteIndex::default(), }) } } @@ -440,7 +443,7 @@ fn collect_timeline_files( // initial collect will fail because there is no metadata. // We either need to start download if we see empty dir after restart or attach caller should // be aware of that and retry attach if awaits_download for timeline switched from true to false - // but timelinne didnt appear locally. + // but timelinne didn't appear locally. // Check what happens with remote index in that case. 
let timeline_metadata_path = match timeline_metadata_path { Some(path) => path, @@ -832,7 +835,7 @@ where .build() .context("Failed to create storage sync runtime")?; - let applicable_index_parts = runtime.block_on(try_fetch_index_parts( + let applicable_index_parts = runtime.block_on(download_index_parts( conf, &storage, local_timeline_files.keys().copied().collect(), @@ -892,7 +895,7 @@ fn storage_sync_loop( REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64); if remaining_queue_length > 0 || !batched_tasks.is_empty() { - info!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); + debug!("Processing tasks for {} timelines in batch, more tasks left to process: {remaining_queue_length}", batched_tasks.len()); } else { debug!("No tasks to process"); continue; @@ -915,16 +918,48 @@ fn storage_sync_loop( }); match loop_step { - ControlFlow::Continue(new_timeline_states) => { - if new_timeline_states.is_empty() { - debug!("Sync loop step completed, no new timeline states"); + ControlFlow::Continue(updated_tenants) => { + if updated_tenants.is_empty() { + debug!("Sync loop step completed, no new tenant states"); } else { info!( - "Sync loop step completed, {} new timeline state update(s)", - new_timeline_states.len() + "Sync loop step completed, {} new tenant state update(s)", + updated_tenants.len() ); + let mut sync_status_updates: HashMap> = + HashMap::new(); + let index_accessor = runtime.block_on(index.read()); + for tenant_id in updated_tenants { + let tenant_entry = match index_accessor.tenant_entry(&tenant_id) { + Some(tenant_entry) => tenant_entry, + None => { + error!( + "cannot find tenant in remote index for timeline sync update" + ); + continue; + } + }; + + if tenant_entry.has_in_progress_downloads() { + info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration"); + continue; + } else { + info!( + "Tenant {tenant_id} download completed. 
Picking to register in repository" + ); + // Here we assume that if tenant has no in-progress downloads that + // means that it is the last completed timeline download that triggered + // sync status update. So we look at the index for available timelines + // and register them all at once in a repository for download + // to be submitted in a single operation to repository + // so it can apply them at once to internal timeline map. + sync_status_updates + .insert(tenant_id, tenant_entry.keys().copied().collect()); + } + } + drop(index_accessor); // Batch timeline download registration to ensure that the external registration code won't block any running tasks before. - apply_timeline_sync_status_updates(conf, &index, new_timeline_states); + attach_downloaded_tenants(conf, &index, sync_status_updates); } } ControlFlow::Break(()) => { @@ -935,6 +970,14 @@ fn storage_sync_loop( } } +// needed to check whether the download happened +// more informative than just a bool +#[derive(Debug)] +enum DownloadMarker { + Downloaded, + Nothing, +} + async fn process_batches( conf: &'static PageServerConf, max_sync_errors: NonZeroU32, @@ -942,7 +985,7 @@ async fn process_batches( index: &RemoteIndex, batched_tasks: HashMap, sync_queue: &SyncQueue, -) -> HashMap> +) -> HashSet where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -967,22 +1010,19 @@ where }) .collect::>(); - let mut new_timeline_states: HashMap< - ZTenantId, - HashMap, - > = HashMap::new(); + let mut downloaded_timelines = HashSet::new(); - while let Some((sync_id, state_update)) = sync_results.next().await { - debug!("Finished storage sync task for sync id {sync_id}"); - if let Some(state_update) = state_update { - new_timeline_states - .entry(sync_id.tenant_id) - .or_default() - .insert(sync_id.timeline_id, state_update); + while let Some((sync_id, download_marker)) = sync_results.next().await { + debug!( + "Finished storage sync task for sync id {sync_id} download marker {:?}", + 
download_marker + ); + if matches!(download_marker, DownloadMarker::Downloaded) { + downloaded_timelines.insert(sync_id.tenant_id); } } - new_timeline_states + downloaded_timelines } async fn process_sync_task_batch( @@ -991,7 +1031,7 @@ async fn process_sync_task_batch( max_sync_errors: NonZeroU32, sync_id: ZTenantTimelineId, batch: SyncTaskBatch, -) -> Option +) -> DownloadMarker where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1007,7 +1047,7 @@ where // in local (implicitly, via Lsn values and related memory state) or remote (explicitly via remote layer file paths) metadata. // When operating in a system without tasks failing over the error threshold, // current batching and task processing systems aim to update the layer set and metadata files (remote and local), - // without "loosing" such layer files. + // without "losing" such layer files. let (upload_result, status_update) = tokio::join!( async { if let Some(upload_data) = upload_data { @@ -1076,7 +1116,7 @@ where } } } - None + DownloadMarker::Nothing } .instrument(info_span!("download_timeline_data")), ); @@ -1130,7 +1170,7 @@ async fn download_timeline_data( new_download_data: SyncData, sync_start: Instant, task_name: &str, -) -> Option +) -> DownloadMarker where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -1146,36 +1186,36 @@ where .await { DownloadedTimeline::Abort => { - register_sync_status(sync_start, task_name, None); + register_sync_status(sync_id, sync_start, task_name, None); if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) { error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}"); } } DownloadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } DownloadedTimeline::Successful(mut download_data) => { match 
update_local_metadata(conf, sync_id, current_remote_timeline).await { Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) { Ok(()) => { - register_sync_status(sync_start, task_name, Some(true)); - return Some(TimelineSyncStatusUpdate::Downloaded); + register_sync_status(sync_id, sync_start, task_name, Some(true)); + return DownloadMarker::Downloaded; } Err(e) => { - error!("Timeline {sync_id} was expected to be in the remote index after a sucessful download, but it's absent: {e:?}"); + error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}"); } }, Err(e) => { error!("Failed to update local timeline metadata: {e:?}"); download_data.retries += 1; sync_queue.push(sync_id, SyncTask::Download(download_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } } } } - None + DownloadMarker::Nothing } async fn update_local_metadata( @@ -1186,7 +1226,7 @@ async fn update_local_metadata( let remote_metadata = match remote_timeline { Some(timeline) => &timeline.metadata, None => { - info!("No remote timeline to update local metadata from, skipping the update"); + debug!("No remote timeline to update local metadata from, skipping the update"); return Ok(()); } }; @@ -1265,14 +1305,14 @@ async fn delete_timeline_data( error!("Failed to update remote timeline {sync_id}: {e:?}"); new_delete_data.retries += 1; sync_queue.push(sync_id, SyncTask::Delete(new_delete_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); return; } } timeline_delete.deletion_registered = true; let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await; - register_sync_status(sync_start, task_name, Some(sync_status)); + register_sync_status(sync_id, sync_start, task_name, Some(sync_status)); } async fn 
read_metadata_file(metadata_path: &Path) -> anyhow::Result { @@ -1306,7 +1346,7 @@ async fn upload_timeline_data( .await { UploadedTimeline::FailedAndRescheduled => { - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); return; } UploadedTimeline::Successful(upload_data) => upload_data, @@ -1325,13 +1365,13 @@ async fn upload_timeline_data( .await { Ok(()) => { - register_sync_status(sync_start, task_name, Some(true)); + register_sync_status(sync_id, sync_start, task_name, Some(true)); } Err(e) => { error!("Failed to update remote timeline {sync_id}: {e:?}"); uploaded_data.retries += 1; sync_queue.push(sync_id, SyncTask::Upload(uploaded_data)); - register_sync_status(sync_start, task_name, Some(false)); + register_sync_status(sync_id, sync_start, task_name, Some(false)); } } } @@ -1421,7 +1461,14 @@ where IndexPart::from_remote_timeline(&timeline_path, updated_remote_timeline) .context("Failed to create an index part from the updated remote timeline")?; - info!("Uploading remote index for the timeline"); + debug!("Uploading remote index for the timeline"); + REMOTE_INDEX_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); + upload_index_part(conf, storage, sync_id, new_index_part) .await .context("Failed to upload new index part") @@ -1448,35 +1495,6 @@ async fn validate_task_retries( ControlFlow::Continue(sync_data) } -async fn try_fetch_index_parts( - conf: &'static PageServerConf, - storage: &S, - keys: HashSet, -) -> HashMap -where - P: Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, -{ - let mut index_parts = HashMap::with_capacity(keys.len()); - - let mut part_downloads = keys - .into_iter() - .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) - .collect::>(); - - while let Some((id, part_upload_result)) = part_downloads.next().await { - match part_upload_result { - 
Ok(index_part) => { - debug!("Successfully fetched index part for {id}"); - index_parts.insert(id, index_part); - } - Err(e) => warn!("Failed to fetch index part for {id}: {e}"), - } - } - - index_parts -} - fn schedule_first_sync_tasks( index: &mut RemoteTimelineIndex, sync_queue: &SyncQueue, @@ -1539,6 +1557,7 @@ fn schedule_first_sync_tasks( local_timeline_init_statuses } +/// bool in return value stands for awaits_download fn compare_local_and_remote_timeline( new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>, sync_id: ZTenantTimelineId, @@ -1548,14 +1567,6 @@ fn compare_local_and_remote_timeline( ) -> (LocalTimelineInitStatus, bool) { let remote_files = remote_entry.stored_files(); - // TODO probably here we need more sophisticated logic, - // if more data is available remotely can we just download whats there? - // without trying to upload something. It may be tricky, needs further investigation. - // For now looks strange that we can request upload - // and dowload for the same timeline simultaneously. - // (upload needs to be only for previously unsynced files, not whole timeline dir). 
- // If one of the tasks fails they will be reordered in the queue which can lead - // to timeline being stuck in evicted state let number_of_layers_to_download = remote_files.difference(&local_files).count(); let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 { new_sync_tasks.push_back(( @@ -1565,7 +1576,7 @@ fn compare_local_and_remote_timeline( }), )); (LocalTimelineInitStatus::NeedsSync, true) - // we do not need to manupulate with remote consistent lsn here + // we do not need to manipulate with remote consistent lsn here // because it will be updated when sync will be completed } else { (LocalTimelineInitStatus::LocallyComplete, false) @@ -1590,12 +1601,24 @@ fn compare_local_and_remote_timeline( (initial_timeline_status, awaits_download) } -fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option) { +fn register_sync_status( + sync_id: ZTenantTimelineId, + sync_start: Instant, + sync_name: &str, + sync_status: Option, +) { let secs_elapsed = sync_start.elapsed().as_secs_f64(); - info!("Processed a sync task in {secs_elapsed:.2} seconds"); + debug!("Processed a sync task in {secs_elapsed:.2} seconds"); + + let tenant_id = sync_id.tenant_id.to_string(); + let timeline_id = sync_id.timeline_id.to_string(); match sync_status { - Some(true) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "success"]), - Some(false) => IMAGE_SYNC_TIME.with_label_values(&[sync_name, "failure"]), + Some(true) => { + IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"]) + } + Some(false) => { + IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"]) + } None => return, } .observe(secs_elapsed) diff --git a/pageserver/src/storage_sync/delete.rs b/pageserver/src/storage_sync/delete.rs index 91c618d201..0dcd9c97fc 100644 --- a/pageserver/src/storage_sync/delete.rs +++ b/pageserver/src/storage_sync/delete.rs @@ -1,4 +1,4 @@ -//! 
Timeline synchrnonization logic to delete a bulk of timeline's remote files from the remote storage. +//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage. use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; diff --git a/pageserver/src/storage_sync/download.rs b/pageserver/src/storage_sync/download.rs index a28867f27e..a91eaaa7ca 100644 --- a/pageserver/src/storage_sync/download.rs +++ b/pageserver/src/storage_sync/download.rs @@ -1,10 +1,15 @@ -//! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory. +//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory. -use std::{collections::HashSet, fmt::Debug, path::Path}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Debug, + mem, + path::Path, +}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; -use remote_storage::{path_with_suffix_extension, RemoteStorage}; +use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage}; use tokio::{ fs, io::{self, AsyncWriteExt}, @@ -14,7 +19,7 @@ use tracing::{debug, error, info, warn}; use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; -use utils::zid::ZTenantTimelineId; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use super::{ index::{IndexPart, RemoteTimeline}, @@ -23,12 +28,155 @@ use super::{ pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; +// We collect timelines remotely available for each tenant +// in case we failed to gather all index parts (due to an error) +// Poisoned variant is returned. +// When data is received succesfully without errors Present variant is used. 
+pub enum TenantIndexParts { + Poisoned { + present: HashMap, + missing: HashSet, + }, + Present(HashMap), +} + +impl TenantIndexParts { + fn add_poisoned(&mut self, timeline_id: ZTimelineId) { + match self { + TenantIndexParts::Poisoned { missing, .. } => { + missing.insert(timeline_id); + } + TenantIndexParts::Present(present) => { + *self = TenantIndexParts::Poisoned { + present: mem::take(present), + missing: HashSet::from([timeline_id]), + } + } + } + } +} + +impl Default for TenantIndexParts { + fn default() -> Self { + TenantIndexParts::Present(HashMap::default()) + } +} + +pub async fn download_index_parts( + conf: &'static PageServerConf, + storage: &S, + keys: HashSet, +) -> HashMap +where + P: Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let mut index_parts: HashMap = HashMap::new(); + + let mut part_downloads = keys + .into_iter() + .map(|id| async move { (id, download_index_part(conf, storage, id).await) }) + .collect::>(); + + while let Some((id, part_upload_result)) = part_downloads.next().await { + match part_upload_result { + Ok(index_part) => { + debug!("Successfully fetched index part for {id}"); + match index_parts.entry(id.tenant_id).or_default() { + TenantIndexParts::Poisoned { present, .. } => { + present.insert(id.timeline_id, index_part); + } + TenantIndexParts::Present(parts) => { + parts.insert(id.timeline_id, index_part); + } + } + } + Err(download_error) => { + match download_error { + DownloadError::NotFound => { + // thats ok because it means that we didnt upload something we have locally for example + } + e => { + let tenant_parts = index_parts.entry(id.tenant_id).or_default(); + tenant_parts.add_poisoned(id.timeline_id); + error!( + "Failed to fetch index part for {id}: {e} poisoning tenant index parts" + ); + } + } + } + } + } + + index_parts +} + +/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests. 
+/// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines). +/// And then will attempt to download all index files that belong to these timelines. +pub async fn gather_tenant_timelines_index_parts( + conf: &'static PageServerConf, + storage: &S, + tenant_id: ZTenantId, +) -> anyhow::Result> +where + P: RemoteObjectName + Debug + Send + Sync + 'static, + S: RemoteStorage + Send + Sync + 'static, +{ + let tenant_path = conf.timelines_path(&tenant_id); + let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { + format!( + "Failed to get tenant storage path for local path '{}'", + tenant_path.display() + ) + })?; + let timelines = storage + .list_prefixes(Some(tenant_storage_path)) + .await + .with_context(|| { + format!( + "Failed to list tenant storage path to get remote timelines to download: {}", + tenant_id + ) + })?; + + let mut sync_ids = HashSet::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: ZTimelineId = object_name + .parse() + .with_context(|| { + format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'") + })?; + + sync_ids.insert(ZTenantTimelineId { + tenant_id, + timeline_id, + }); + } + + match download_index_parts(conf, storage, sync_ids) + .await + .remove(&tenant_id) + .ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))? + { + TenantIndexParts::Poisoned { missing, .. } => { + anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}") + } + TenantIndexParts::Present(parts) => Ok(parts), + } +} + /// Retrieves index data from the remote storage for a given timeline. 
-pub async fn download_index_part( +async fn download_index_part( conf: &'static PageServerConf, storage: &S, sync_id: ZTenantTimelineId, -) -> anyhow::Result +) -> Result where P: Debug + Send + Sync + 'static, S: RemoteStorage + Send + Sync + 'static, @@ -43,18 +191,29 @@ where "Failed to get the index part storage path for local path '{}'", index_part_path.display() ) - })?; - let mut index_part_bytes = Vec::new(); - storage - .download(&part_storage_path, &mut index_part_bytes) - .await - .with_context(|| { - format!("Failed to download an index part from storage path {part_storage_path:?}") - })?; + }) + .map_err(DownloadError::BadInput)?; - let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| { - format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'") - })?; + let mut index_part_download = storage.download(&part_storage_path).await?; + + let mut index_part_bytes = Vec::new(); + io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!("Failed to download an index part from storage path {part_storage_path:?}") + }) + .map_err(DownloadError::Other)?; + + let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) + .with_context(|| { + format!( + "Failed to deserialize index part file from storage path '{part_storage_path:?}'" + ) + }) + .map_err(DownloadError::Other)?; let missing_files = index_part.missing_files(); if !missing_files.is_empty() { @@ -162,15 +321,19 @@ where temp_file_path.display() ) })?; - - storage - .download(&layer_storage_path, &mut destination_file) + let mut download = storage + .download(&layer_storage_path) .await .with_context(|| { format!( - "Failed to download a layer from storage path '{layer_storage_path:?}'" + "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" ) })?; + io::copy(&mut download.download_stream, &mut 
destination_file).await.with_context(|| { + format!( + "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() + ) + })?; // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: // A file will not be closed immediately when it goes out of scope if there are any IO operations diff --git a/pageserver/src/storage_sync/index.rs b/pageserver/src/storage_sync/index.rs index 7764a810bc..134ae893bc 100644 --- a/pageserver/src/storage_sync/index.rs +++ b/pageserver/src/storage_sync/index.rs @@ -2,6 +2,7 @@ //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. +use std::ops::{Deref, DerefMut}; use std::{ collections::{HashMap, HashSet}, path::{Path, PathBuf}, @@ -12,9 +13,15 @@ use anyhow::{anyhow, Context, Ok}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use tokio::sync::RwLock; +use tracing::log::warn; use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata}; -use utils::{lsn::Lsn, zid::ZTenantTimelineId}; +use utils::{ + lsn::Lsn, + zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, +}; + +use super::download::TenantIndexParts; /// A part of the filesystem path, that needs a root to become a path again. 
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] @@ -41,38 +48,74 @@ impl RelativePath { } } +#[derive(Debug, Clone, Default)] +pub struct TenantEntry(HashMap); + +impl TenantEntry { + pub fn has_in_progress_downloads(&self) -> bool { + self.values() + .any(|remote_timeline| remote_timeline.awaits_download) + } +} + +impl Deref for TenantEntry { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TenantEntry { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl From> for TenantEntry { + fn from(inner: HashMap) -> Self { + Self(inner) + } +} + /// An index to track tenant files that exist on the remote storage. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct RemoteTimelineIndex { - timeline_entries: HashMap, + entries: HashMap, } /// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`]. +#[derive(Default)] pub struct RemoteIndex(Arc>); impl RemoteIndex { - pub fn empty() -> Self { - Self(Arc::new(RwLock::new(RemoteTimelineIndex { - timeline_entries: HashMap::new(), - }))) - } - pub fn from_parts( conf: &'static PageServerConf, - index_parts: HashMap, + index_parts: HashMap, ) -> anyhow::Result { - let mut timeline_entries = HashMap::new(); + let mut entries: HashMap = HashMap::new(); - for (sync_id, index_part) in index_parts { - let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id); - let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) - .context("Failed to restore remote timeline data from index part")?; - timeline_entries.insert(sync_id, remote_timeline); + for (tenant_id, index_parts) in index_parts { + match index_parts { + // TODO: should we schedule a retry so it can be recovered? 
otherwise we can revive it only through detach/attach or pageserver restart + TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"), + TenantIndexParts::Present(timelines) => { + for (timeline_id, index_part) in timelines { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + let remote_timeline = + RemoteTimeline::from_index_part(&timeline_path, index_part) + .context("Failed to restore remote timeline data from index part")?; + + entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, remote_timeline); + } + }, + } } - Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { - timeline_entries, - })))) + Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries })))) } pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> { @@ -91,20 +134,67 @@ impl Clone for RemoteIndex { } impl RemoteTimelineIndex { - pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> { - self.timeline_entries.get(id) + pub fn timeline_entry( + &self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: &ZTenantTimelineId, + ) -> Option<&RemoteTimeline> { + self.entries.get(tenant_id)?.get(timeline_id) } - pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> { - self.timeline_entries.get_mut(id) + pub fn timeline_entry_mut( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: &ZTenantTimelineId, + ) -> Option<&mut RemoteTimeline> { + self.entries.get_mut(tenant_id)?.get_mut(timeline_id) } - pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) { - self.timeline_entries.insert(id, entry); + pub fn add_timeline_entry( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + entry: RemoteTimeline, + ) { + self.entries + .entry(tenant_id) + .or_default() + .insert(timeline_id, entry); } - pub fn 
all_sync_ids(&self) -> impl Iterator + '_ { - self.timeline_entries.keys().copied() + pub fn remove_timeline_entry( + &mut self, + ZTenantTimelineId { + tenant_id, + timeline_id, + }: ZTenantTimelineId, + ) -> Option { + self.entries + .entry(tenant_id) + .or_default() + .remove(&timeline_id) + } + + pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> { + self.entries.get(tenant_id) + } + + pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> { + self.entries.get_mut(tenant_id) + } + + pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry { + self.entries.entry(tenant_id).or_default() + } + + pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option { + self.entries.remove(tenant_id) } pub fn set_awaits_download( @@ -273,7 +363,7 @@ mod tests { }; let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone()) - .expect("Correct remote timeline should be convertable to index part"); + .expect("Correct remote timeline should be convertible to index part"); assert_eq!( index_part.timeline_layers.iter().collect::>(), @@ -305,7 +395,7 @@ mod tests { ); let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part) - .expect("Correct index part should be convertable to remote timeline"); + .expect("Correct index part should be convertible to remote timeline"); let original_metadata = &remote_timeline.metadata; let restored_metadata = &restored_timeline.metadata; diff --git a/pageserver/src/storage_sync/upload.rs b/pageserver/src/storage_sync/upload.rs index 625ec7aed6..f9ab3b7471 100644 --- a/pageserver/src/storage_sync/upload.rs +++ b/pageserver/src/storage_sync/upload.rs @@ -4,6 +4,7 @@ use std::{fmt::Debug, path::PathBuf}; use anyhow::Context; use futures::stream::{FuturesUnordered, StreamExt}; +use lazy_static::lazy_static; use remote_storage::RemoteStorage; use tokio::fs; use tracing::{debug, error, info, warn}; @@ -17,6 +18,16 
@@ use super::{ use crate::{ config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask, }; +use metrics::{register_int_counter_vec, IntCounterVec}; + +lazy_static! { + static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!( + "pageserver_remote_storage_no_layers_uploads_total", + "Number of skipped uploads due to no layers", + &["tenant_id", "timeline_id"], + ) + .expect("failed to register pageserver no layers upload vec"); +} /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part( @@ -102,7 +113,13 @@ where .collect::>(); if layers_to_upload.is_empty() { - info!("No layers to upload after filtering, aborting"); + debug!("No layers to upload after filtering, aborting"); + NO_LAYERS_UPLOAD + .with_label_values(&[ + &sync_id.tenant_id.to_string(), + &sync_id.timeline_id.to_string(), + ]) + .inc(); return UploadedTimeline::Successful(upload_data); } @@ -391,7 +408,7 @@ mod tests { assert_eq!( upload.metadata, Some(metadata), - "Successful upload should not chage its metadata" + "Successful upload should not change its metadata" ); let storage_files = storage.list().await?; diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant_config.rs index 9bf223e59e..8811009743 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant_config.rs @@ -10,6 +10,7 @@ //! 
use crate::config::PageServerConf; use serde::{Deserialize, Serialize}; +use std::num::NonZeroU64; use std::path::PathBuf; use std::time::Duration; use utils::zid::ZTenantId; @@ -34,6 +35,9 @@ pub mod defaults { pub const DEFAULT_GC_PERIOD: &str = "100 s"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "30 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; } /// Per-tenant configuration options @@ -68,6 +72,17 @@ pub struct TenantConf { // Page versions older than this are garbage collected away. #[serde(with = "humantime_serde")] pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. 
+ pub max_lsn_wal_lag: NonZeroU64, } /// Same as TenantConf, but this struct preserves the information about @@ -85,6 +100,11 @@ pub struct TenantConfOpt { pub image_creation_threshold: Option, #[serde(with = "humantime_serde")] pub pitr_interval: Option, + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, + pub max_lsn_wal_lag: Option, } impl TenantConfOpt { @@ -108,6 +128,13 @@ impl TenantConfOpt { .image_creation_threshold .unwrap_or(global_conf.image_creation_threshold), pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval), + walreceiver_connect_timeout: self + .walreceiver_connect_timeout + .unwrap_or(global_conf.walreceiver_connect_timeout), + lagging_wal_timeout: self + .lagging_wal_timeout + .unwrap_or(global_conf.lagging_wal_timeout), + max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag), } } @@ -136,6 +163,15 @@ impl TenantConfOpt { if let Some(pitr_interval) = other.pitr_interval { self.pitr_interval = Some(pitr_interval); } + if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout { + self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout); + } + if let Some(lagging_wal_timeout) = other.lagging_wal_timeout { + self.lagging_wal_timeout = Some(lagging_wal_timeout); + } + if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { + self.max_lsn_wal_lag = Some(max_lsn_wal_lag); + } } } @@ -155,6 +191,14 @@ impl TenantConf { image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver 
lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), } } @@ -175,6 +219,16 @@ impl TenantConf { gc_period: Duration::from_secs(10), image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, pitr_interval: Duration::from_secs(60 * 60), + walreceiver_connect_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .unwrap(), + lagging_wal_timeout: humantime::parse_duration( + defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, + ) + .unwrap(), + max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .unwrap(), } } } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index bbe66d7f80..1759d3bbb8 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,41 +2,48 @@ //! page server. use crate::config::PageServerConf; -use crate::layered_repository::LayeredRepository; +use crate::layered_repository::{load_metadata, LayeredRepository}; use crate::pgdatadir_mapping::DatadirTimeline; -use crate::repository::{Repository, TimelineSyncStatusUpdate}; -use crate::storage_sync::index::RemoteIndex; +use crate::repository::Repository; +use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex}; use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData}; use crate::tenant_config::TenantConfOpt; -use crate::thread_mgr; use crate::thread_mgr::ThreadKind; -use crate::timelines; use crate::timelines::CreateRepo; use crate::walredo::PostgresRedoManager; +use crate::{thread_mgr, timelines, walreceiver}; use crate::{DatadirTimelineImpl, RepositoryImpl}; -use anyhow::{bail, Context}; +use anyhow::Context; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt; use std::sync::Arc; +use 
tokio::sync::mpsc; use tracing::*; +use utils::lsn::Lsn; -use utils::zid::{ZTenantId, ZTimelineId}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; mod tenants_state { + use anyhow::ensure; use std::{ collections::HashMap, sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}, }; + use tokio::sync::mpsc; + use tracing::{debug, error}; use utils::zid::ZTenantId; - use crate::tenant_mgr::Tenant; + use crate::tenant_mgr::{LocalTimelineUpdate, Tenant}; lazy_static::lazy_static! { static ref TENANTS: RwLock> = RwLock::new(HashMap::new()); + /// Sends updates to the local timelines (creation and deletion) to the WAL receiver, + /// so that it can enable/disable corresponding processes. + static ref TIMELINE_UPDATE_SENDER: RwLock>> = RwLock::new(None); } pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap> { @@ -50,6 +57,39 @@ mod tenants_state { .write() .expect("Failed to write() tenants lock, it got poisoned") } + + pub(super) fn set_timeline_update_sender( + timeline_updates_sender: mpsc::UnboundedSender, + ) -> anyhow::Result<()> { + let mut sender_guard = TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned"); + ensure!(sender_guard.is_none(), "Timeline update sender already set"); + *sender_guard = Some(timeline_updates_sender); + Ok(()) + } + + pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) { + match TIMELINE_UPDATE_SENDER + .read() + .expect("Failed to read() timeline_update_sender lock, it got poisoned") + .as_ref() + { + Some(sender) => { + if let Err(e) = sender.send(update) { + error!("Failed to send timeline update: {}", e); + } + } + None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"), + } + } + + pub(super) fn stop_timeline_update_sender() { + TIMELINE_UPDATE_SENDER + .write() + .expect("Failed to write() timeline_update_sender lock, it got poisoned") + .take(); + } } struct Tenant { @@ -86,10 +126,10 @@ pub enum TenantState 
{ impl fmt::Display for TenantState { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - TenantState::Active => f.write_str("Active"), - TenantState::Idle => f.write_str("Idle"), - TenantState::Stopping => f.write_str("Stopping"), - TenantState::Broken => f.write_str("Broken"), + Self::Active => f.write_str("Active"), + Self::Idle => f.write_str("Idle"), + Self::Stopping => f.write_str("Stopping"), + Self::Broken => f.write_str("Broken"), } } } @@ -98,6 +138,11 @@ impl fmt::Display for TenantState { /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the repository once download is completed. pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result { + let (timeline_updates_sender, timeline_updates_receiver) = + mpsc::unbounded_channel::(); + tenants_state::set_timeline_update_sender(timeline_updates_sender)?; + walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?; + let SyncStartupData { remote_index, local_timeline_init_statuses, @@ -112,9 +157,12 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result anyhow::Result, + }, + Attach { + id: ZTenantTimelineId, + datadir: Arc, + }, +} + +impl std::fmt::Debug for LocalTimelineUpdate { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(), + Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(), + } + } +} + /// Updates tenants' repositories, changing their timelines state in memory. 
-pub fn apply_timeline_sync_status_updates( +pub fn attach_downloaded_tenants( conf: &'static PageServerConf, remote_index: &RemoteIndex, - sync_status_updates: HashMap>, + sync_status_updates: HashMap>, ) { if sync_status_updates.is_empty() { - debug!("no sync status updates to apply"); + debug!("No sync status updates to apply"); return; } - info!( - "Applying sync status updates for {} timelines", - sync_status_updates.len() - ); - debug!("Sync status updates: {sync_status_updates:?}"); + for (tenant_id, downloaded_timelines) in sync_status_updates { + info!( + "Registering downlloaded timelines for {tenant_id} {} timelines", + downloaded_timelines.len() + ); + debug!("Downloaded timelines: {downloaded_timelines:?}"); - for (tenant_id, status_updates) in sync_status_updates { let repo = match load_local_repo(conf, tenant_id, remote_index) { Ok(repo) => repo, Err(e) => { - error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",); + error!("Failed to load repo for tenant {tenant_id} Error: {e:?}"); continue; } }; - match apply_timeline_remote_sync_status_updates(&repo, status_updates) { + match attach_downloaded_tenant(&repo, downloaded_timelines) { Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"), Err(e) => error!( "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}" @@ -159,6 +228,7 @@ pub fn apply_timeline_sync_status_updates( /// Shut down all tenants. This runs as part of pageserver shutdown. 
/// pub fn shutdown_all_tenants() { + tenants_state::stop_timeline_update_sender(); let mut m = tenants_state::write_tenants(); let mut tenantids = Vec::new(); for (tenantid, tenant) in m.iter_mut() { @@ -172,9 +242,7 @@ pub fn shutdown_all_tenants() { } drop(m); - thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); + thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None); // Ok, no background threads running anymore. Flush any remaining data in // memory to disk. @@ -246,60 +314,57 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { Some(tenants_state::read_tenants().get(&tenantid)?.state) } -/// -/// Change the state of a tenant to Active and launch its compactor and GC -/// threads. If the tenant was already in Active state or Stopping, does nothing. -/// -pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> { +pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { let mut m = tenants_state::write_tenants(); let tenant = m .get_mut(&tenant_id) .with_context(|| format!("Tenant not found for id {tenant_id}"))?; + let old_state = tenant.state; + tenant.state = new_state; + drop(m); - info!("activating tenant {tenant_id}"); + match (old_state, new_state) { + (TenantState::Broken, TenantState::Broken) + | (TenantState::Active, TenantState::Active) + | (TenantState::Idle, TenantState::Idle) + | (TenantState::Stopping, TenantState::Stopping) => { + debug!("tenant {tenant_id} already in state {new_state}"); + } + (TenantState::Broken, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state"); + } + (_, TenantState::Broken) => { + debug!("Setting tenant {tenant_id} status to broken"); + } + (TenantState::Stopping, ignored) => { + debug!("Ignoring {ignored} since tenant {tenant_id} is in 
stopping state"); + } + (TenantState::Idle, TenantState::Active) => { + info!("activating tenant {tenant_id}"); - match tenant.state { - // If the tenant is already active, nothing to do. - TenantState::Active => {} - - // If it's Idle, launch the compactor and GC threads - TenantState::Idle => { - thread_mgr::spawn( - ThreadKind::Compactor, + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + // TODO maybe use tokio::sync::watch instead? + crate::tenant_tasks::start_compaction_loop(tenant_id)?; + crate::tenant_tasks::start_gc_loop(tenant_id)?; + } + (TenantState::Idle, TenantState::Stopping) => { + info!("stopping idle tenant {tenant_id}"); + } + (TenantState::Active, TenantState::Stopping | TenantState::Idle) => { + info!("stopping tenant {tenant_id} threads due to new state {new_state}"); + thread_mgr::shutdown_threads( + Some(ThreadKind::WalReceiverManager), Some(tenant_id), None, - "Compactor thread", - false, - move || crate::tenant_threads::compact_loop(tenant_id), - )?; + ); - let gc_spawn_result = thread_mgr::spawn( - ThreadKind::GarbageCollector, - Some(tenant_id), - None, - "GC thread", - false, - move || crate::tenant_threads::gc_loop(tenant_id), - ) - .map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature - .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}")); - - if let Err(e) = &gc_spawn_result { - error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}"); - thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None); - return gc_spawn_result; - } - tenant.state = TenantState::Active; - } - - TenantState::Stopping => { - // don't re-activate it if it's being stopped - } - - TenantState::Broken => { - // cannot activate + // Wait until all gc/compaction tasks finish + let repo = get_repository_for_tenant(tenant_id)?; + let _guard = 
repo.file_lock.write().unwrap(); } } + Ok(()) } @@ -324,48 +389,104 @@ pub fn get_local_timeline_with_load( .with_context(|| format!("Tenant {tenant_id} not found"))?; if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) { - return Ok(Arc::clone(page_tline)); + Ok(Arc::clone(page_tline)) + } else { + let page_tline = load_local_timeline(&tenant.repo, timeline_id) + .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?; + tenant + .local_timelines + .insert(timeline_id, Arc::clone(&page_tline)); + Ok(page_tline) } - - let page_tline = new_local_timeline(&tenant.repo, timeline_id) - .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?; - tenant - .local_timelines - .insert(timeline_id, Arc::clone(&page_tline)); - Ok(page_tline) } -pub fn detach_timeline( - conf: &'static PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, -) -> anyhow::Result<()> { - // shutdown the timeline threads (this shuts down the walreceiver) - thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id)); +pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> { + // Start with the shutdown of timeline tasks (this shuts down the walreceiver) + // It is important that we do not take locks here, and do not check whether the timeline exists + // because if we hold tenants_state::write_tenants() while awaiting for the threads to join + // we cannot create new timelines and tenants, and that can take quite some time, + // it can even become stuck due to a bug making whole pageserver unavailable for some operations + // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation + // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests + // will synchronize and either fail with the not found error or succeed + let (sender, receiver) = 
std::sync::mpsc::channel::<()>(); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { + id: ZTenantTimelineId::new(tenant_id, timeline_id), + join_confirmation_sender: sender, + }); + + debug!("waiting for wal receiver to shutdown"); + let _ = receiver.recv(); + debug!("wal receiver shutdown confirmed"); + debug!("waiting for threads to shutdown"); + thread_mgr::shutdown_threads(None, None, Some(timeline_id)); + debug!("thread shutdown completed"); match tenants_state::write_tenants().get_mut(&tenant_id) { Some(tenant) => { - tenant - .repo - .detach_timeline(timeline_id) - .context("Failed to detach inmem tenant timeline")?; + tenant.repo.delete_timeline(timeline_id)?; tenant.local_timelines.remove(&timeline_id); } - None => bail!("Tenant {tenant_id} not found in local tenant state"), + None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"), } - let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id); - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + Ok(()) +} + +pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> { + set_tenant_state(tenant_id, TenantState::Stopping)?; + // shutdown the tenant and timeline threads: gc, compaction, page service threads) + thread_mgr::shutdown_threads(None, Some(tenant_id), None); + + // FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state? 
+ // send stop signal to wal receiver and collect join handles while holding the lock + let walreceiver_join_handles = { + let tenants = tenants_state::write_tenants(); + let tenant = tenants.get(&tenant_id).context("tenant not found")?; + let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len()); + for timeline_id in tenant.local_timelines.keys() { + let (sender, receiver) = std::sync::mpsc::channel::<()>(); + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach { + id: ZTenantTimelineId::new(tenant_id, *timeline_id), + join_confirmation_sender: sender, + }); + walreceiver_join_handles.push((*timeline_id, receiver)); + } + // drop the tenants lock + walreceiver_join_handles + }; + + // wait for wal receivers to stop without holding the lock, because walreceiver + // will attempt to change tenant state which is protected by the same global tenants lock. + // TODO do we need a timeout here? how to handle it? + // recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631 + // need to use crossbeam-channel + for (timeline_id, join_handle) in walreceiver_join_handles { + info!("waiting for wal receiver to shutdown timeline_id {timeline_id}"); + join_handle.recv().context("failed to join walreceiver")?; + info!("wal receiver shutdown confirmed timeline_id {timeline_id}"); + } + + tenants_state::write_tenants().remove(&tenant_id); + + // If removal fails there will be no way to successfully retry detach, + // because tenant no longer exists in in memory map. And it needs to be removed from it + // before we remove files because it contains references to repository + // which references ephemeral files which are deleted on drop. So if we keep these references + // code will attempt to remove files which no longer exist. 
This can be fixed by having shutdown + // mechanism for repository that will clean temporary data to avoid any references to ephemeral files + let local_tenant_directory = conf.tenant_path(&tenant_id); + std::fs::remove_dir_all(&local_tenant_directory).with_context(|| { format!( "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() + local_tenant_directory.display() ) })?; Ok(()) } -fn new_local_timeline( +fn load_local_timeline( repo: &RepositoryImpl, timeline_id: ZTimelineId, ) -> anyhow::Result>> { @@ -378,6 +499,12 @@ fn new_local_timeline( repartition_distance, )); page_tline.init_logical_size()?; + + tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach { + id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id), + datadir: Arc::clone(&page_tline), + }); + Ok(page_tline) } @@ -386,68 +513,105 @@ fn new_local_timeline( pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: ZTenantId, - pub state: TenantState, + pub state: Option, + pub has_in_progress_downloads: Option, } -pub fn list_tenants() -> Vec { +pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec { tenants_state::read_tenants() .iter() - .map(|(id, tenant)| TenantInfo { - id: *id, - state: tenant.state, + .map(|(id, tenant)| { + let has_in_progress_downloads = remote_index + .tenant_entry(id) + .map(|entry| entry.has_in_progress_downloads()); + + if has_in_progress_downloads.is_none() { + error!("timeline is not found in remote index while it is present in the tenants registry") + } + + TenantInfo { + id: *id, + state: Some(tenant.state), + has_in_progress_downloads, + } }) .collect() } +/// Check if a given timeline is "broken" \[1\]. +/// The function returns an error if the timeline is "broken". +/// +/// \[1\]: it's not clear now how should we classify a timeline as broken. 
+/// A timeline is categorized as broken when any of following conditions is true: +/// - failed to load the timeline's metadata +/// - the timeline's disk consistent LSN is zero +fn check_broken_timeline( + conf: &'static PageServerConf, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, +) -> anyhow::Result<()> { + let metadata = + load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?; + + // A timeline with zero disk consistent LSN can happen when the page server + // failed to checkpoint the timeline import data when creating that timeline. + if metadata.disk_consistent_lsn() == Lsn::INVALID { + anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN."); + } + + Ok(()) +} + +/// Note: all timelines are attached at once if and only if all of them are locally complete fn init_local_repository( conf: &'static PageServerConf, tenant_id: ZTenantId, local_timeline_init_statuses: HashMap, remote_index: &RemoteIndex, ) -> anyhow::Result<(), anyhow::Error> { - // initialize local tenant - let repo = load_local_repo(conf, tenant_id, remote_index) - .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; - - let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len()); + let mut timelines_to_attach = HashSet::new(); for (timeline_id, init_status) in local_timeline_init_statuses { match init_status { LocalTimelineInitStatus::LocallyComplete => { debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository"); - status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded); + check_broken_timeline(conf, tenant_id, timeline_id) + .context("found broken timeline")?; + timelines_to_attach.insert(timeline_id); } LocalTimelineInitStatus::NeedsSync => { debug!( "timeline {tenant_id} for tenant {timeline_id} needs sync, \ so skipped for adding into repository until sync is finished" ); + return Ok(()); } } } + // initialize local tenant + let repo = 
load_local_repo(conf, tenant_id, remote_index) + .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?; + // Lets fail here loudly to be on the safe side. // XXX: It may be a better api to actually distinguish between repository startup // and processing of newly downloaded timelines. - apply_timeline_remote_sync_status_updates(&repo, status_updates) + attach_downloaded_tenant(&repo, timelines_to_attach) .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?; Ok(()) } -fn apply_timeline_remote_sync_status_updates( +fn attach_downloaded_tenant( repo: &LayeredRepository, - status_updates: HashMap, + downloaded_timelines: HashSet, ) -> anyhow::Result<()> { - let mut registration_queue = Vec::with_capacity(status_updates.len()); + let mut registration_queue = Vec::with_capacity(downloaded_timelines.len()); // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration - for (timeline_id, status_update) in status_updates { - repo.apply_timeline_remote_sync_status_update(timeline_id, status_update) - .with_context(|| { - format!("Failed to load timeline {timeline_id} into in-memory repository") - })?; - match status_update { - TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id), - } + for timeline_id in downloaded_timelines { + repo.attach_timeline(timeline_id).with_context(|| { + format!("Failed to load timeline {timeline_id} into in-memory repository") + })?; + registration_queue.push(timeline_id); } for timeline_id in registration_queue { @@ -455,15 +619,15 @@ fn apply_timeline_remote_sync_status_updates( match tenants_state::write_tenants().get_mut(&tenant_id) { Some(tenant) => match tenant.local_timelines.entry(timeline_id) { Entry::Occupied(_) => { - bail!("Local timeline {timeline_id} already registered") + anyhow::bail!("Local timeline {timeline_id} already registered") } Entry::Vacant(v) => { - v.insert(new_local_timeline(repo, 
timeline_id).with_context(|| { - format!("Failed to register new local timeline for tenant {tenant_id}") + v.insert(load_local_timeline(repo, timeline_id).with_context(|| { + format!("Failed to register add local timeline for tenant {tenant_id}") })?); } }, - None => bail!( + None => anyhow::bail!( "Tenant {} not found in local tenant state", repo.tenant_id() ), diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs new file mode 100644 index 0000000000..b0bb4953ca --- /dev/null +++ b/pageserver/src/tenant_tasks.rs @@ -0,0 +1,286 @@ +//! This module contains functions to serve per-tenant background processes, +//! such as compaction and GC + +use std::collections::HashMap; +use std::ops::ControlFlow; +use std::time::Duration; + +use crate::repository::Repository; +use crate::tenant_mgr::TenantState; +use crate::thread_mgr::ThreadKind; +use crate::{tenant_mgr, thread_mgr}; +use anyhow::{self, Context}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use metrics::{register_int_counter_vec, IntCounterVec}; +use once_cell::sync::{Lazy, OnceCell}; +use tokio::sync::mpsc; +use tokio::sync::watch; +use tracing::*; +use utils::zid::ZTenantId; + +static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_task_events", + "Number of task start/stop/fail events.", + &["event"], + ) + .expect("Failed to register tenant_task_events metric") +}); + +/// +/// Compaction task's main loop +/// +async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { + loop { + trace!("waking up"); + + // Run blocking part of the task + let period: Result, _> = tokio::task::spawn_blocking(move || { + // Break if tenant is not active + if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { + return Ok(ControlFlow::Break(())); + } + + // Break if we're not allowed to write to disk + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // TODO do this inside 
repo.compaction_iteration instead. + let _guard = match repo.file_lock.try_read() { + Ok(g) => g, + Err(_) => return Ok(ControlFlow::Break(())), + }; + + // Run compaction + let compaction_period = repo.get_compaction_period(); + repo.compaction_iteration()?; + Ok(ControlFlow::Continue(compaction_period)) + }) + .await; + + // Decide whether to sleep or break + let sleep_duration = match period { + Ok(Ok(ControlFlow::Continue(period))) => period, + Ok(Ok(ControlFlow::Break(()))) => break, + Ok(Err(e)) => { + error!("Compaction failed, retrying: {}", e); + Duration::from_secs(2) + } + Err(e) => { + error!("Compaction join error, retrying: {}", e); + Duration::from_secs(2) + } + }; + + // Sleep + tokio::select! { + _ = cancel.changed() => { + trace!("received cancellation request"); + break; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + + trace!( + "compaction loop stopped. State is {:?}", + tenant_mgr::get_tenant_state(tenantid) + ); +} + +static START_GC_LOOP: OnceCell> = OnceCell::new(); +static START_COMPACTION_LOOP: OnceCell> = OnceCell::new(); + +/// Spawn a task that will periodically schedule garbage collection until +/// the tenant becomes inactive. This should be called on tenant +/// activation. +pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> { + START_GC_LOOP + .get() + .context("Failed to get START_GC_LOOP")? + .blocking_send(tenantid) + .context("Failed to send to START_GC_LOOP channel")?; + Ok(()) +} + +/// Spawn a task that will periodically schedule compaction until +/// the tenant becomes inactive. This should be called on tenant +/// activation. +pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> { + START_COMPACTION_LOOP + .get() + .context("failed to get START_COMPACTION_LOOP")? 
+ .blocking_send(tenantid) + .context("failed to send to START_COMPACTION_LOOP")?; + Ok(()) +} + +/// Spawn the TenantTaskManager +/// This needs to be called before start_gc_loop or start_compaction_loop +pub fn init_tenant_task_pool() -> anyhow::Result<()> { + let runtime = tokio::runtime::Builder::new_multi_thread() + .thread_name("tenant-task-worker") + .enable_all() + .build()?; + + let (gc_send, mut gc_recv) = mpsc::channel::(100); + START_GC_LOOP + .set(gc_send) + .expect("Failed to set START_GC_LOOP"); + + let (compaction_send, mut compaction_recv) = mpsc::channel::(100); + START_COMPACTION_LOOP + .set(compaction_send) + .expect("Failed to set START_COMPACTION_LOOP"); + + // TODO this is getting repetitive + let mut gc_loops = HashMap::>::new(); + let mut compaction_loops = HashMap::>::new(); + + thread_mgr::spawn( + ThreadKind::TenantTaskManager, + None, + None, + "Tenant task manager main thread", + true, + move || { + runtime.block_on(async move { + let mut futures = FuturesUnordered::new(); + loop { + tokio::select! 
{ + _ = thread_mgr::shutdown_watcher() => { + // Send cancellation to all tasks + for (_, cancel) in gc_loops.drain() { + cancel.send(()).ok(); + } + for (_, cancel) in compaction_loops.drain() { + cancel.send(()).ok(); + } + + // Exit after all tasks finish + while let Some(result) = futures.next().await { + match result { + Ok(()) => { + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + }, + Err(e) => { + TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); + error!("loop join error {}", e) + }, + } + } + break; + }, + tenantid = gc_recv.recv() => { + let tenantid = tenantid.expect("Gc task channel closed unexpectedly"); + + // Spawn new task, request cancellation of the old one if exists + let (cancel_send, cancel_recv) = watch::channel(()); + let handle = tokio::spawn(gc_loop(tenantid, cancel_recv) + .instrument(info_span!("gc loop", tenant = %tenantid))); + if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) { + old_cancel_send.send(()).ok(); + } + + // Update metrics, remember handle + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + futures.push(handle); + }, + tenantid = compaction_recv.recv() => { + let tenantid = tenantid.expect("Compaction task channel closed unexpectedly"); + + // Spawn new task, request cancellation of the old one if exists + let (cancel_send, cancel_recv) = watch::channel(()); + let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv) + .instrument(info_span!("compaction loop", tenant = %tenantid))); + if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) { + old_cancel_send.send(()).ok(); + } + + // Update metrics, remember handle + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + futures.push(handle); + }, + result = futures.next() => { + // Log and count any unhandled panics + match result { + Some(Ok(())) => { + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); + }, + Some(Err(e)) => { + TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc(); + 
error!("loop join error {}", e) + }, + None => {}, + }; + }, + } + } + }); + Ok(()) + }, + )?; + + Ok(()) +} + +/// +/// GC task's main loop +/// +async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) { + loop { + trace!("waking up"); + + // Run blocking part of the task + let period: Result, _> = tokio::task::spawn_blocking(move || { + // Break if tenant is not active + if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { + return Ok(ControlFlow::Break(())); + } + + // Break if we're not allowed to write to disk + let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; + // TODO do this inside repo.gc_iteration instead. + let _guard = match repo.file_lock.try_read() { + Ok(g) => g, + Err(_) => return Ok(ControlFlow::Break(())), + }; + + // Run gc + let gc_period = repo.get_gc_period(); + let gc_horizon = repo.get_gc_horizon(); + if gc_horizon > 0 { + repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; + } + + Ok(ControlFlow::Continue(gc_period)) + }) + .await; + + // Decide whether to sleep or break + let sleep_duration = match period { + Ok(Ok(ControlFlow::Continue(period))) => period, + Ok(Ok(ControlFlow::Break(()))) => break, + Ok(Err(e)) => { + error!("Gc failed, retrying: {}", e); + Duration::from_secs(2) + } + Err(e) => { + error!("Gc join error, retrying: {}", e); + Duration::from_secs(2) + } + }; + + // Sleep + tokio::select! { + _ = cancel.changed() => { + trace!("received cancellation request"); + break; + }, + _ = tokio::time::sleep(sleep_duration) => {}, + } + } + trace!( + "GC loop stopped. State is {:?}", + tenant_mgr::get_tenant_state(tenantid) + ); +} diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs deleted file mode 100644 index b904d9040d..0000000000 --- a/pageserver/src/tenant_threads.rs +++ /dev/null @@ -1,79 +0,0 @@ -//! This module contains functions to serve per-tenant background processes, -//! 
such as compaction and GC -use crate::repository::Repository; -use crate::tenant_mgr; -use crate::tenant_mgr::TenantState; -use anyhow::Result; -use std::time::Duration; -use tracing::*; -use utils::zid::ZTenantId; - -/// -/// Compaction thread's main loop -/// -pub fn compact_loop(tenantid: ZTenantId) -> Result<()> { - if let Err(err) = compact_loop_ext(tenantid) { - error!("compact loop terminated with error: {:?}", err); - Err(err) - } else { - Ok(()) - } -} - -fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let compaction_period = repo.get_compaction_period(); - - std::thread::sleep(compaction_period); - trace!("compaction thread for tenant {} waking up", tenantid); - - // Compact timelines - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.compaction_iteration()?; - } - - trace!( - "compaction thread stopped for tenant {} state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} - -/// -/// GC thread's main loop -/// -pub fn gc_loop(tenantid: ZTenantId) -> Result<()> { - loop { - if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { - break; - } - - trace!("gc thread for tenant {} waking up", tenantid); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - let gc_horizon = repo.get_gc_horizon(); - // Garbage collect old files that are not needed for PITR anymore - if gc_horizon > 0 { - repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?; - } - - // TODO Write it in more adequate way using - // condvar.wait_timeout() or something - let mut sleep_time = repo.get_gc_period().as_secs(); - while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active) - { - sleep_time -= 1; - std::thread::sleep(Duration::from_secs(1)); - } - } - trace!( - "GC thread stopped for tenant {} 
state is {:?}", - tenantid, - tenant_mgr::get_tenant_state(tenantid) - ); - Ok(()) -} diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 473cddda58..ab0d894c70 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -91,14 +91,11 @@ pub enum ThreadKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Thread that connects to a safekeeper to fetch WAL for one timeline. - WalReceiver, + // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL. + WalReceiverManager, - // Thread that handles compaction of all timelines for a tenant. - Compactor, - - // Thread that handles GC of a tenant - GarbageCollector, + // Thread that schedules new compaction and gc jobs + TenantTaskManager, // Thread that flushes frozen in-memory layers to disk LayerFlushThread, @@ -108,15 +105,21 @@ pub enum ThreadKind { StorageSync, } +struct MutableThreadState { + /// Tenant and timeline that this thread is associated with. + tenant_id: Option, + timeline_id: Option, + + /// Handle for waiting for the thread to exit. It can be None, if the + /// the thread has already exited. + join_handle: Option>, +} + struct PageServerThread { _thread_id: u64, kind: ThreadKind, - /// Tenant and timeline that this thread is associated with. - tenant_id: Option, - timeline_id: Option, - name: String, // To request thread shutdown, set the flag, and send a dummy message to the @@ -124,9 +127,7 @@ struct PageServerThread { shutdown_requested: AtomicBool, shutdown_tx: watch::Sender<()>, - /// Handle for waiting for the thread to exit. It can be None, if the - /// the thread has already exited. 
- join_handle: Mutex>>, + mutable: Mutex, } /// Launch a new thread @@ -145,29 +146,27 @@ where { let (shutdown_tx, shutdown_rx) = watch::channel(()); let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed); - let thread = PageServerThread { + let thread = Arc::new(PageServerThread { _thread_id: thread_id, kind, - tenant_id, - timeline_id, name: name.to_string(), - shutdown_requested: AtomicBool::new(false), shutdown_tx, - - join_handle: Mutex::new(None), - }; - - let thread_rc = Arc::new(thread); - - let mut jh_guard = thread_rc.join_handle.lock().unwrap(); + mutable: Mutex::new(MutableThreadState { + tenant_id, + timeline_id, + join_handle: None, + }), + }); THREADS .lock() .unwrap() - .insert(thread_id, Arc::clone(&thread_rc)); + .insert(thread_id, Arc::clone(&thread)); - let thread_rc2 = Arc::clone(&thread_rc); + let mut thread_mut = thread.mutable.lock().unwrap(); + + let thread_cloned = Arc::clone(&thread); let thread_name = name.to_string(); let join_handle = match thread::Builder::new() .name(name.to_string()) @@ -175,7 +174,7 @@ where thread_wrapper( thread_name, thread_id, - thread_rc2, + thread_cloned, shutdown_rx, shutdown_process_on_error, f, @@ -189,8 +188,8 @@ where return Err(err); } }; - *jh_guard = Some(join_handle); - drop(jh_guard); + thread_mut.join_handle = Some(join_handle); + drop(thread_mut); // The thread is now running. 
Nothing more to do here Ok(thread_id) @@ -229,19 +228,20 @@ fn thread_wrapper( .remove(&thread_id) .expect("no thread in registry"); + let thread_mut = thread.mutable.lock().unwrap(); match result { Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name), Ok(Err(err)) => { if shutdown_process_on_error { error!( "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); shutdown_pageserver(1); } else { error!( "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); } } @@ -249,19 +249,29 @@ fn thread_wrapper( if shutdown_process_on_error { error!( "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); shutdown_pageserver(1); } else { error!( "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - thread_name, thread.tenant_id, thread.timeline_id, err + thread_name, thread_mut.tenant_id, thread_mut.timeline_id, err ); } } } } +// expected to be called from the thread of the given id. +pub fn associate_with(tenant_id: Option, timeline_id: Option) { + CURRENT_THREAD.with(|ct| { + let borrowed = ct.borrow(); + let mut thread_mut = borrowed.as_ref().unwrap().mutable.lock().unwrap(); + thread_mut.tenant_id = tenant_id; + thread_mut.timeline_id = timeline_id; + }); +} + /// Is there a thread running that matches the criteria /// Signal and wait for threads to shut down. 
@@ -285,9 +295,10 @@ pub fn shutdown_threads( let threads = THREADS.lock().unwrap(); for thread in threads.values() { + let thread_mut = thread.mutable.lock().unwrap(); if (kind.is_none() || Some(thread.kind) == kind) - && (tenant_id.is_none() || thread.tenant_id == tenant_id) - && (timeline_id.is_none() || thread.timeline_id == timeline_id) + && (tenant_id.is_none() || thread_mut.tenant_id == tenant_id) + && (timeline_id.is_none() || thread_mut.timeline_id == timeline_id) { thread.shutdown_requested.store(true, Ordering::Relaxed); // FIXME: handle error? @@ -298,8 +309,10 @@ pub fn shutdown_threads( drop(threads); for thread in victim_threads { + let mut thread_mut = thread.mutable.lock().unwrap(); info!("waiting for {} to shut down", thread.name); - if let Some(join_handle) = thread.join_handle.lock().unwrap().take() { + if let Some(join_handle) = thread_mut.join_handle.take() { + drop(thread_mut); let _ = join_handle.join(); } else { // The thread had not even fully started yet. Or it was shut down diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs index eadf5bf4e0..a40e705cb9 100644 --- a/pageserver/src/timelines.rs +++ b/pageserver/src/timelines.rs @@ -202,7 +202,7 @@ pub fn create_repo( // anymore, but I think that could still happen. 
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {}); - (wal_redo_manager as _, RemoteIndex::empty()) + (wal_redo_manager as _, RemoteIndex::default()) } }; @@ -283,9 +283,9 @@ fn bootstrap_timeline( tli: ZTimelineId, repo: &R, ) -> Result<()> { - let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); - - let initdb_path = conf.tenant_path(&tenantid).join("tmp"); + let initdb_path = conf + .tenant_path(&tenantid) + .join(format!("tmp-timeline-{}", tli)); // Init temporarily repo to get bootstrap data run_initdb(conf, &initdb_path)?; @@ -300,10 +300,15 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline, u64::MAX); import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; + + fail::fail_point!("before-checkpoint-new-timeline", |_| { + bail!("failpoint before-checkpoint-new-timeline"); + }); + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; - println!( - "created initial timeline {} timeline.lsn {}", + info!( + "created root timeline {} timeline.lsn {}", tli, page_tline.tline.get_last_record_lsn() ); @@ -342,7 +347,7 @@ pub(crate) fn create_timeline( tenant_id: ZTenantId, new_timeline_id: Option, ancestor_timeline_id: Option, - ancestor_start_lsn: Option, + mut ancestor_start_lsn: Option, ) -> Result> { let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate); let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?; @@ -352,41 +357,35 @@ pub(crate) fn create_timeline( return Ok(None); } - let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0)); - let new_timeline_info = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = repo .get_timeline_load(ancestor_timeline_id) .context("Cannot branch off the timeline that's not present locally")?; - if start_lsn == Lsn(0) { - // Find end of WAL on the old timeline - let end_of_wal = 
ancestor_timeline.get_last_record_lsn(); - info!("branching at end of WAL: {}", end_of_wal); - start_lsn = end_of_wal; - } else { + if let Some(lsn) = ancestor_start_lsn.as_mut() { // Wait for the WAL to arrive and be processed on the parent branch up // to the requested branch point. The repository code itself doesn't // require it, but if we start to receive WAL on the new timeline, // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(start_lsn)?; - } - start_lsn = start_lsn.align(); + *lsn = lsn.align(); + ancestor_timeline.wait_lsn(*lsn)?; - let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); - if ancestor_ancestor_lsn > start_lsn { - // can we safely just branch from the ancestor instead? - anyhow::bail!( + let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn(); + if ancestor_ancestor_lsn > *lsn { + // can we safely just branch from the ancestor instead? + anyhow::bail!( "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}", - start_lsn, + lsn, ancestor_timeline_id, ancestor_ancestor_lsn, ); + } } - repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?; + + repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?; // load the timeline into memory let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 37d70372b5..a16e772238 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -336,7 +336,7 @@ impl VirtualFile { // library RwLock doesn't allow downgrading without releasing the lock, // and that doesn't seem worth the trouble. 
// - // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implemenation is fair and + // XXX: `parking_lot::RwLock` can enable such downgrades, yet its implementation is fair and // may deadlock on subsequent read calls. // Simply replacing all `RwLock` in project causes deadlocks, so use it sparingly. let result = STORAGE_IO_TIME diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 5223125ce6..2f39007e9f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -12,7 +12,7 @@ //! The zenith Repository can store page versions in two formats: as //! page images, or a WAL records. WalIngest::ingest_record() extracts //! page images out of some WAL records, but most it stores as WAL -//! records. If a WAL record modifies multple pages, WalIngest +//! records. If a WAL record modifies multiple pages, WalIngest //! will call Repository::put_wal_record or put_page_image functions //! separately for each modified page. //! diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index b8f349af8f..c36343db17 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -1,61 +1,52 @@ +//! WAL receiver manages an open connection to safekeeper, to get the WAL it streams into. +//! To do so, a current implementation needs to do the following: //! -//! WAL receiver connects to the WAL safekeeper service, streams WAL, -//! decodes records and saves them in the repository for the correct -//! timeline. +//! * acknowledge the timelines that it needs to stream WAL into. +//! Pageserver is able to dynamically (un)load tenants on attach and detach, +//! hence WAL receiver needs to react on such events. //! -//! We keep one WAL receiver active per timeline. +//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. +//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! 
The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. +//! Without this data, no WAL streaming is possible currently. +//! +//! Only one active WAL streaming connection is allowed at a time. +//! The connection is supposed to be updated periodically, based on safekeeper timeline data. +//! +//! * handle the actual connection and WAL streaming +//! +//! Handling happens dynamically, by portions of WAL being processed and registered in the server. +//! Along with the registration, certain metadata is written to show WAL streaming progress and rely on that when considering safekeepers for connection. +//! +//! The current module contains high-level primitives used in the submodules; general synchronization, timeline acknowledgement and shutdown logic. + +mod connection_manager; +mod walreceiver_connection; + +use anyhow::{ensure, Context}; +use etcd_broker::Client; +use itertools::Itertools; +use once_cell::sync::Lazy; +use std::cell::Cell; +use std::collections::{hash_map, HashMap, HashSet}; +use std::future::Future; +use std::num::NonZeroU64; +use std::sync::Arc; +use std::thread_local; +use std::time::Duration; +use tokio::{ + select, + sync::{mpsc, watch, RwLock}, + task::JoinHandle, +}; +use tracing::*; +use url::Url; use crate::config::PageServerConf; -use crate::repository::{Repository, Timeline}; -use crate::tenant_mgr; -use crate::thread_mgr; -use crate::thread_mgr::ThreadKind; -use crate::walingest::WalIngest; -use anyhow::{bail, Context, Error, Result}; -use bytes::BytesMut; -use fail::fail_point; -use lazy_static::lazy_static; -use postgres_ffi::waldecoder::*; -use postgres_protocol::message::backend::ReplicationMessage; -use postgres_types::PgLsn; -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; -use std::cell::Cell; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::Mutex; -use std::thread_local; -use std::time::SystemTime; -use tokio::pin; -use 
tokio_postgres::replication::ReplicationStream; -use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow}; -use tokio_stream::StreamExt; -use tracing::*; -use utils::{ - lsn::Lsn, - pq_proto::ZenithFeedback, - zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}, -}; - -/// -/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`. -/// We keep one WAL receiver active per timeline. -/// -#[serde_as] -#[derive(Debug, Serialize, Deserialize, Clone)] -pub struct WalReceiverEntry { - thread_id: u64, - wal_producer_connstr: String, - #[serde_as(as = "Option")] - last_received_msg_lsn: Option, - /// the timestamp (in microseconds) of the last received message - last_received_msg_ts: Option, -} - -lazy_static! { - static ref WAL_RECEIVERS: Mutex> = - Mutex::new(HashMap::new()); -} +use crate::http::models::WalReceiverEntry; +use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState}; +use crate::thread_mgr::{self, ThreadKind}; +use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; thread_local! { // Boolean that is true only for WAL receiver threads @@ -64,375 +55,326 @@ thread_local! { pub(crate) static IS_WAL_RECEIVER: Cell = Cell::new(false); } -fn drop_wal_receiver(tenantid: ZTenantId, timelineid: ZTimelineId) { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.remove(&(tenantid, timelineid)); -} +/// WAL receiver state for sharing with the outside world. +/// Only entries for timelines currently available in pageserver are stored. 
+static WAL_RECEIVER_ENTRIES: Lazy>> = + Lazy::new(|| RwLock::new(HashMap::new())); -// Launch a new WAL receiver, or tell one that's running about change in connection string -pub fn launch_wal_receiver( - conf: &'static PageServerConf, - tenantid: ZTenantId, - timelineid: ZTimelineId, - wal_producer_connstr: &str, -) -> Result<()> { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - - match receivers.get_mut(&(tenantid, timelineid)) { - Some(receiver) => { - debug!("wal receiver already running, updating connection string"); - receiver.wal_producer_connstr = wal_producer_connstr.into(); - } - None => { - let thread_id = thread_mgr::spawn( - ThreadKind::WalReceiver, - Some(tenantid), - Some(timelineid), - "WAL receiver thread", - false, - move || { - IS_WAL_RECEIVER.with(|c| c.set(true)); - thread_main(conf, tenantid, timelineid); - Ok(()) - }, - )?; - - let receiver = WalReceiverEntry { - thread_id, - wal_producer_connstr: wal_producer_connstr.into(), - last_received_msg_lsn: None, - last_received_msg_ts: None, - }; - receivers.insert((tenantid, timelineid), receiver); - - // Update tenant state and start tenant threads, if they are not running yet. - tenant_mgr::activate_tenant(tenantid)?; - } - }; - Ok(()) -} - -/// Look up a WAL receiver's data in the global `WAL_RECEIVERS` -pub fn get_wal_receiver_entry( +/// Gets the public WAL streaming entry for a certain timeline. +pub async fn get_wal_receiver_entry( tenant_id: ZTenantId, timeline_id: ZTimelineId, ) -> Option { - let receivers = WAL_RECEIVERS.lock().unwrap(); - receivers.get(&(tenant_id, timeline_id)).cloned() + WAL_RECEIVER_ENTRIES + .read() + .await + .get(&ZTenantTimelineId::new(tenant_id, timeline_id)) + .cloned() } -// -// This is the entry point for the WAL receiver thread. 
-// -fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId) { - let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered(); - info!("WAL receiver thread started"); - - // Look up the current WAL producer address - let wal_producer_connstr = { - match get_wal_receiver_entry(tenant_id, timeline_id) { - Some(e) => e.wal_producer_connstr, - None => { - info!( - "Unable to create the WAL receiver thread: no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, timeline_id - ); - return; - } - } - }; - - // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server, - // and start streaming WAL from it. - let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr); - - // TODO cleanup info messages - if let Err(e) = res { - info!("WAL streaming connection failed ({})", e); - } else { - info!( - "walreceiver disconnected tenant {}, timelineid {}", - tenant_id, timeline_id - ); - } - - // Drop it from list of active WAL_RECEIVERS - // so that next callmemaybe request launched a new thread - drop_wal_receiver(tenant_id, timeline_id); -} - -fn walreceiver_main( - _conf: &PageServerConf, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - wal_producer_connstr: &str, -) -> anyhow::Result<(), Error> { - // Connect to the database in replication mode. - info!("connecting to {:?}", wal_producer_connstr); - let connect_cfg = format!( - "{} application_name=pageserver replication=true", - wal_producer_connstr +/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline. +/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities. 
+pub fn init_wal_receiver_main_thread( + conf: &'static PageServerConf, + mut timeline_updates_receiver: mpsc::UnboundedReceiver, +) -> anyhow::Result<()> { + let etcd_endpoints = conf.broker_endpoints.clone(); + ensure!( + !etcd_endpoints.is_empty(), + "Cannot start wal receiver: etcd endpoints are empty" ); - - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - let (mut replication_client, connection) = - runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?; - // This is from tokio-postgres docs, but it is a bit weird in our case because we extensively use block_on - runtime.spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - info!("connected!"); - - // Immediately increment the gauge, then create a job to decrement it on thread exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } - - let identify = runtime.block_on(identify_system(&mut replication_client))?; - info!("{:?}", identify); - let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); - let mut caught_up = false; - - let repo = tenant_mgr::get_repository_for_tenant(tenant_id) - .with_context(|| format!("no repository found for tenant {}", tenant_id))?; - let timeline = - tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| { - format!( - "local timeline {} not found for tenant {}", - timeline_id, tenant_id - ) - })?; - let remote_index = repo.get_remote_index(); - - // - // Start streaming the WAL, from where we left off previously. - // - // If we had previously received WAL up to some point in the middle of a WAL record, we - // better start from the end of last full WAL record, not in the middle of one. 
- let mut last_rec_lsn = timeline.get_last_record_lsn(); - let mut startpoint = last_rec_lsn; - - if startpoint == Lsn(0) { - bail!("No previous WAL position"); - } - - // There might be some padding after the last full record, skip it. - startpoint += startpoint.calc_padding(8u32); - + let broker_prefix = &conf.broker_etcd_prefix; info!( - "last_record_lsn {} starting replication from {}, server is at {}...", - last_rec_lsn, startpoint, end_of_wal + "Starting wal receiver main thread, etdc endpoints: {}", + etcd_endpoints.iter().map(Url::to_string).join(", ") ); - let query = format!("START_REPLICATION PHYSICAL {}", startpoint); + let runtime = tokio::runtime::Builder::new_multi_thread() + .thread_name("wal-receiver-runtime-thread") + .enable_all() + .on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true))) + .build() + .context("Failed to create storage sync runtime")?; + let etcd_client = runtime + .block_on(Client::connect(etcd_endpoints, None)) + .context("Failed to connect to etcd")?; - let copy_stream = runtime.block_on(replication_client.copy_both_simple(&query))?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); + thread_mgr::spawn( + ThreadKind::WalReceiverManager, + None, + None, + "WAL receiver manager main thread", + true, + move || { + runtime.block_on(async move { + let mut local_timeline_wal_receivers = HashMap::new(); + loop { + select! 
{ + _ = thread_mgr::shutdown_watcher() => { + info!("Shutdown signal received"); + shutdown_all_wal_connections(&mut local_timeline_wal_receivers).await; + break; + }, + _ = wal_receiver_main_thread_loop_step( + broker_prefix, + &etcd_client, + &mut timeline_updates_receiver, + &mut local_timeline_wal_receivers, + ) => {}, + } + } + }.instrument(info_span!("wal_receiver_main"))); - let mut waldecoder = WalStreamDecoder::new(startpoint); + info!("Wal receiver main thread stopped"); + Ok(()) + }, + ) + .map(|_thread_id| ()) + .context("Failed to spawn wal receiver main thread") +} - let mut walingest = WalIngest::new(&*timeline, startpoint)?; - - while let Some(replication_message) = runtime.block_on(async { - let shutdown_watcher = thread_mgr::shutdown_watcher(); - tokio::select! { - // check for shutdown first - biased; - _ = shutdown_watcher => { - info!("walreceiver interrupted"); - None - } - replication_message = physical_stream.next() => replication_message, +async fn shutdown_all_wal_connections( + local_timeline_wal_receivers: &mut HashMap>>, +) { + info!("Shutting down all WAL connections"); + let mut broker_join_handles = Vec::new(); + for (tenant_id, timelines) in local_timeline_wal_receivers.drain() { + for (timeline_id, handles) in timelines { + handles.cancellation.send(()).ok(); + broker_join_handles.push(( + ZTenantTimelineId::new(tenant_id, timeline_id), + handles.handle, + )); } - }) { - let replication_message = replication_message?; - let status_update = match replication_message { - ReplicationMessage::XLogData(xlog_data) => { - // Pass the WAL data to the decoder, and see if we can decode - // more records as a result. - let data = xlog_data.data(); - let startlsn = Lsn::from(xlog_data.wal_start()); - let endlsn = startlsn + data.len() as u64; + } - trace!("received XLogData between {} and {}", startlsn, endlsn); - - waldecoder.feed_bytes(data); - - while let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - let _enter = info_span!("processing record", lsn = %lsn).entered(); - - // It is important to deal with the aligned records as lsn in getPage@LSN is - // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hitting a deadlock. - anyhow::ensure!(lsn.is_aligned()); - - walingest.ingest_record(&timeline, recdata, lsn)?; - - fail_point!("walreceiver-after-ingest"); - - last_rec_lsn = lsn; - } - - if !caught_up && endlsn >= end_of_wal { - info!("caught up at LSN {}", endlsn); - caught_up = true; - } - - timeline.tline.check_checkpoint_distance()?; - - Some(endlsn) + let mut tenants = HashSet::with_capacity(broker_join_handles.len()); + for (id, broker_join_handle) in broker_join_handles { + tenants.insert(id.tenant_id); + debug!("Waiting for wal broker for timeline {id} to finish"); + if let Err(e) = broker_join_handle.await { + error!("Failed to join on wal broker for timeline {id}: {e}"); + } + } + if let Err(e) = tokio::task::spawn_blocking(move || { + for tenant_id in tenants { + if let Err(e) = tenant_mgr::set_tenant_state(tenant_id, TenantState::Idle) { + error!("Failed to make tenant {tenant_id} idle: {e:?}"); } + } + }) + .await + { + error!("Failed to await a task to make all tenants idle: {e:?}"); + } +} - ReplicationMessage::PrimaryKeepAlive(keepalive) => { - let wal_end = keepalive.wal_end(); - let timestamp = keepalive.timestamp(); - let reply_requested = keepalive.reply() != 0; +/// A handle of an asynchronous task. +/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] +/// and a cancellation channel that it can listen to for earlier interrupts. +/// +/// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the newer one on submission. +/// That may lead to certain events not being observed by the listener. 
+#[derive(Debug)] +struct TaskHandle { + handle: JoinHandle>, + events_receiver: watch::Receiver>, + cancellation: watch::Sender<()>, +} - trace!( - "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})", - wal_end, - timestamp, - reply_requested, - ); +#[derive(Debug, Clone)] +pub enum TaskEvent { + Started, + NewEvent(E), + End(Result<(), String>), +} - if reply_requested { - Some(last_rec_lsn) - } else { - None - } +impl TaskHandle { + /// Initializes the task, starting it immediately after the creation. + pub fn spawn( + task: impl FnOnce(Arc>>, watch::Receiver<()>) -> Fut + Send + 'static, + ) -> Self + where + Fut: Future> + Send, + E: Sync + Send + 'static, + { + let (cancellation, cancellation_receiver) = watch::channel(()); + let (events_sender, events_receiver) = watch::channel(TaskEvent::Started); + let events_sender = Arc::new(events_sender); + + let sender = Arc::clone(&events_sender); + let handle = tokio::task::spawn(async move { + events_sender.send(TaskEvent::Started).ok(); + task(sender, cancellation_receiver).await + }); + + TaskHandle { + handle, + events_receiver, + cancellation, + } + } + + async fn next_task_event(&mut self) -> TaskEvent { + select! { + next_task_event = self.events_receiver.changed() => match next_task_event { + Ok(()) => self.events_receiver.borrow().clone(), + Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await, + }, + task_completion_result = join_on_handle(&mut self.handle) => task_completion_result, + } + } + + /// Aborts current task, waiting for it to finish. 
+ async fn shutdown(self) { + self.cancellation.send(()).ok(); + if let Err(e) = self.handle.await { + error!("Task failed to shut down: {e}") + } + } +} + +async fn join_on_handle(handle: &mut JoinHandle>) -> TaskEvent { + match handle.await { + Ok(task_result) => TaskEvent::End(task_result), + Err(e) => { + if e.is_cancelled() { + TaskEvent::End(Ok(())) + } else { + TaskEvent::End(Err(format!("WAL receiver task panicked: {e}"))) } + } + } +} - _ => None, - }; +/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery. +/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled. +/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled. +/// +/// Cannot fail, should always try to process the next timeline event even if the other one was not processed properly. +async fn wal_receiver_main_thread_loop_step<'a>( + broker_prefix: &'a str, + etcd_client: &'a Client, + timeline_updates_receiver: &'a mut mpsc::UnboundedReceiver, + local_timeline_wal_receivers: &'a mut HashMap>>, +) { + // Only react on updates from [`tenant_mgr`] on local timeline attach/detach. + match timeline_updates_receiver.recv().await { + Some(update) => { + info!("Processing timeline update: {update:?}"); + match update { + // Timeline got detached, stop all related tasks and remove public timeline data. 
+ LocalTimelineUpdate::Detach { + id, + join_confirmation_sender, + } => { + match local_timeline_wal_receivers.get_mut(&id.tenant_id) { + Some(wal_receivers) => { + if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) { + o.remove().shutdown().await + } + if wal_receivers.is_empty() { + if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Idle).await { + error!("Failed to make tenant idle for id {id}: {e:#}"); + } + } + } + None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"), + }; + { + WAL_RECEIVER_ENTRIES.write().await.remove(&id); + if let Err(e) = join_confirmation_sender.send(()) { + warn!("cannot send wal_receiver shutdown confirmation {e}") + } else { + info!("confirm walreceiver shutdown for {id}"); + } + } + } + // Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly. + LocalTimelineUpdate::Attach { id, datadir } => { + let timeline_connection_managers = local_timeline_wal_receivers + .entry(id.tenant_id) + .or_default(); - if let Some(last_lsn) = status_update { - let timeline_remote_consistent_lsn = runtime.block_on(async { - remote_index - .read() - .await - // here we either do not have this timeline in remote index - // or there were no checkpoints for it yet - .timeline_entry(&ZTenantTimelineId { - tenant_id, - timeline_id, - }) - .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) - .unwrap_or(Lsn(0)) // no checkpoint was uploaded - }); + if timeline_connection_managers.is_empty() { + if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await + { + error!("Failed to make tenant active for id {id}: {e:#}"); + return; + } + } - // The last LSN we processed. It is not guaranteed to survive pageserver crash. 
- let write_lsn = u64::from(last_lsn); - // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); - // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash - // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); - let ts = SystemTime::now(); + let vacant_connection_manager_entry = + match timeline_connection_managers.entry(id.timeline_id) { + hash_map::Entry::Occupied(_) => { + debug!("Attepted to readd an existing timeline {id}, ignoring"); + return; + } + hash_map::Entry::Vacant(v) => v, + }; - // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` - { - let mut receivers = WAL_RECEIVERS.lock().unwrap(); - let entry = match receivers.get_mut(&(tenant_id, timeline_id)) { - Some(e) => e, - None => { - anyhow::bail!( - "no WAL receiver entry found for tenant {} and timeline {}", - tenant_id, - timeline_id + let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) = + match fetch_tenant_settings(id.tenant_id).await { + Ok(settings) => settings, + Err(e) => { + error!("Failed to fetch tenant settings for id {id}: {e:#}"); + return; + } + }; + + { + WAL_RECEIVER_ENTRIES.write().await.insert( + id, + WalReceiverEntry { + wal_producer_connstr: None, + last_received_msg_lsn: None, + last_received_msg_ts: None, + }, ); } - }; - entry.last_received_msg_lsn = Some(last_lsn); - entry.last_received_msg_ts = Some( - ts.duration_since(SystemTime::UNIX_EPOCH) - .expect("Received message time should be before UNIX EPOCH!") - .as_micros(), - ); + vacant_connection_manager_entry.insert( + connection_manager::spawn_connection_manager_task( + id, + broker_prefix.to_owned(), + etcd_client.clone(), + datadir, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ), + ); + } } - - // Send zenith 
feedback message. - // Regular standby_status_update fields are put into this message. - let zenith_status_update = ZenithFeedback { - current_timeline_size: timeline.get_current_logical_size() as u64, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, - }; - - debug!("zenith_status_update {:?}", zenith_status_update); - - let mut data = BytesMut::new(); - zenith_status_update.serialize(&mut data)?; - runtime.block_on( - physical_stream - .as_mut() - .zenith_status_update(data.len() as u64, &data), - )?; + } + None => { + info!("Local timeline update channel closed"); + shutdown_all_wal_connections(local_timeline_wal_receivers).await; } } - - Ok(()) } -/// Data returned from the postgres `IDENTIFY_SYSTEM` command -/// -/// See the [postgres docs] for more details. -/// -/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html -#[derive(Debug)] -// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as -// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 -#[allow(dead_code)] -pub struct IdentifySystem { - systemid: u64, - timeline: u32, - xlogpos: PgLsn, - dbname: Option, +async fn fetch_tenant_settings( + tenant_id: ZTenantId, +) -> anyhow::Result<(Duration, Duration, NonZeroU64)> { + tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + Ok::<_, anyhow::Error>(( + repo.get_wal_receiver_connect_timeout(), + repo.get_lagging_wal_timeout(), + repo.get_max_lsn_wal_lag(), + )) + }) + .await + .with_context(|| format!("Failed to join on tenant {tenant_id} settings fetch task"))? } -/// There was a problem parsing the response to -/// a postgres IDENTIFY_SYSTEM command. 
-#[derive(Debug, thiserror::Error)] -#[error("IDENTIFY_SYSTEM parse error")] -pub struct IdentifyError; - -/// Run the postgres `IDENTIFY_SYSTEM` command -pub async fn identify_system(client: &mut Client) -> Result { - let query_str = "IDENTIFY_SYSTEM"; - let response = client.simple_query(query_str).await?; - - // get(N) from row, then parse it as some destination type. - fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result - where - T: FromStr, - { - let val = row.get(idx).ok_or(IdentifyError)?; - val.parse::().or(Err(IdentifyError)) - } - - // extract the row contents into an IdentifySystem struct. - // written as a closure so I can use ? for Option here. - if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { - Ok(IdentifySystem { - systemid: get_parse(first_row, 0)?, - timeline: get_parse(first_row, 1)?, - xlogpos: get_parse(first_row, 2)?, - dbname: get_parse(first_row, 3).ok(), - }) - } else { - Err(IdentifyError.into()) - } +async fn change_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> { + tokio::task::spawn_blocking(move || { + tenant_mgr::set_tenant_state(tenant_id, new_state) + .with_context(|| format!("Failed to activate tenant {tenant_id}")) + }) + .await + .with_context(|| format!("Failed to spawn activation task for tenant {tenant_id}"))? } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs new file mode 100644 index 0000000000..614bca50ad --- /dev/null +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -0,0 +1,1221 @@ +//! WAL receiver logic that ensures the pageserver gets connected to safekeeper, +//! that contains the latest WAL to stream and this connection does not go stale. +//! +//! To achieve that, an etcd broker is used: safekeepers propagate their timelines' state in it, +//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection. +//! 
Current connection state is tracked too, to ensure it's not getting stale. +//! +//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new connection leader, +//! then a [re]connection happens, if necessary. +//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel. + +use std::{ + collections::{hash_map, HashMap}, + num::NonZeroU64, + sync::Arc, + time::Duration, +}; + +use anyhow::Context; +use chrono::{DateTime, Local, NaiveDateTime, Utc}; +use etcd_broker::{ + subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, + BrokerUpdate, Client, +}; +use tokio::select; +use tracing::*; + +use crate::DatadirTimelineImpl; +use utils::{ + lsn::Lsn, + pq_proto::ReplicationFeedback, + zid::{NodeId, ZTenantTimelineId}, +}; + +use super::{TaskEvent, TaskHandle}; + +/// Spawns the loop to take care of the timeline's WAL streaming connection. +pub(super) fn spawn_connection_manager_task( + id: ZTenantTimelineId, + broker_loop_prefix: String, + mut client: Client, + local_timeline: Arc, + wal_connect_timeout: Duration, + lagging_wal_timeout: Duration, + max_lsn_wal_lag: NonZeroU64, +) -> TaskHandle<()> { + TaskHandle::spawn(move |_, mut cancellation| { + async move { + info!("WAL receiver broker started, connecting to etcd"); + let mut walreceiver_state = WalreceiverState::new( + id, + local_timeline, + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + ); + loop { + select! 
{ + _ = cancellation.changed() => { + info!("Broker subscription init cancelled, shutting down"); + if let Some(wal_connection) = walreceiver_state.wal_connection.take() + { + wal_connection.connection_task.shutdown().await; + } + return Ok(()); + }, + + _ = connection_manager_loop_step( + &broker_loop_prefix, + &mut client, + &mut walreceiver_state, + ) => {}, + } + } + } + .instrument(info_span!("wal_connection_manager", id = %id)) + }) +} + +/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. +/// Based on the updates, desides whether to start, keep or stop a WAL receiver task. +/// If etcd subscription is cancelled, exits. +async fn connection_manager_loop_step( + broker_prefix: &str, + etcd_client: &mut Client, + walreceiver_state: &mut WalreceiverState, +) { + let id = walreceiver_state.id; + + // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, + // running the entire loop step as much as possible to an end. + // The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, + // forcing the etcd subscription to exit either way. + let mut broker_subscription = + subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await; + info!("Subscribed for etcd timeline changes, waiting for new etcd data"); + + loop { + select! { + broker_connection_result = &mut broker_subscription.watcher_handle => { + cleanup_broker_connection(broker_connection_result, walreceiver_state); + return; + }, + + Some(wal_connection_update) = async { + match walreceiver_state.wal_connection.as_mut() { + Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), + None => None, + } + } => { + let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! 
guard"); + match &wal_connection_update { + TaskEvent::Started => { + wal_connection.latest_connection_update = Utc::now().naive_utc(); + *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1; + }, + TaskEvent::NewEvent(replication_feedback) => { + wal_connection.latest_connection_update = DateTime::::from(replication_feedback.ps_replytime).naive_utc(); + // reset connection attempts here only, the only place where both nodes + // explicitly confirmn with replication feedback that they are connected to each other + walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id); + }, + TaskEvent::End(end_result) => { + match end_result { + Ok(()) => debug!("WAL receiving task finished"), + Err(e) => warn!("WAL receiving task failed: {e}"), + }; + walreceiver_state.wal_connection = None; + }, + } + }, + + broker_update = broker_subscription.value_updates.recv() => { + match broker_update { + Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), + None => { + info!("Broker sender end was dropped, ending current broker loop step"); + // Ensure to cancel and wait for the broker subscription task end, to log its result. + // Broker sender end is in the broker subscription task and its drop means abnormal task completion. + // First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). + broker_subscription.watcher_handle.abort(); + // Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case), + // a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway. + cleanup_broker_connection( + (&mut broker_subscription.watcher_handle).await, + walreceiver_state, + ); + return; + } + } + }, + } + + // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. 
+ let mut max_events_to_poll = 100_u32; + while max_events_to_poll > 0 { + if let Ok(broker_update) = broker_subscription.value_updates.try_recv() { + walreceiver_state.register_timeline_update(broker_update); + max_events_to_poll -= 1; + } else { + break; + } + } + + if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { + info!("Switching to new connection candidate: {new_candidate:?}"); + walreceiver_state + .change_connection( + new_candidate.safekeeper_id, + new_candidate.wal_producer_connstr, + ) + .await + } + } +} + +fn cleanup_broker_connection( + broker_connection_result: Result, tokio::task::JoinError>, + walreceiver_state: &mut WalreceiverState, +) { + match broker_connection_result { + Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"), + Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"), + Err(abort_error) => { + if abort_error.is_panic() { + error!("Broker connection panicked: {abort_error}") + } else { + debug!("Broker connection aborted: {abort_error}") + } + } + } + + walreceiver_state.wal_stream_candidates.clear(); +} + +/// Endlessly try to subscribe for broker updates for a given timeline. +/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly. +/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. 
+async fn subscribe_for_timeline_updates(
+    etcd_client: &mut Client,
+    broker_prefix: &str,
+    id: ZTenantTimelineId,
+) -> BrokerSubscription {
+    let mut attempt = 0;
+    loop {
+        // No wait before the first attempt (`attempt == 0`); later attempts back off exponentially.
+        exponential_backoff(
+            attempt,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempt += 1;
+
+        match etcd_broker::subscribe_for_json_values(
+            etcd_client,
+            SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id),
+        )
+        .instrument(info_span!("etcd_subscription"))
+        .await
+        {
+            Ok(new_subscription) => {
+                return new_subscription;
+            }
+            Err(e) => {
+                warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}");
+                continue;
+            }
+        }
+    }
+}
+
+/// Base of the exponent used to compute the backoff delay, see `exponential_backoff`.
+const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0;
+/// Upper bound, in seconds, for a single backoff delay.
+const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0;
+
+/// Sleeps for `base^(n - 1)` seconds, capped at `max_seconds`.
+/// Returns immediately, without sleeping or logging, when `n` is 0.
+async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) {
+    if n == 0 {
+        return;
+    }
+    let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds);
+    info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task");
+    tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
+}
+
+/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
+// NOTE(review): several field types below (`Arc`, `Option`, `HashMap`) lost their generic arguments in this copy of the diff - confirm against the repository.
+struct WalreceiverState {
+    id: ZTenantTimelineId,
+    /// Use pageserver data about the timeline to filter out some of the safekeepers.
+    local_timeline: Arc,
+    /// The timeout on the connection to safekeeper for WAL streaming.
+    wal_connect_timeout: Duration,
+    /// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
+    lagging_wal_timeout: Duration,
+    /// The Lsn lag to use to determine when the current connection is lagging too much behind and reconnect to the other one.
+    max_lsn_wal_lag: NonZeroU64,
+    /// Current connection to safekeeper for WAL streaming.
+    wal_connection: Option,
+    /// Connection attempt counters, keyed by safekeeper node id (the map is indexed with `sk_id` values).
+    wal_connection_attempts: HashMap,
+    /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id.
+    wal_stream_candidates: HashMap,
+}
+
+/// Current connection data.
+#[derive(Debug)]
+struct WalConnection {
+    /// Current safekeeper pageserver is connected to for WAL streaming.
+    sk_id: NodeId,
+    /// Connection task start time or the timestamp of a latest connection message received.
+    latest_connection_update: NaiveDateTime,
+    /// WAL streaming task handle.
+    connection_task: TaskHandle,
+}
+
+/// Data about the timeline to connect to, received from etcd.
+#[derive(Debug)]
+struct EtcdSkTimeline {
+    /// Latest timeline data received from etcd for this safekeeper.
+    timeline: SkTimelineInfo,
+    /// Etcd generation, the bigger it is, the more up to date the timeline data is.
+    etcd_version: i64,
+    /// Time at which the data was fetched from etcd last time, to track the stale data.
+    latest_update: NaiveDateTime,
+}
+
+impl WalreceiverState {
+    /// Creates a state with no current WAL connection, no candidates and no recorded connection attempts.
+    fn new(
+        id: ZTenantTimelineId,
+        local_timeline: Arc,
+        wal_connect_timeout: Duration,
+        lagging_wal_timeout: Duration,
+        max_lsn_wal_lag: NonZeroU64,
+    ) -> Self {
+        Self {
+            id,
+            local_timeline,
+            wal_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            wal_connection: None,
+            wal_stream_candidates: HashMap::new(),
+            wal_connection_attempts: HashMap::new(),
+        }
+    }
+
+    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
+    async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_producer_connstr: String) {
+        // Wait for the old streaming task to shut down before the new one is spawned.
+        if let Some(old_connection) = self.wal_connection.take() {
+            old_connection.connection_task.shutdown().await
+        }
+
+        let id = self.id;
+        let connect_timeout = self.wal_connect_timeout;
+        // Attempts recorded so far for this safekeeper; they determine how long the new task
+        // backs off before it tries to connect.
+        let connection_attempt = self
+            .wal_connection_attempts
+            .get(&new_sk_id)
+            .copied()
+            .unwrap_or(0);
+        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
+            async move {
+                exponential_backoff(
+                    connection_attempt,
+                    DEFAULT_BASE_BACKOFF_SECONDS,
+                    DEFAULT_MAX_BACKOFF_SECONDS,
+                )
+                .await;
+                super::walreceiver_connection::handle_walreceiver_connection(
+                    id,
+                    &new_wal_producer_connstr,
+                    events_sender.as_ref(),
+                    cancellation,
+                    connect_timeout,
+                )
+                .await
+                .map_err(|e| format!("walreceiver connection handling failure: {e:#}"))
+            }
+            .instrument(info_span!("walreceiver_connection", id = %id))
+        });
+
+        // The spawn time is used as the initial "latest update" for this connection.
+        self.wal_connection = Some(WalConnection {
+            sk_id: new_sk_id,
+            latest_connection_update: Utc::now().naive_utc(),
+            connection_task: connection_handle,
+        });
+    }
+
+    /// Adds another etcd timeline into the state, if it's more recent than the one already added there for the same key.
+    ///
+    /// Entries are keyed by the safekeeper node id taken from the update key. An existing entry is
+    /// replaced only when the update carries a strictly bigger `etcd_version`.
+    fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) {
+        match self
+            .wal_stream_candidates
+            .entry(timeline_update.key.node_id)
+        {
+            hash_map::Entry::Occupied(mut o) => {
+                let existing_value = o.get_mut();
+                if existing_value.etcd_version < timeline_update.etcd_version {
+                    existing_value.etcd_version = timeline_update.etcd_version;
+                    existing_value.timeline = timeline_update.value;
+                    existing_value.latest_update = Utc::now().naive_utc();
+                }
+            }
+            hash_map::Entry::Vacant(v) => {
+                v.insert(EtcdSkTimeline {
+                    timeline: timeline_update.value,
+                    etcd_version: timeline_update.etcd_version,
+                    latest_update: Utc::now().naive_utc(),
+                });
+            }
+        }
+    }
+
+    /// Cleans up stale etcd records and checks the rest for the new connection candidate.
+    /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
+    /// The current rules for approving new candidates:
+    /// * pick from the input data from etcd for currently connected safekeeper (if any)
+    /// * out of the remaining input entries, pick the one with the biggest `commit_lsn` that's after the pageserver's latest Lsn for the timeline
+    /// * if there's no such entry, no new candidate found, abort
+    /// * check the current connection time data for staleness, reconnect if stale
+    /// * otherwise, check if etcd updates contain currently connected safekeeper
+    ///     * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection)
+    ///       Reconnect if the time exceeds the threshold.
+    ///     * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold
+    ///
+    /// This way we keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
+    /// Both thresholds are configured per tenant.
+    // NOTE(review): the `Option` return type lost its generic argument in this copy of the diff;
+    // the body returns `NewWalConnectionCandidate` values - confirm against the repository.
+    fn next_connection_candidate(&mut self) -> Option {
+        self.cleanup_old_candidates();
+
+        match &self.wal_connection {
+            Some(existing_wal_connection) => {
+                let connected_sk_node = existing_wal_connection.sk_id;
+
+                // Best candidate among the *other* safekeepers; without one there is nothing to switch to.
+                let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
+                    self.select_connection_candidate(Some(connected_sk_node))?;
+
+                // Staleness check: the current connection had no updates for longer than `lagging_wal_timeout`.
+                // `to_std()` fails for negative durations, in which case the check is skipped.
+                let now = Utc::now().naive_utc();
+                if let Ok(latest_interaciton) =
+                    (now - existing_wal_connection.latest_connection_update).to_std()
+                {
+                    if latest_interaciton > self.lagging_wal_timeout {
+                        return Some(NewWalConnectionCandidate {
+                            safekeeper_id: new_sk_id,
+                            wal_producer_connstr: new_wal_producer_connstr,
+                            reason: ReconnectReason::NoWalTimeout {
+                                last_wal_interaction: Some(
+                                    existing_wal_connection.latest_connection_update,
+                                ),
+                                check_time: now,
+                                threshold: self.lagging_wal_timeout,
+                            },
+                        });
+                    }
+                }
+
+                match self.wal_stream_candidates.get(&connected_sk_node) {
+                    Some(current_connection_etcd_data) => {
+                        // Lag check: compare commit Lsns reported via etcd; a missing Lsn counts as 0.
+                        let new_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
+                        let current_lsn = current_connection_etcd_data
+                            .timeline
+                            .commit_lsn
+                            .unwrap_or(Lsn(0));
+                        match new_lsn.0.checked_sub(current_lsn.0)
+                        {
+                            Some(new_sk_lsn_advantage) => {
+                                if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
+                                    return Some(
+                                        NewWalConnectionCandidate {
+                                            safekeeper_id: new_sk_id,
+                                            wal_producer_connstr: new_wal_producer_connstr,
+                                            reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag },
+                                        });
+                                }
+                            }
+                            // `checked_sub` returned `None`: the candidate's commit Lsn is smaller than the connected safekeeper's one.
+                            None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"),
+                        }
+                    }
+                    // No etcd record for the connected safekeeper: switch to the candidate.
+                    None => {
+                        return Some(NewWalConnectionCandidate {
+                            safekeeper_id: new_sk_id,
+                            wal_producer_connstr: new_wal_producer_connstr,
+                            reason: ReconnectReason::NoEtcdDataForExistingConnection,
+                        })
+                    }
+                }
+            }
+            // Not connected at all: any applicable candidate is taken.
+            None => {
+                let (new_sk_id, _, new_wal_producer_connstr) =
+                    self.select_connection_candidate(None)?;
+                return Some(NewWalConnectionCandidate {
+                    safekeeper_id: new_sk_id,
+                    wal_producer_connstr: new_wal_producer_connstr,
+                    reason: ReconnectReason::NoExistingConnection,
+                });
+            }
+        }
+
+        None
+    }
+
+    /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
+    /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
+    ///
+    /// The candidate that is chosen:
+    /// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
+    /// * has greatest data Lsn among the ones that are left
+    ///
+    /// NOTE:
+    /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but
+    /// otherwise to reset the connection attempts, a successful connection to that node is needed.
+    /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored.
+    // NOTE(review): `Option`, `collect::>()` and `impl Iterator` below lost their generic arguments
+    // in this copy of the diff - confirm against the repository.
+    fn select_connection_candidate(
+        &self,
+        node_to_omit: Option,
+    ) -> Option<(NodeId, &SkTimelineInfo, String)> {
+        let all_candidates = self
+            .applicable_connection_candidates()
+            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
+            .collect::>();
+
+        // Smallest number of connection attempts among the candidates; `?` returns `None` when there are no candidates.
+        let smallest_attempts_allowed = all_candidates
+            .iter()
+            .map(|(sk_id, _, _)| {
+                self.wal_connection_attempts
+                    .get(sk_id)
+                    .copied()
+                    .unwrap_or(0)
+            })
+            .min()?;
+
+        // Keep only the candidates with that minimal attempt count, then take the biggest `commit_lsn`.
+        all_candidates
+            .into_iter()
+            .filter(|(sk_id, _, _)| {
+                smallest_attempts_allowed
+                    >= self
+                        .wal_connection_attempts
+                        .get(sk_id)
+                        .copied()
+                        .unwrap_or(0)
+            })
+            .max_by_key(|(_, info, _)| info.commit_lsn)
+    }
+
+    /// Iterates over the etcd entries that can be connected to: the ones whose `commit_lsn` is ahead of
+    /// the local timeline's last record Lsn, that have a `safekeeper_connstr`, and for which a
+    /// connection string can be built. Entries whose connection string fails to build are logged and skipped.
+    fn applicable_connection_candidates(
+        &self,
+    ) -> impl Iterator {
+        self.wal_stream_candidates
+            .iter()
+            .filter(|(_, etcd_info)| {
+                etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())
+            })
+            .filter_map(|(sk_id, etcd_info)| {
+                let info = &etcd_info.timeline;
+                match wal_stream_connection_string(
+                    self.id,
+                    info.safekeeper_connstr.as_deref()?,
+                ) {
+                    Ok(connstr) => Some((*sk_id, info, connstr)),
+                    Err(e) => {
+                        error!("Failed to create wal receiver connection string from broker data of safekeeper node {}: {e:#}", sk_id);
+                        None
+                    }
+                }
+            })
+    }
+
+    /// Drops etcd entries that were not updated for `lagging_wal_timeout` or longer,
+    /// together with the connection attempt counters of the dropped nodes.
+    fn cleanup_old_candidates(&mut self) {
+        let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
+
+        self.wal_stream_candidates.retain(|node_id, etcd_info| {
+            if let Ok(time_since_latest_etcd_update) =
+                (Utc::now().naive_utc() - etcd_info.latest_update).to_std()
+            {
+                let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
+                if !should_retain {
+                    node_ids_to_remove.push(*node_id);
+                }
+                should_retain
+            } else {
+                // `to_std()` fails for negative durations, i.e. `latest_update` is in the future: keep the entry.
+                true
+            }
+        });
+
+        for node_id in node_ids_to_remove {
+            self.wal_connection_attempts.remove(&node_id);
+        }
+    }
+}
+
+/// A safekeeper to (re)connect to, together with the reason it was selected.
+#[derive(Debug, PartialEq, Eq)]
+struct NewWalConnectionCandidate {
+    safekeeper_id: NodeId,
+    wal_producer_connstr: String,
+    reason: ReconnectReason,
+}
+
+/// Stores the reason why WAL connection was switched, for further debugging purposes.
+#[derive(Debug, PartialEq, Eq)]
+enum ReconnectReason {
+    /// There was no WAL connection at all.
+    NoExistingConnection,
+    /// The connected safekeeper has no entry among the etcd candidates.
+    NoEtcdDataForExistingConnection,
+    /// Another safekeeper's commit Lsn is ahead of the connected one's by at least `threshold`.
+    LaggingWal {
+        current_lsn: Lsn,
+        new_lsn: Lsn,
+        threshold: NonZeroU64,
+    },
+    /// The current connection had no updates for longer than `threshold`.
+    // NOTE(review): `Option` lost its generic argument in this copy of the diff - confirm against the repository.
+    NoWalTimeout {
+        last_wal_interaction: Option,
+        check_time: NaiveDateTime,
+        threshold: Duration,
+    },
+}
+
+/// Builds the connection string for WAL streaming from a safekeeper's `host:port` address:
+/// the address is parsed via a dummy `postgresql://` URL to extract host and port, and the
+/// tenant and timeline ids are passed in the `options` parameter.
+// NOTE(review): `anyhow::Result` and `.parse::()` lost their generic arguments in this copy of the diff - confirm against the repository.
+fn wal_stream_connection_string(
+    ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: ZTenantTimelineId,
+    listen_pg_addr_str: &str,
+) -> anyhow::Result {
+    let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
+    let me_conf = sk_connstr
+        .parse::()
+        .with_context(|| {
+            format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
+        })?;
+    let (host, port) = utils::connstring::connection_host_port(&me_conf);
+    Ok(format!(
+        "host={host} port={port} options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::SystemTime;
+
+    use crate::repository::{
+        repo_harness::{RepoHarness, TIMELINE_ID},
+        Repository,
+    };
+
+    use super::*;
+
+    // No connection and no usable candidate: entries lack a connstr, lack a commit_lsn, or are stale.
+    #[test]
+    fn no_connection_no_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("no_connection_no_candidate")?;
+        let mut state = dummy_state(&harness);
+        let now = Utc::now().naive_utc();
+
+        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let delay_over_threshold = now - lagging_wal_timeout - lagging_wal_timeout;
+
+        state.wal_connection = None;
+        state.wal_stream_candidates = HashMap::from([
+            (
+                NodeId(0),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(1)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: None,
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: None,
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("no commit_lsn".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(2),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: None,
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("no commit_lsn".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(3),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: delay_over_threshold,
+                },
+            ),
+        ]);
+
+        let no_candidate = state.next_connection_candidate();
+        assert!(
+            no_candidate.is_none(),
+            "Expected no candidate selected out of non full data options, but got {no_candidate:?}"
+        );
+
+        Ok(())
+    }
+
+    // Existing fresh connection; the other safekeepers are not ahead by the lag threshold, so no switch is expected.
+    #[tokio::test]
+    async fn connection_no_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("connection_no_candidate")?;
+        let mut state = dummy_state(&harness);
+        let now = Utc::now().naive_utc();
+
+        let connected_sk_id = NodeId(0);
+        let current_lsn = 100_000;
+
+        state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
+        state.wal_connection = Some(WalConnection {
+            sk_id: connected_sk_id,
+            latest_connection_update: now,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskEvent::NewEvent(ReplicationFeedback {
+                        current_timeline_size: 1,
+                        ps_writelsn: 1,
+                        ps_applylsn: current_lsn,
+                        ps_flushlsn: 1,
+                        ps_replytime: SystemTime::now(),
+                    }))
+                    .ok();
+                Ok(())
+            }),
+        });
+        state.wal_stream_candidates = HashMap::from([
+            (
+                connected_sk_id,
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(current_lsn)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("not advanced Lsn".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(2),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+        ]);
+
+        let no_candidate = state.next_connection_candidate();
+        assert!(
+            no_candidate.is_none(),
+            "Expected no candidate selected out of valid options since candidate Lsn data is ignored and others' was not advanced enough, but got {no_candidate:?}"
+        );
+
+        Ok(())
+    }
+
+    // No connection: first a single valid candidate, then the biggest commit_lsn among the entries that have a connstr.
+    #[test]
+    fn no_connection_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("no_connection_candidate")?;
+        let mut state = dummy_state(&harness);
+        let now = Utc::now().naive_utc();
+
+        state.wal_connection = None;
+        state.wal_stream_candidates = HashMap::from([(
+            NodeId(0),
+            EtcdSkTimeline {
+                timeline: SkTimelineInfo {
+                    last_log_term: None,
+                    flush_lsn: None,
+                    commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
+                    backup_lsn: None,
+                    remote_consistent_lsn: None,
+                    peer_horizon_lsn: None,
+                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                },
+                etcd_version: 0,
+                latest_update: now,
+            },
+        )]);
+
+        let only_candidate = state
+            .next_connection_candidate()
+            .expect("Expected one candidate selected out of the only data option, but got none");
+        assert_eq!(only_candidate.safekeeper_id, NodeId(0));
+        assert_eq!(
+            only_candidate.reason,
+            ReconnectReason::NoExistingConnection,
+            "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
+        );
+        assert!(only_candidate
+            .wal_producer_connstr
+            .contains(DUMMY_SAFEKEEPER_CONNSTR));
+
+        let selected_lsn = 100_000;
+        state.wal_stream_candidates = HashMap::from([
+            (
+                NodeId(0),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(selected_lsn - 100)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("smaller commit_lsn".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(selected_lsn)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(2),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(Lsn(selected_lsn + 100)),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: None,
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+        ]);
+        let biggest_wal_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(biggest_wal_candidate.safekeeper_id, NodeId(1));
+        assert_eq!(
+            biggest_wal_candidate.reason,
+            ReconnectReason::NoExistingConnection,
+            "Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
+        );
+        assert!(biggest_wal_candidate
+            .wal_producer_connstr
+            .contains(DUMMY_SAFEKEEPER_CONNSTR));
+
+        Ok(())
+    }
+
+    // The node with fewer recorded connection attempts is preferred over the one with the bigger commit_lsn.
+    #[tokio::test]
+    async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("candidate_with_many_connection_failures")?;
+        let mut state = dummy_state(&harness);
+        let now = Utc::now().naive_utc();
+
+        let current_lsn = Lsn(100_000).align();
+        let bigger_lsn = Lsn(current_lsn.0 + 100).align();
+
+        state.wal_connection = None;
+        state.wal_stream_candidates = HashMap::from([
+            (
+                NodeId(0),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(bigger_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(current_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+        ]);
+        state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
+
+        let candidate_with_less_errors = state
+            .next_connection_candidate()
+            .expect("Expected one candidate selected, but got none");
+        assert_eq!(
+            candidate_with_less_errors.safekeeper_id,
+            NodeId(1),
+            "Should select the node with less connection errors"
+        );
+
+        Ok(())
+    }
+
+    // Connected safekeeper has no etcd entry, another one does: switch with NoEtcdDataForExistingConnection.
+    #[tokio::test]
+    async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;
+        let mut state = dummy_state(&harness);
+
+        let now = Utc::now().naive_utc();
+        let current_lsn = Lsn(100_000).align();
+        let connected_sk_id = NodeId(0);
+        let other_sk_id = NodeId(connected_sk_id.0 + 1);
+
+        state.wal_connection = Some(WalConnection {
+            sk_id: connected_sk_id,
+            latest_connection_update: now,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskEvent::NewEvent(ReplicationFeedback {
+                        current_timeline_size: 1,
+                        ps_writelsn: current_lsn.0,
+                        ps_applylsn: 1,
+                        ps_flushlsn: 1,
+                        ps_replytime: SystemTime::now(),
+                    }))
+                    .ok();
+                Ok(())
+            }),
+        });
+        state.wal_stream_candidates = HashMap::from([(
+            other_sk_id,
+            EtcdSkTimeline {
+                timeline: SkTimelineInfo {
+                    last_log_term: None,
+                    flush_lsn: None,
+                    commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
+                    backup_lsn: None,
+                    remote_consistent_lsn: None,
+                    peer_horizon_lsn: None,
+                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                },
+                etcd_version: 0,
+                latest_update: now,
+            },
+        )]);
+
+        let only_candidate = state
+            .next_connection_candidate()
+            .expect("Expected one candidate selected out of the only data option, but got none");
+        assert_eq!(only_candidate.safekeeper_id, other_sk_id);
+        assert_eq!(
+            only_candidate.reason,
+            ReconnectReason::NoEtcdDataForExistingConnection,
+            "Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper"
+        );
+        assert!(only_candidate
+            .wal_producer_connstr
+            .contains(DUMMY_SAFEKEEPER_CONNSTR));
+
+        Ok(())
+    }
+
+    // Another safekeeper is ahead of the connected one by more than `max_lsn_wal_lag`: switch with LaggingWal.
+    #[tokio::test]
+    async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?;
+        let mut state = dummy_state(&harness);
+        let current_lsn = Lsn(100_000).align();
+        let now = Utc::now().naive_utc();
+
+        let connected_sk_id = NodeId(0);
+        let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1);
+
+        state.wal_connection = Some(WalConnection {
+            sk_id: connected_sk_id,
+            latest_connection_update: now,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskEvent::NewEvent(ReplicationFeedback {
+                        current_timeline_size: 1,
+                        ps_writelsn: current_lsn.0,
+                        ps_applylsn: 1,
+                        ps_flushlsn: 1,
+                        ps_replytime: SystemTime::now(),
+                    }))
+                    .ok();
+                Ok(())
+            }),
+        });
+        state.wal_stream_candidates = HashMap::from([
+            (
+                connected_sk_id,
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(current_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+            (
+                NodeId(1),
+                EtcdSkTimeline {
+                    timeline: SkTimelineInfo {
+                        last_log_term: None,
+                        flush_lsn: None,
+                        commit_lsn: Some(new_lsn),
+                        backup_lsn: None,
+                        remote_consistent_lsn: None,
+                        peer_horizon_lsn: None,
+                        safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
+                    },
+                    etcd_version: 0,
+                    latest_update: now,
+                },
+            ),
+        ]);
+
+        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(1));
+        assert_eq!(
+            over_threshcurrent_candidate.reason,
+            ReconnectReason::LaggingWal {
+                current_lsn,
+                new_lsn,
+                threshold: state.max_lsn_wal_lag
+            },
+            "Should select bigger WAL safekeeper if it starts to lag enough"
+        );
+        assert!(over_threshcurrent_candidate
+            .wal_producer_connstr
+            .contains("advanced by Lsn safekeeper"));
+
+        Ok(())
+    }
+
+    // The connection's latest update is older than the timeout: switch with NoWalTimeout.
+    #[tokio::test]
+    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?;
+        let mut state = dummy_state(&harness);
+        let current_lsn = Lsn(100_000).align();
+        let now = Utc::now().naive_utc();
+
+        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let time_over_threshold =
+            Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
+
+        state.wal_connection = Some(WalConnection {
+            sk_id: NodeId(1),
+            latest_connection_update: time_over_threshold,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskEvent::NewEvent(ReplicationFeedback {
+                        current_timeline_size: 1,
+                        ps_writelsn: current_lsn.0,
+                        ps_applylsn: 1,
+                        ps_flushlsn: 1,
+                        ps_replytime: SystemTime::now(),
+                    }))
+                    .ok();
+                Ok(())
+            }),
+        });
+        state.wal_stream_candidates = HashMap::from([(
+            NodeId(0),
+            EtcdSkTimeline {
+                timeline: SkTimelineInfo {
+                    last_log_term: None,
+                    flush_lsn: None,
+                    commit_lsn: Some(current_lsn),
+                    backup_lsn: None,
+                    remote_consistent_lsn: None,
+                    peer_horizon_lsn: None,
+                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                },
+                etcd_version: 0,
+                latest_update: now,
+            },
+        )]);
+
+        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
+        match over_threshcurrent_candidate.reason {
+            ReconnectReason::NoWalTimeout {
+                last_wal_interaction,
+                threshold,
+                ..
+            } => {
+                assert_eq!(last_wal_interaction, Some(time_over_threshold));
+                assert_eq!(threshold, state.lagging_wal_timeout);
+            }
+            unexpected => panic!("Unexpected reason: {unexpected:?}"),
+        }
+        assert!(over_threshcurrent_candidate
+            .wal_producer_connstr
+            .contains(DUMMY_SAFEKEEPER_CONNSTR));
+
+        Ok(())
+    }
+
+    // Same as above, but the connection task sends no event at all.
+    #[tokio::test]
+    async fn timeout_connection_over_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("timeout_connection_over_threshhold_current_candidate")?;
+        let mut state = dummy_state(&harness);
+        let current_lsn = Lsn(100_000).align();
+        let now = Utc::now().naive_utc();
+
+        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let time_over_threshold =
+            Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
+
+        state.wal_connection = Some(WalConnection {
+            sk_id: NodeId(1),
+            latest_connection_update: time_over_threshold,
+            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
+        });
+        state.wal_stream_candidates = HashMap::from([(
+            NodeId(0),
+            EtcdSkTimeline {
+                timeline: SkTimelineInfo {
+                    last_log_term: None,
+                    flush_lsn: None,
+                    commit_lsn: Some(current_lsn),
+                    backup_lsn: None,
+                    remote_consistent_lsn: None,
+                    peer_horizon_lsn: None,
+                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                },
+                etcd_version: 0,
+                latest_update: now,
+            },
+        )]);
+
+        let over_threshcurrent_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
+        match over_threshcurrent_candidate.reason {
+            ReconnectReason::NoWalTimeout {
+                last_wal_interaction,
+                threshold,
+                ..
+            } => {
+                assert_eq!(last_wal_interaction, Some(time_over_threshold));
+                assert_eq!(threshold, state.lagging_wal_timeout);
+            }
+            unexpected => panic!("Unexpected reason: {unexpected:?}"),
+        }
+        assert!(over_threshcurrent_candidate
+            .wal_producer_connstr
+            .contains(DUMMY_SAFEKEEPER_CONNSTR));
+
+        Ok(())
+    }
+
+    const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr";
+
+    // Test fixture: a state over an empty timeline, with 1 second timeouts, an Lsn lag threshold of 1
+    // and no connection, candidates or recorded connection attempts.
+    fn dummy_state(harness: &RepoHarness) -> WalreceiverState {
+        WalreceiverState {
+            id: ZTenantTimelineId {
+                tenant_id: harness.tenant_id,
+                timeline_id: TIMELINE_ID,
+            },
+            local_timeline: Arc::new(DatadirTimelineImpl::new(
+                harness
+                    .load()
+                    .create_empty_timeline(TIMELINE_ID, Lsn(0))
+                    .expect("Failed to create an empty timeline for dummy wal connection manager"),
+                10_000,
+            )),
+            wal_connect_timeout: Duration::from_secs(1),
+            lagging_wal_timeout: Duration::from_secs(1),
+            max_lsn_wal_lag: NonZeroU64::new(1).unwrap(),
+            wal_connection: None,
+            wal_stream_candidates: HashMap::new(),
+            wal_connection_attempts: HashMap::new(),
+        }
+    }
+}
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
new file mode 100644
index 0000000000..98b36dfe48
--- /dev/null
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -0,0 +1,319 @@
+//! Actual Postgres connection handler to stream WAL to the server.
+ +use std::{ + str::FromStr, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::{bail, ensure, Context}; +use bytes::BytesMut; +use fail::fail_point; +use postgres::{SimpleQueryMessage, SimpleQueryRow}; +use postgres_protocol::message::backend::ReplicationMessage; +use postgres_types::PgLsn; +use tokio::{pin, select, sync::watch, time}; +use tokio_postgres::{replication::ReplicationStream, Client}; +use tokio_stream::StreamExt; +use tracing::{debug, error, info, info_span, trace, warn, Instrument}; + +use super::TaskEvent; +use crate::{ + http::models::WalReceiverEntry, + repository::{Repository, Timeline}, + tenant_mgr, + walingest::WalIngest, +}; +use postgres_ffi::waldecoder::WalStreamDecoder; +use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId}; + +/// Opens a conneciton to the given wal producer and streams the WAL, sending progress messages during streaming. +pub async fn handle_walreceiver_connection( + id: ZTenantTimelineId, + wal_producer_connstr: &str, + events_sender: &watch::Sender>, + mut cancellation: watch::Receiver<()>, + connect_timeout: Duration, +) -> anyhow::Result<()> { + // Connect to the database in replication mode. + info!("connecting to {wal_producer_connstr}"); + let connect_cfg = + format!("{wal_producer_connstr} application_name=pageserver replication=true"); + + let (mut replication_client, connection) = time::timeout( + connect_timeout, + tokio_postgres::connect(&connect_cfg, postgres::NoTls), + ) + .await + .context("Timed out while waiting for walreceiver connection to open")? + .context("Failed to open walreceiver conection")?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + let mut connection_cancellation = cancellation.clone(); + tokio::spawn( + async move { + info!("connected!"); + select! 
{ + connection_result = connection => match connection_result{ + Ok(()) => info!("Walreceiver db connection closed"), + Err(connection_error) => { + if connection_error.is_closed() { + info!("Connection closed regularly: {connection_error}") + } else { + warn!("Connection aborted: {connection_error}") + } + } + }, + + _ = connection_cancellation.changed() => info!("Connection cancelled"), + } + } + .instrument(info_span!("safekeeper_handle_db")), + ); + + // Immediately increment the gauge, then create a job to decrement it on task exit. + // One of the pros of `defer!` is that this will *most probably* + // get called, even in presence of panics. + let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); + gauge.inc(); + scopeguard::defer! { + gauge.dec(); + } + + let identify = identify_system(&mut replication_client).await?; + info!("{identify:?}"); + let end_of_wal = Lsn::from(u64::from(identify.xlogpos)); + let mut caught_up = false; + let ZTenantTimelineId { + tenant_id, + timeline_id, + } = id; + + let (repo, timeline) = tokio::task::spawn_blocking(move || { + let repo = tenant_mgr::get_repository_for_tenant(tenant_id) + .with_context(|| format!("no repository found for tenant {tenant_id}"))?; + let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id) + .with_context(|| { + format!("local timeline {timeline_id} not found for tenant {tenant_id}") + })?; + Ok::<_, anyhow::Error>((repo, timeline)) + }) + .await + .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??; + + // + // Start streaming the WAL, from where we left off previously. + // + // If we had previously received WAL up to some point in the middle of a WAL record, we + // better start from the end of last full WAL record, not in the middle of one. 
+ let mut last_rec_lsn = timeline.get_last_record_lsn(); + let mut startpoint = last_rec_lsn; + + if startpoint == Lsn(0) { + bail!("No previous WAL position"); + } + + // There might be some padding after the last full record, skip it. + startpoint += startpoint.calc_padding(8u32); + + info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}..."); + + let query = format!("START_REPLICATION PHYSICAL {startpoint}"); + + let copy_stream = replication_client.copy_both_simple(&query).await?; + let physical_stream = ReplicationStream::new(copy_stream); + pin!(physical_stream); + + let mut waldecoder = WalStreamDecoder::new(startpoint); + + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + + while let Some(replication_message) = { + select! { + _ = cancellation.changed() => { + info!("walreceiver interrupted"); + None + } + replication_message = physical_stream.next() => replication_message, + } + } { + let replication_message = replication_message?; + let status_update = match replication_message { + ReplicationMessage::XLogData(xlog_data) => { + // Pass the WAL data to the decoder, and see if we can decode + // more records as a result. + let data = xlog_data.data(); + let startlsn = Lsn::from(xlog_data.wal_start()); + let endlsn = startlsn + data.len() as u64; + + trace!("received XLogData between {startlsn} and {endlsn}"); + + waldecoder.feed_bytes(data); + + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let _enter = info_span!("processing record", lsn = %lsn).entered(); + + // It is important to deal with the aligned records as lsn in getPage@LSN is + // aligned and can be several bytes bigger. Without this alignment we are + // at risk of hitting a deadlock. 
+ ensure!(lsn.is_aligned()); + + walingest.ingest_record(&timeline, recdata, lsn)?; + + fail_point!("walreceiver-after-ingest"); + + last_rec_lsn = lsn; + } + + if !caught_up && endlsn >= end_of_wal { + info!("caught up at LSN {endlsn}"); + caught_up = true; + } + + let timeline_to_check = Arc::clone(&timeline.tline); + tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance()) + .await + .with_context(|| { + format!("Spawned checkpoint check task panicked for timeline {id}") + })? + .with_context(|| { + format!("Failed to check checkpoint distance for timeline {id}") + })?; + + Some(endlsn) + } + + ReplicationMessage::PrimaryKeepAlive(keepalive) => { + let wal_end = keepalive.wal_end(); + let timestamp = keepalive.timestamp(); + let reply_requested = keepalive.reply() != 0; + + trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})"); + + if reply_requested { + Some(last_rec_lsn) + } else { + None + } + } + + _ => None, + }; + + if let Some(last_lsn) = status_update { + let remote_index = repo.get_remote_index(); + let timeline_remote_consistent_lsn = remote_index + .read() + .await + // here we either do not have this timeline in remote index + // or there were no checkpoints for it yet + .timeline_entry(&ZTenantTimelineId { + tenant_id, + timeline_id, + }) + .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn()) + // no checkpoint was uploaded + .unwrap_or(Lsn(0)); + + // The last LSN we processed. It is not guaranteed to survive pageserver crash. + let write_lsn = u64::from(last_lsn); + // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data + let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); + // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash + // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. 
+ let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let ts = SystemTime::now(); + + // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS` + { + super::WAL_RECEIVER_ENTRIES.write().await.insert( + id, + WalReceiverEntry { + wal_producer_connstr: Some(wal_producer_connstr.to_owned()), + last_received_msg_lsn: Some(last_lsn), + last_received_msg_ts: Some( + ts.duration_since(SystemTime::UNIX_EPOCH) + .expect("Received message time should be before UNIX EPOCH!") + .as_micros(), + ), + }, + ); + } + + // Send zenith feedback message. + // Regular standby_status_update fields are put into this message. + let zenith_status_update = ReplicationFeedback { + current_timeline_size: timeline.get_current_logical_size() as u64, + ps_writelsn: write_lsn, + ps_flushlsn: flush_lsn, + ps_applylsn: apply_lsn, + ps_replytime: ts, + }; + + debug!("zenith_status_update {zenith_status_update:?}"); + + let mut data = BytesMut::new(); + zenith_status_update.serialize(&mut data)?; + physical_stream + .as_mut() + .zenith_status_update(data.len() as u64, &data) + .await?; + if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) { + warn!("Wal connection event listener dropped, aborting the connection: {e}"); + return Ok(()); + } + } + } + + Ok(()) +} + +/// Data returned from the postgres `IDENTIFY_SYSTEM` command +/// +/// See the [postgres docs] for more details. +/// +/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html +#[derive(Debug)] +// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as +// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900 +#[allow(dead_code)] +struct IdentifySystem { + systemid: u64, + timeline: u32, + xlogpos: PgLsn, + dbname: Option, +} + +/// There was a problem parsing the response to +/// a postgres IDENTIFY_SYSTEM command. 
+#[derive(Debug, thiserror::Error)] +#[error("IDENTIFY_SYSTEM parse error")] +struct IdentifyError; + +/// Run the postgres `IDENTIFY_SYSTEM` command +async fn identify_system(client: &mut Client) -> anyhow::Result { + let query_str = "IDENTIFY_SYSTEM"; + let response = client.simple_query(query_str).await?; + + // get(N) from row, then parse it as some destination type. + fn get_parse(row: &SimpleQueryRow, idx: usize) -> Result + where + T: FromStr, + { + let val = row.get(idx).ok_or(IdentifyError)?; + val.parse::().or(Err(IdentifyError)) + } + + // extract the row contents into an IdentifySystem struct. + // written as a closure so I can use ? for Option here. + if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) { + Ok(IdentifySystem { + systemid: get_parse(first_row, 0)?, + timeline: get_parse(first_row, 1)?, + xlogpos: get_parse(first_row, 2)?, + dbname: get_parse(first_row, 3).ok(), + }) + } else { + Err(IdentifyError.into()) + } +} diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e556c24548..db4620417c 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -28,6 +28,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::os::unix::io::AsRawFd; +use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; @@ -122,7 +123,7 @@ lazy_static! { /// /// This is the real implementation that uses a Postgres process to -/// perform WAL replay. Only one thread can use the processs at a time, +/// perform WAL replay. Only one thread can use the process at a time, /// that is controlled by the Mutex. In the future, we might want to /// launch a pool of processes to allow concurrent replay of multiple /// records. 
@@ -134,7 +135,7 @@ pub struct PostgresRedoManager { process: Mutex>, } -/// Can this request be served by zenith redo funcitons +/// Can this request be served by zenith redo functions /// or we need to pass it to wal-redo postgres process? fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { // Currently, we don't have bespoken Rust code to replay any @@ -554,6 +555,40 @@ impl PostgresRedoManager { } } +/// +/// Command with ability not to give all file descriptors to child process +/// +trait CloseFileDescriptors: CommandExt { + /// + /// Close file descriptors (other than stdin, stdout, stderr) in child process + /// + fn close_fds(&mut self) -> &mut Command; +} + +impl CloseFileDescriptors for C { + fn close_fds(&mut self) -> &mut Command { + unsafe { + self.pre_exec(move || { + // SAFETY: Code executed inside pre_exec should have async-signal-safety, + // which means it should be safe to execute inside a signal handler. + // The precise meaning depends on platform. See `man signal-safety` + // for the linux definition. + // + // The set_fds_cloexec_threadsafe function is documented to be + // async-signal-safe. + // + // Aside from this function, the rest of the code is re-entrant and + // doesn't make any syscalls. We're just passing constants. + // + // NOTE: It's easy to indirectly cause a malloc or lock a mutex, + // which is not async-signal-safe. Be careful. 
+ close_fds::set_fds_cloexec_threadsafe(3, &[]); + Ok(()) + }) + } + } +} + /// /// Handle to the Postgres WAL redo process /// @@ -588,6 +623,7 @@ impl PostgresRedoProcess { .env_clear() .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) + .close_fds() .output() .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?; @@ -607,9 +643,10 @@ impl PostgresRedoProcess { .open(PathBuf::from(&datadir).join("postgresql.conf"))?; config.write_all(b"shared_buffers=128kB\n")?; config.write_all(b"fsync=off\n")?; - config.write_all(b"shared_preload_libraries=zenith\n")?; - config.write_all(b"zenith.wal_redo=on\n")?; + config.write_all(b"shared_preload_libraries=neon\n")?; + config.write_all(b"neon.wal_redo=on\n")?; } + // Start postgres itself let mut child = Command::new(conf.pg_bin_dir().join("postgres")) .arg("--wal-redo") @@ -620,6 +657,19 @@ impl PostgresRedoProcess { .env("LD_LIBRARY_PATH", conf.pg_lib_dir()) .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir()) .env("PGDATA", &datadir) + // The redo process is not trusted, so it runs in seccomp mode + // (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't + // inherit any file descriptors from the pageserver that would allow + // an attacker to do bad things. + // + // The Rust standard library makes sure to mark any file descriptors with + // as close-on-exec by default, but that's not enough, since we use + // libraries that directly call libc open without setting that flag. + // + // One example is the pidfile of the daemonize library, which doesn't + // currently mark file descriptors as close-on-exec. Either way, we + // want to be on the safe side and prevent accidental regression. 
+ .close_fds() .spawn() .map_err(|e| { Error::new( diff --git a/poetry.lock b/poetry.lock index 6e552d2cd3..4963390718 100644 --- a/poetry.lock +++ b/poetry.lock @@ -544,20 +544,21 @@ test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pr [[package]] name = "docker" -version = "5.0.3" +version = "4.2.2" description = "A Python library for the Docker Engine API." category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [package.dependencies] -pywin32 = {version = "227", markers = "sys_platform == \"win32\""} +pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} requests = ">=2.14.2,<2.18.0 || >2.18.0" +six = ">=1.4.0" websocket-client = ">=0.32.0" [package.extras] ssh = ["paramiko (>=2.4.2)"] -tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"] +tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"] [[package]] name = "ecdsa" @@ -1003,6 +1004,17 @@ python-versions = ">=3.6" [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypiwin32" +version = "223" +description = "" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pywin32 = ">=223" + [[package]] name = "pyrsistent" version = "0.18.1" @@ -1124,7 +1136,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "227" +version = "301" description = "Python for Window Extensions" category = "main" optional = false @@ -1501,8 +1513,8 @@ cryptography = [ {file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"}, ] docker = [ - {file = "docker-5.0.3-py2.py3-none-any.whl", hash = "sha256:7a79bb439e3df59d0a72621775d600bc8bc8b422d285824cb37103eab91d1ce0"}, - {file = "docker-5.0.3.tar.gz", hash = "sha256:d916a26b62970e7c2f554110ed6af04c7ccff8e9f81ad17d0d40c75637e227fb"}, + {file = 
"docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, + {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, ] ecdsa = [ {file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"}, @@ -1802,6 +1814,10 @@ pyparsing = [ {file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"}, {file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"}, ] +pypiwin32 = [ + {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, + {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, +] pyrsistent = [ {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, @@ -1858,18 +1874,16 @@ pytz = [ {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, ] pywin32 = [ - {file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"}, - {file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"}, - {file = "pywin32-227-cp35-cp35m-win32.whl", hash = "sha256:f4c5be1a293bae0076d93c88f37ee8da68136744588bc5e2be2f299a34ceb7aa"}, - {file = "pywin32-227-cp35-cp35m-win_amd64.whl", hash = "sha256:a929a4af626e530383a579431b70e512e736e9588106715215bf685a3ea508d4"}, - {file = "pywin32-227-cp36-cp36m-win32.whl", hash = 
"sha256:300a2db938e98c3e7e2093e4491439e62287d0d493fe07cce110db070b54c0be"}, - {file = "pywin32-227-cp36-cp36m-win_amd64.whl", hash = "sha256:9b31e009564fb95db160f154e2aa195ed66bcc4c058ed72850d047141b36f3a2"}, - {file = "pywin32-227-cp37-cp37m-win32.whl", hash = "sha256:47a3c7551376a865dd8d095a98deba954a98f326c6fe3c72d8726ca6e6b15507"}, - {file = "pywin32-227-cp37-cp37m-win_amd64.whl", hash = "sha256:31f88a89139cb2adc40f8f0e65ee56a8c585f629974f9e07622ba80199057511"}, - {file = "pywin32-227-cp38-cp38-win32.whl", hash = "sha256:7f18199fbf29ca99dff10e1f09451582ae9e372a892ff03a28528a24d55875bc"}, - {file = "pywin32-227-cp38-cp38-win_amd64.whl", hash = "sha256:7c1ae32c489dc012930787f06244426f8356e129184a02c25aef163917ce158e"}, - {file = "pywin32-227-cp39-cp39-win32.whl", hash = "sha256:c054c52ba46e7eb6b7d7dfae4dbd987a1bb48ee86debe3f245a2884ece46e295"}, - {file = "pywin32-227-cp39-cp39-win_amd64.whl", hash = "sha256:f27cec5e7f588c3d1051651830ecc00294f90728d19c3bf6916e6dba93ea357c"}, + {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, + {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, + {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, + {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, + {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, + {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, + {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, + {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = 
"sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, + {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, + {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, ] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 4e45698e3e..8c6036f87d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -39,6 +39,8 @@ utils = { path = "../libs/utils" } metrics = { path = "../libs/metrics" } workspace_hack = { version = "0.1", path = "../workspace_hack" } +x509-parser = "0.13.2" + [dev-dependencies] rcgen = "0.8.14" rstest = "0.12" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 2463f31645..9bddd58fce 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,56 +1,58 @@ -mod credentials; -mod flow; +//! Client authentication mechanisms. -use crate::auth_backend::{console, legacy_console, link, postgres}; -use crate::config::{AuthBackendType, ProxyConfig}; -use crate::error::UserFacingError; -use crate::stream::PqStream; -use crate::{auth_backend, compute, waiters}; -use console::ConsoleAuthError::SniMissing; +pub mod backend; +pub use backend::DatabaseInfo; + +mod credentials; +pub use credentials::ClientCredentials; + +mod flow; +pub use flow::*; + +use crate::{error::UserFacingError, waiters}; use std::io; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncWrite}; -pub use credentials::ClientCredentials; -pub use flow::*; +/// Convenience wrapper for the authentication error. +pub type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. 
#[error(transparent)] - Console(#[from] auth_backend::AuthError), + Console(#[from] backend::AuthError), #[error(transparent)] - GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError), + GetAuthInfo(#[from] backend::console::ConsoleAuthError), #[error(transparent)] Sasl(#[from] crate::sasl::Error), - /// For passwords that couldn't be processed by [`parse_password`]. + /// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`]. #[error("Malformed password message")] MalformedPassword, - /// Errors produced by [`PqStream`]. + /// Errors produced by [`crate::stream::PqStream`]. #[error(transparent)] Io(#[from] io::Error), } impl AuthErrorImpl { pub fn auth_failed(msg: impl Into) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg)) + Self::Console(backend::AuthError::auth_failed(msg)) } } impl From for AuthErrorImpl { fn from(e: waiters::RegisterError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } impl From for AuthErrorImpl { fn from(e: waiters::WaitError) -> Self { - AuthErrorImpl::Console(auth_backend::AuthError::from(e)) + Self::Console(backend::AuthError::from(e)) } } @@ -63,7 +65,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -72,34 +74,10 @@ impl UserFacingError for AuthError { use AuthErrorImpl::*; match self.0.as_ref() { Console(e) => e.to_string_client(), + GetAuthInfo(e) => e.to_string_client(), + Sasl(e) => e.to_string_client(), MalformedPassword => self.to_string(), - GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(), _ => "Internal error".to_string(), } } } - -async fn handle_user( - config: &ProxyConfig, - client: &mut PqStream, - creds: ClientCredentials, -) -> Result { - match config.auth_backend { - AuthBackendType::LegacyConsole => { - legacy_console::handle_user( - &config.auth_endpoint, - &config.auth_link_uri, - client, - 
&creds, - ) - .await - } - AuthBackendType::Console => { - console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await - } - AuthBackendType::Postgres => { - postgres::handle_user(&config.auth_endpoint, client, &creds).await - } - AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await, - } -} diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs new file mode 100644 index 0000000000..1d41f7f932 --- /dev/null +++ b/proxy/src/auth/backend.rs @@ -0,0 +1,109 @@ +mod legacy_console; +mod link; +mod postgres; + +pub mod console; + +pub use legacy_console::{AuthError, AuthErrorImpl}; + +use super::ClientCredentials; +use crate::{ + compute, + config::{AuthBackendType, ProxyConfig}, + mgmt, + stream::PqStream, + waiters::{self, Waiter, Waiters}, +}; +use lazy_static::lazy_static; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncRead, AsyncWrite}; + +lazy_static! { + static ref CPLANE_WAITERS: Waiters = Default::default(); +} + +/// Give caller an opportunity to wait for the cloud's reply. +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result +where + R: std::future::Future>, + E: From, +{ + let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; + action(waiter).await +} + +pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { + CPLANE_WAITERS.notify(psql_session_id, msg) +} + +/// Compute node connection params provided by the cloud. +/// Note how it implements serde traits, since we receive it over the wire. +#[derive(Serialize, Deserialize, Default)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + pub password: Option, +} + +// Manually implement debug to omit personal and sensitive info. 
+impl std::fmt::Debug for DatabaseInfo { + fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { + fmt.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .finish() + } +} + +impl From for tokio_postgres::Config { + fn from(db_info: DatabaseInfo) -> Self { + let mut config = tokio_postgres::Config::new(); + + config + .host(&db_info.host) + .port(db_info.port) + .dbname(&db_info.dbname) + .user(&db_info.user); + + if let Some(password) = db_info.password { + config.password(password); + } + + config + } +} + +pub(super) async fn handle_user( + config: &ProxyConfig, + client: &mut PqStream, + creds: ClientCredentials, +) -> super::Result { + use AuthBackendType::*; + match config.auth_backend { + LegacyConsole => { + legacy_console::handle_user( + &config.auth_endpoint, + &config.auth_link_uri, + client, + &creds, + ) + .await + } + Console => { + console::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Postgres => { + postgres::Api::new(&config.auth_endpoint, &creds)? + .handle_user(client) + .await + } + Link => link::handle_user(&config.auth_link_uri, client).await, + } +} diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs new file mode 100644 index 0000000000..3085f0b0e4 --- /dev/null +++ b/proxy/src/auth/backend/console.rs @@ -0,0 +1,226 @@ +//! Cloud API V2. + +use crate::{ + auth::{self, AuthFlow, ClientCredentials, DatabaseInfo}, + compute, + error::UserFacingError, + scram, + stream::PqStream, + url::ApiUrl, +}; +use serde::{Deserialize, Serialize}; +use std::{future::Future, io}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +pub type Result = std::result::Result; + +#[derive(Debug, Error)] +pub enum ConsoleAuthError { + #[error(transparent)] + BadProjectName(#[from] auth::credentials::ClientCredsParseError), + + // We shouldn't include the actual secret here. 
+ #[error("Bad authentication secret")] + BadSecret, + + #[error("Console responded with a malformed compute address: '{0}'")] + BadComputeAddress(String), + + #[error("Console responded with a malformed JSON: '{0}'")] + BadResponse(#[from] serde_json::Error), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl UserFacingError for ConsoleAuthError { + fn to_string_client(&self) -> String { + use ConsoleAuthError::*; + match self { + BadProjectName(e) => e.to_string_client(), + _ => "Internal error".to_string(), + } + } +} + +impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError { + fn from(e: &auth::credentials::ClientCredsParseError) -> Self { + ConsoleAuthError::BadProjectName(e.clone()) + } +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetRoleSecretResponse { + role_secret: String, +} + +// TODO: convert into an enum with "error" +#[derive(Serialize, Deserialize, Debug)] +struct GetWakeComputeResponse { + address: String, +} + +/// Auth secret which is managed by the cloud. +pub enum AuthInfo { + /// Md5 hash of user's password. + Md5([u8; 16]), + + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. + pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { endpoint, creds }) + } + + /// Authenticate the existing user or throw an error. 
+ pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + async fn get_auth_info(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_get_role_secret"); + url.query_pairs_mut() + .append_pair("project", self.creds.project_name.as_ref()?) + .append_pair("role", &self.creds.user); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetRoleSecretResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + scram::ServerSecret::parse(response.role_secret.as_str()) + .map(AuthInfo::Scram) + .ok_or(ConsoleAuthError::BadSecret) + } + + /// Wake up the compute node and return the corresponding connection info. + async fn wake_compute(&self) -> Result { + let mut url = self.endpoint.clone(); + url.path_segments_mut().push("proxy_wake_compute"); + let project_name = self.creds.project_name.as_ref()?; + url.query_pairs_mut().append_pair("project", project_name); + + // TODO: use a proper logger + println!("cplane request: {url}"); + + let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?; + if !resp.status().is_success() { + return Err(ConsoleAuthError::HttpStatus(resp.status())); + } + + let response: GetWakeComputeResponse = + serde_json::from_str(&resp.text().await.map_err(io_error)?)?; + + let (host, port) = parse_host_port(&response.address) + .ok_or(ConsoleAuthError::BadComputeAddress(response.address))?; + + Ok(DatabaseInfo { + host, + port, + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} + +/// Common logic for user handling in API V2. +/// We reuse this for a mock API implementation in [`super::postgres`]. 
+pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( + client: &mut PqStream, + endpoint: &'a Endpoint, + get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, + wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, +) -> auth::Result +where + GetAuthInfo: Future>, + WakeCompute: Future>, +{ + let auth_info = get_auth_info(endpoint).await?; + + let flow = AuthFlow::new(client); + let scram_keys = match auth_info { + AuthInfo::Md5(_) => { + // TODO: decide if we should support MD5 in api v2 + return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); + } + AuthInfo::Scram(secret) => { + let scram = auth::Scram(&secret); + Some(compute::ScramKeys { + client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + server_key: secret.server_key.as_bytes(), + }) + } + }; + + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&BeParameterStatusMessage::encoding())?; + + Ok(compute::NodeInfo { + db_info: wake_compute(endpoint).await?, + scram_keys, + }) +} + +/// Upcast (almost) any error into an opaque [`io::Error`]. 
+pub(super) fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +fn parse_host_port(input: &str) -> Option<(String, u16)> { + let (host, port) = input.split_once(':')?; + Some((host.to_owned(), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + }))?; + + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + }))?; + + Ok(()) + } +} diff --git a/proxy/src/auth_backend/legacy_console.rs b/proxy/src/auth/backend/legacy_console.rs similarity index 90% rename from proxy/src/auth_backend/legacy_console.rs rename to proxy/src/auth/backend/legacy_console.rs index 29997d2389..467da63a98 100644 --- a/proxy/src/auth_backend/legacy_console.rs +++ b/proxy/src/auth/backend/legacy_console.rs @@ -1,20 +1,18 @@ //! Cloud API V1. -use super::console::DatabaseInfo; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use crate::{compute, waiters}; +use super::DatabaseInfo; +use crate::{ + auth::{self, ClientCredentials}, + compute, + error::UserFacingError, + stream::PqStream, + waiters, +}; use serde::{Deserialize, Serialize}; - +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; -use thiserror::Error; - -use crate::error::UserFacingError; - #[derive(Debug, Error)] pub enum AuthErrorImpl { /// Authentication error reported by the console. @@ -45,7 +43,7 @@ pub struct AuthError(Box); impl AuthError { /// Smart constructor for authentication error reported by `mgmt`. 
pub fn auth_failed(msg: impl Into) -> Self { - AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + Self(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) } } @@ -54,7 +52,7 @@ where AuthErrorImpl: From, { fn from(e: T) -> Self { - AuthError(Box::new(e.into())) + Self(Box::new(e.into())) } } @@ -120,7 +118,7 @@ async fn handle_existing_user( auth_endpoint: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> Result { let psql_session_id = super::link::new_psql_session_id(); let md5_salt = rand::random(); @@ -130,7 +128,7 @@ async fn handle_existing_user( // Read client's password hash let msg = client.read_password_message().await?; - let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?; + let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?; let db_info = authenticate_proxy_client( auth_endpoint, @@ -156,11 +154,11 @@ pub async fn handle_user( auth_link_uri: &reqwest::Url, client: &mut PqStream, creds: &ClientCredentials, -) -> Result { +) -> auth::Result { if creds.is_existing_user() { handle_existing_user(auth_endpoint, client, creds).await } else { - super::link::handle_user(auth_link_uri.as_ref(), client).await + super::link::handle_user(auth_link_uri, client).await } } diff --git a/proxy/src/auth_backend/link.rs b/proxy/src/auth/backend/link.rs similarity index 61% rename from proxy/src/auth_backend/link.rs rename to proxy/src/auth/backend/link.rs index 9bdb9e21c4..669c9e00e9 100644 --- a/proxy/src/auth_backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,16 +1,13 @@ -use crate::{compute, stream::PqStream}; +use crate::{auth, compute, stream::PqStream}; use tokio::io::{AsyncRead, AsyncWrite}; use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ - "☀️ Welcome to Neon!\n", - "To proceed with database creation, open the following link:\n\n", + 
"Welcome to Neon!\n", + "Authenticate by visiting:\n", " {redirect_uri}{session_id}\n\n", - "It needs to be done once and we will send you '.pgpass' file,\n", - "which will allow you to access or create ", - "databases without opening your web browser." ], redirect_uri = redirect_uri, session_id = session_id, @@ -22,13 +19,13 @@ pub fn new_psql_session_id() -> String { } pub async fn handle_user( - redirect_uri: &str, + redirect_uri: &reqwest::Url, client: &mut PqStream, -) -> Result { +) -> auth::Result { let psql_session_id = new_psql_session_id(); - let greeting = hello_message(redirect_uri, &psql_session_id); + let greeting = hello_message(redirect_uri.as_str(), &psql_session_id); - let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async { + let db_info = super::with_waiter(psql_session_id, |waiter| async { // Give user a URL to spawn a new database client .write_message_noflush(&Be::AuthenticationOk)? @@ -37,9 +34,7 @@ pub async fn handle_user( .await?; // Wait for web console response (see `mgmt`) - waiter - .await? - .map_err(crate::auth::AuthErrorImpl::auth_failed) + waiter.await?.map_err(auth::AuthErrorImpl::auth_failed) }) .await?; diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs new file mode 100644 index 0000000000..721b9db095 --- /dev/null +++ b/proxy/src/auth/backend/postgres.rs @@ -0,0 +1,88 @@ +//! Local mock of Cloud API V2. + +use crate::{ + auth::{ + self, + backend::console::{self, io_error, AuthInfo, Result}, + ClientCredentials, DatabaseInfo, + }, + compute, scram, + stream::PqStream, + url::ApiUrl, +}; +use tokio::io::{AsyncRead, AsyncWrite}; + +#[must_use] +pub(super) struct Api<'a> { + endpoint: &'a ApiUrl, + creds: &'a ClientCredentials, +} + +impl<'a> Api<'a> { + /// Construct an API object containing the auth parameters. 
+ pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result { + Ok(Self { endpoint, creds }) + } + + /// Authenticate the existing user or throw an error. + pub(super) async fn handle_user( + self, + client: &mut PqStream, + ) -> auth::Result { + // We reuse user handling logic from a production module. + console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + } + + /// This implementation fetches the auth info from a local postgres instance. + async fn get_auth_info(&self) -> Result { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls) + .await + .map_err(io_error)?; + + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client + .query(query, &[&self.creds.user]) + .await + .map_err(io_error)?; + + match &rows[..] { + // We can't get a secret if there's no such user. + [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + + // We shouldn't get more than one row anyway. + [row, ..] => { + let entry = row.try_get(0).map_err(io_error)?; + scram::ServerSecret::parse(entry) + .map(AuthInfo::Scram) + .or_else(|| { + // It could be an md5 hash if it's not a SCRAM secret. + let text = entry.strip_prefix("md5")?; + Some(AuthInfo::Md5({ + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + bytes + })) + }) + // Putting the secret into this message is a security hazard! + .ok_or(console::ConsoleAuthError::BadSecret) + } + } + } + + /// We don't need to wake anything locally, so we just return the connection info. 
+ async fn wake_compute(&self) -> Result { + Ok(DatabaseInfo { + // TODO: handle that near CLI params parsing + host: self.endpoint.host_str().unwrap_or("localhost").to_owned(), + port: self.endpoint.port().unwrap_or(5432), + dbname: self.creds.dbname.to_owned(), + user: self.creds.user.to_owned(), + password: None, + }) + } +} diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 9d2272b5ad..b5312fbe1f 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,6 +1,5 @@ //! User credentials used in authentication. -use super::AuthError; use crate::compute; use crate::config::ProxyConfig; use crate::error::UserFacingError; @@ -9,10 +8,32 @@ use std::collections::HashMap; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -#[derive(Debug, Error)] +#[derive(Debug, Error, PartialEq, Eq, Clone)] pub enum ClientCredsParseError { - #[error("Parameter `{0}` is missing in startup packet")] + #[error("Parameter `{0}` is missing in startup packet.")] MissingKey(&'static str), + + #[error( + "Project name is not specified. \ + EITHER please upgrade the postgres client library (libpq) for SNI support \ + OR pass the project name as a parameter: '&options=project%3D'." + )] + MissingSNIAndProjectName, + + #[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")] + InconsistentProjectNameAndSNI(String, String), + + #[error("Common name is not set.")] + CommonNameNotSet, + + #[error( + "SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \ + SNI should be formatted as '.'." 
+ )] + InconsistentCommonNameAndSNI(String, String), + + #[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")] + ProjectNameContainsIllegalChars(String), } impl UserFacingError for ClientCredsParseError {} @@ -23,10 +44,7 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials { pub user: String, pub dbname: String, - - // New console API requires SNI info to determine the cluster name. - // Other Auth backends don't need it. - pub sni_data: Option, + pub project_name: Result, } impl ClientCredentials { @@ -34,37 +52,278 @@ impl ClientCredentials { // This logic will likely change in the future. self.user.ends_with("@zenith") } -} -impl TryFrom> for ClientCredentials { - type Error = ClientCredsParseError; - - fn try_from(mut value: HashMap) -> Result { + pub fn parse( + mut options: HashMap, + sni_data: Option<&str>, + common_name: Option<&str>, + ) -> Result { let mut get_param = |key| { - value + options .remove(key) .ok_or(ClientCredsParseError::MissingKey(key)) }; let user = get_param("user")?; - let db = get_param("database")?; + let dbname = get_param("database")?; + let project_name = get_param("project").ok(); + let project_name = get_project_name(sni_data, common_name, project_name.as_deref()); Ok(Self { user, - dbname: db, - sni_data: None, + dbname, + project_name, }) } -} -impl ClientCredentials { /// Use credentials to authenticate the user. pub async fn authenticate( self, config: &ProxyConfig, client: &mut PqStream, - ) -> Result { + ) -> super::Result { // This method is just a convenient facade for `handle_user` - super::handle_user(config, client, self).await + super::backend::handle_user(config, client, self).await + } +} + +/// Inferring project name from sni_data. 
+fn project_name_from_sni_data( + sni_data: &str, + common_name: &str, +) -> Result { + let common_name_with_dot = format!(".{common_name}"); + // check that ".{common_name_with_dot}" is the actual suffix in sni_data + if !sni_data.ends_with(&common_name_with_dot) { + return Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data.to_string(), + )); + } + // return sni_data without the common name suffix. + Ok(sni_data + .strip_suffix(&common_name_with_dot) + .unwrap() + .to_string()) +} + +#[cfg(test)] +mod tests_for_project_name_from_sni_data { + use super::*; + + #[test] + fn passing() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + project_name_from_sni_data(&sni_data, common_name), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_inconsistent_common_name_and_sni_data() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let wrong_suffix = "wrongtest.me"; + assert_eq!(common_name.len(), wrong_suffix.len()); + let wrong_common_name = format!("wrong{wrong_suffix}"); + let sni_data = format!("{target_project_name}.{wrong_common_name}"); + assert_eq!( + project_name_from_sni_data(&sni_data, common_name), + Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data + )) + ); + } +} + +/// Determine project name from SNI or from project_name parameter from options argument. +fn get_project_name( + sni_data: Option<&str>, + common_name: Option<&str>, + project_name: Option<&str>, +) -> Result { + // determine the project name from sni_data if it exists, otherwise from project_name. 
+ let ret = match sni_data { + Some(sni_data) => { + let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?; + let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?; + // check invariant: project name from options and from sni should match + if let Some(project_name) = &project_name { + if !project_name_from_sni.eq(project_name) { + return Err(ClientCredsParseError::InconsistentProjectNameAndSNI( + project_name_from_sni, + project_name.to_string(), + )); + } + } + project_name_from_sni + } + None => project_name + .ok_or(ClientCredsParseError::MissingSNIAndProjectName)? + .to_string(), + }; + + // check formatting invariant: project name must contain only alphanumeric characters and hyphens. + if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') { + return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret)); + } + + Ok(ret) +} + +#[cfg(test)] +mod tests_for_project_name_only { + use super::*; + + #[test] + fn passing_from_sni_data_only() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), None), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_project_name_contains_illegal_chars_from_sni_data_only() { + let project_name_prefix = "my-project"; + let project_name_suffix = "123"; + let common_name = "localtest.me"; + + for illegal_char_id in 0..256 { + let illegal_char = char::from_u32(illegal_char_id).unwrap(); + if !(illegal_char.is_alphanumeric() || illegal_char == '-') + && illegal_char.to_string().len() == 1 + { + let target_project_name = + format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), None), + Err(ClientCredsParseError::ProjectNameContainsIllegalChars( 
+ target_project_name + )) + ); + } + } + } + + #[test] + fn passing_from_project_name_only() { + let target_project_name = "my-project-123"; + let common_names = [Some("localtest.me"), None]; + for common_name in common_names { + assert_eq!( + get_project_name(None, common_name, Some(target_project_name)), + Ok(target_project_name.to_string()) + ); + } + } + + #[test] + fn throws_project_name_contains_illegal_chars_from_project_name_only() { + let project_name_prefix = "my-project"; + let project_name_suffix = "123"; + let common_names = [Some("localtest.me"), None]; + + for common_name in common_names { + for illegal_char_id in 0..256 { + let illegal_char: char = char::from_u32(illegal_char_id).unwrap(); + if !(illegal_char.is_alphanumeric() || illegal_char == '-') + && illegal_char.to_string().len() == 1 + { + let target_project_name = + format!("{project_name_prefix}{illegal_char}{project_name_suffix}"); + assert_eq!( + get_project_name(None, common_name, Some(&target_project_name)), + Err(ClientCredsParseError::ProjectNameContainsIllegalChars( + target_project_name + )) + ); + } + } + } + } + + #[test] + fn passing_from_sni_data_and_project_name() { + let target_project_name = "my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{target_project_name}.{common_name}"); + assert_eq!( + get_project_name( + Some(&sni_data), + Some(common_name), + Some(target_project_name) + ), + Ok(target_project_name.to_string()) + ); + } + + #[test] + fn throws_inconsistent_project_name_and_sni() { + let project_name_param = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let sni_data = format!("{wrong_project_name}.{common_name}"); + assert_eq!( + get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)), + Err(ClientCredsParseError::InconsistentProjectNameAndSNI( + wrong_project_name.to_string(), + project_name_param.to_string() + )) + ); + } + + #[test] + fn 
throws_common_name_not_set() { + let target_project_name = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let sni_datas = [ + Some(format!("{wrong_project_name}.{common_name}")), + Some(format!("{target_project_name}.{common_name}")), + ]; + let project_names = [None, Some(target_project_name)]; + for sni_data in sni_datas { + for project_name_param in project_names { + assert_eq!( + get_project_name(sni_data.as_deref(), None, project_name_param), + Err(ClientCredsParseError::CommonNameNotSet) + ); + } + } + } + + #[test] + fn throws_inconsistent_common_name_and_sni_data() { + let target_project_name = "my-project-123"; + let wrong_project_name = "not-my-project-123"; + let common_name = "localtest.me"; + let wrong_suffix = "wrongtest.me"; + assert_eq!(common_name.len(), wrong_suffix.len()); + let wrong_common_name = format!("wrong{wrong_suffix}"); + let sni_datas = [ + Some(format!("{wrong_project_name}.{wrong_common_name}")), + Some(format!("{target_project_name}.{wrong_common_name}")), + ]; + let project_names = [None, Some(target_project_name)]; + for project_name_param in project_names { + for sni_data in &sni_datas { + assert_eq!( + get_project_name(sni_data.as_deref(), Some(common_name), project_name_param), + Err(ClientCredsParseError::InconsistentCommonNameAndSNI( + common_name.to_string(), + sni_data.clone().unwrap().to_string() + )) + ); + } + } } } diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3eed0f0a23..7efff13bfc 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,6 +1,6 @@ //! Main authentication flow. -use super::{AuthError, AuthErrorImpl}; +use super::AuthErrorImpl; use crate::stream::PqStream; use crate::{sasl, scram}; use std::io; @@ -32,7 +32,7 @@ impl AuthMethod for Scram<'_> { pub struct AuthFlow<'a, Stream, State> { /// The underlying stream which implements libpq's protocol. 
stream: &'a mut PqStream, - /// State might contain ancillary data (see [`AuthFlow::begin`]). + /// State might contain ancillary data (see [`Self::begin`]). state: State, } @@ -60,7 +60,7 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> Result { + pub async fn authenticate(self) -> super::Result { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; diff --git a/proxy/src/auth_backend.rs b/proxy/src/auth_backend.rs deleted file mode 100644 index 54362bf719..0000000000 --- a/proxy/src/auth_backend.rs +++ /dev/null @@ -1,31 +0,0 @@ -pub mod console; -pub mod legacy_console; -pub mod link; -pub mod postgres; - -pub use legacy_console::{AuthError, AuthErrorImpl}; - -use crate::mgmt; -use crate::waiters::{self, Waiter, Waiters}; -use lazy_static::lazy_static; - -lazy_static! { - static ref CPLANE_WAITERS: Waiters = Default::default(); -} - -/// Give caller an opportunity to wait for the cloud's reply. -pub async fn with_waiter( - psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await -} - -pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> { - CPLANE_WAITERS.notify(psql_session_id, msg) -} diff --git a/proxy/src/auth_backend/console.rs b/proxy/src/auth_backend/console.rs deleted file mode 100644 index 41a822701f..0000000000 --- a/proxy/src/auth_backend/console.rs +++ /dev/null @@ -1,243 +0,0 @@ -//! Declaration of Cloud API V2. 
- -use crate::{ - auth::{self, AuthFlow}, - compute, scram, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -use crate::auth::ClientCredentials; -use crate::stream::PqStream; - -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -#[derive(Debug, Error)] -pub enum ConsoleAuthError { - // We shouldn't include the actual secret here. - #[error("Bad authentication secret")] - BadSecret, - - #[error("Bad client credentials: {0:?}")] - BadCredentials(crate::auth::ClientCredentials), - - #[error("SNI info is missing, please upgrade the postgres client library")] - SniMissing, - - #[error("Unexpected SNI content")] - SniWrong, - - #[error(transparent)] - BadUrl(#[from] url::ParseError), - - #[error(transparent)] - Io(#[from] std::io::Error), - - /// HTTP status (other than 200) returned by the console. - #[error("Console responded with an HTTP status: {0}")] - HttpStatus(reqwest::StatusCode), - - #[error(transparent)] - Transport(#[from] reqwest::Error), - - #[error("Console responded with a malformed JSON: '{0}'")] - MalformedResponse(#[from] serde_json::Error), - - #[error("Console responded with a malformed compute address: '{0}'")] - MalformedComputeAddress(String), -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetRoleSecretResponse { - role_secret: String, -} - -#[derive(Serialize, Deserialize, Debug)] -struct GetWakeComputeResponse { - address: String, -} - -/// Auth secret which is managed by the cloud. -pub enum AuthInfo { - /// Md5 hash of user's password. - Md5([u8; 16]), - /// [SCRAM](crate::scram) authentication info. - Scram(scram::ServerSecret), -} - -/// Compute node connection params provided by the cloud. -/// Note how it implements serde traits, since we receive it over the wire. 
-#[derive(Serialize, Deserialize, Default)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - - /// [Cloud API V1](super::legacy) returns cleartext password, - /// but [Cloud API V2](super::api) implements [SCRAM](crate::scram) - /// authentication, so we can leverage this method and cope without password. - pub password: Option, -} - -// Manually implement debug to omit personal and sensitive info. -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .finish() - } -} - -impl From for tokio_postgres::Config { - fn from(db_info: DatabaseInfo) -> Self { - let mut config = tokio_postgres::Config::new(); - - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); - - if let Some(password) = db_info.password { - config.password(password); - } - - config - } -} - -async fn get_auth_info( - auth_endpoint: &str, - user: &str, - cluster: &str, -) -> Result { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?; - - url.query_pairs_mut() - .append_pair("project", cluster) - .append_pair("role", user); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?; - - scram::ServerSecret::parse(response.role_secret.as_str()) - .map(AuthInfo::Scram) - .ok_or(ConsoleAuthError::BadSecret) -} - -/// Wake up the compute node and return the corresponding connection info. 
-async fn wake_compute( - auth_endpoint: &str, - cluster: &str, -) -> Result<(String, u16), ConsoleAuthError> { - let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?; - url.query_pairs_mut().append_pair("project", cluster); - - // TODO: use a proper logger - println!("cplane request: {}", url); - - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - return Err(ConsoleAuthError::HttpStatus(resp.status())); - } - - let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?; - let (host, port) = response - .address - .split_once(':') - .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - let port: u16 = port - .parse() - .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?; - - Ok((host.to_string(), port)) -} - -pub async fn handle_user( - auth_endpoint: &str, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - // Determine cluster name from SNI. - let cluster = creds - .sni_data - .as_ref() - .ok_or(ConsoleAuthError::SniMissing)? - .split_once('.') - .ok_or(ConsoleAuthError::SniWrong)? - .0; - - let user = creds.user.as_str(); - - // Step 1: get the auth secret - let auth_info = get_auth_info(auth_endpoint, user, cluster).await?; - - let flow = AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - // Step 2: wake compute - let (host, port) = wake_compute(auth_endpoint, cluster).await?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - host, - port, - dbname: creds.dbname.clone(), - user: creds.user.clone(), - password: None, - }, - scram_keys, - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - }))?; - - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/auth_backend/postgres.rs b/proxy/src/auth_backend/postgres.rs deleted file mode 100644 index 148c2a2518..0000000000 --- a/proxy/src/auth_backend/postgres.rs +++ /dev/null @@ -1,93 +0,0 @@ -//! Local mock of Cloud API V2. - -use super::console::{self, AuthInfo, DatabaseInfo}; -use crate::scram; -use crate::{auth::ClientCredentials, compute}; - -use crate::stream::PqStream; -use tokio::io::{AsyncRead, AsyncWrite}; -use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; - -async fn get_auth_info( - auth_endpoint: &str, - creds: &ClientCredentials, -) -> Result { - // We wrap `tokio_postgres::Error` because we don't want to infect the - // method's error type with a detail that's specific to debug mode only. - let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e); - - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. 
- let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls) - .await - .map_err(io_error)?; - - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client - .query(query, &[&creds.user]) - .await - .map_err(io_error)?; - - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())), - // We shouldn't get more than one row anyway. - [row, ..] => { - let entry = row.try_get(0).map_err(io_error)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(console::ConsoleAuthError::BadSecret) - } - } -} - -pub async fn handle_user( - auth_endpoint: &reqwest::Url, - client: &mut PqStream, - creds: &ClientCredentials, -) -> Result { - let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?; - - let flow = crate::auth::AuthFlow::new(client); - let scram_keys = match auth_info { - AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 - return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into()); - } - AuthInfo::Scram(secret) => { - let scram = crate::auth::Scram(&secret); - Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), - server_key: secret.server_key.as_bytes(), - }) - } - }; - - client - .write_message_noflush(&Be::AuthenticationOk)? 
- .write_message_noflush(&BeParameterStatusMessage::encoding())?; - - Ok(compute::NodeInfo { - db_info: DatabaseInfo { - // TODO: handle that near CLI params parsing - host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(), - port: auth_endpoint.port().unwrap_or(5432), - dbname: creds.dbname.to_owned(), - user: creds.user.to_owned(), - password: None, - }, - scram_keys, - }) -} diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index c3c5ba47fb..cccd6e60d4 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,4 +1,4 @@ -use crate::auth_backend::console::DatabaseInfo; +use crate::auth::DatabaseInfo; use crate::cancellation::CancelClosure; use crate::error::UserFacingError; use std::io; @@ -37,7 +37,7 @@ pub struct NodeInfo { impl NodeInfo { async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { - let host_port = format!("{}:{}", self.db_info.host, self.db_info.port); + let host_port = (self.db_info.host.as_str(), self.db_info.port); let socket = TcpStream::connect(host_port).await?; let socket_addr = socket.peer_addr()?; socket2::SockRef::from(&socket).set_keepalive(true)?; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 077a07beb9..df3923de1a 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,58 +1,70 @@ -use anyhow::{ensure, Context}; +use crate::url::ApiUrl; +use anyhow::{bail, ensure, Context}; use std::{str::FromStr, sync::Arc}; -#[non_exhaustive] +#[derive(Debug)] pub enum AuthBackendType { + /// Legacy Cloud API (V1). LegacyConsole, - Console, - Postgres, + /// Authentication via a web browser. Link, + /// Current Cloud API (V2). + Console, + /// Local mock of Cloud API (V2). 
+ Postgres, } impl FromStr for AuthBackendType { type Err = anyhow::Error; fn from_str(s: &str) -> anyhow::Result { - println!("ClientAuthMethod::from_str: '{}'", s); use AuthBackendType::*; - match s { - "legacy" => Ok(LegacyConsole), - "console" => Ok(Console), - "postgres" => Ok(Postgres), - "link" => Ok(Link), - _ => Err(anyhow::anyhow!("Invlid option for auth method")), - } + Ok(match s { + "legacy" => LegacyConsole, + "console" => Console, + "postgres" => Postgres, + "link" => Link, + _ => bail!("Invalid option `{s}` for auth method"), + }) } } pub struct ProxyConfig { - /// TLS configuration for the proxy. pub tls_config: Option, - pub auth_backend: AuthBackendType, - - pub auth_endpoint: reqwest::Url, - - pub auth_link_uri: reqwest::Url, + pub auth_endpoint: ApiUrl, + pub auth_link_uri: ApiUrl, } -pub type TlsConfig = Arc; +pub struct TlsConfig { + pub config: Arc, + pub common_name: Option, +} + +impl TlsConfig { + pub fn to_server_config(&self) -> Arc { + self.config.clone() + } +} /// Configure TLS for the main endpoint. pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context("couldn't read TLS keys")?; + .context(format!("Failed to read TLS keys at '{key_path}'"))?; ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); keys.pop().map(rustls::PrivateKey).unwrap() }; + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; let cert_chain = { - let cert_chain_bytes = std::fs::read(cert_path).context("TLS cert file")?; rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context("couldn't read TLS certificate chain")? + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? 
.into_iter() .map(rustls::Certificate) .collect() @@ -61,9 +73,28 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result anyhow::Result<()> { })); println!("Version: {GIT_VERSION}"); + println!("Authentication backend: {:?}", config.auth_backend); // Check that we can bind to address before further initialization println!("Starting http on {}", http_address); diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 93618fff68..8737d170b1 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,4 +1,4 @@ -use crate::auth_backend; +use crate::auth; use anyhow::Context; use serde::Deserialize; use std::{ @@ -77,12 +77,12 @@ struct PsqlSessionResponse { #[derive(Deserialize)] enum PsqlSessionResult { - Success(auth_backend::console::DatabaseInfo), + Success(auth::DatabaseInfo), Failure(String), } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = Result; +pub type ComputeReady = Result; impl PsqlSessionResult { fn into_compute_ready(self) -> ComputeReady { @@ -113,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 642e50c2c1..7e364b5e9c 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -81,7 +81,7 @@ async fn handle_client( NUM_CONNECTIONS_CLOSED_COUNTER.inc(); } - let tls = config.tls_config.clone(); + let tls = config.tls_config.as_ref(); let (stream, creds) = match handshake(stream, tls, cancel_map).await? 
{ Some(x) => x, None => return Ok(()), // it's a cancellation request @@ -95,16 +95,18 @@ async fn handle_client( /// Establish a (most probably, secure) connection with the client. /// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; /// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, - mut tls: Option, + mut tls: Option<&TlsConfig>, cancel_map: &CancelMap, ) -> anyhow::Result>, auth::ClientCredentials)>> { // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + let common_name = tls.and_then(|cfg| cfg.common_name.as_deref()); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -122,7 +124,9 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. 
- stream = PqStream::new(stream.into_inner().upgrade(tls).await?); + stream = PqStream::new( + stream.into_inner().upgrade(tls.to_server_config()).await?, + ); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -143,15 +147,16 @@ async fn handshake( stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - // Here and forth: `or_else` demands that we use a future here - let mut creds: auth::ClientCredentials = async { params.try_into() } - .or_else(|e| stream.throw_error(e)) - .await?; + // Get SNI info when available + let sni_data = match stream.get_ref() { + Stream::Tls { tls } => tls.get_ref().1.sni_hostname().map(|s| s.to_owned()), + _ => None, + }; - // Set SNI info when available - if let Stream::Tls { tls } = stream.get_ref() { - creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned()); - } + // Construct credentials + let creds = + auth::ClientCredentials::parse(params, sni_data.as_deref(), common_name); + let creds = async { creds }.or_else(|e| stream.throw_error(e)).await?; break Ok(Some((stream, creds))); } @@ -264,12 +269,13 @@ mod tests { } /// Generate TLS certificates and build rustls configs for client and server. 
- fn generate_tls_config( - hostname: &str, - ) -> anyhow::Result<(ClientConfig<'_>, Arc)> { + fn generate_tls_config<'a>( + hostname: &'a str, + common_name: &'a str, + ) -> anyhow::Result<(ClientConfig<'a>, TlsConfig)> { let (ca, cert, key) = generate_certs(hostname)?; - let server_config = { + let tls_config = { let config = rustls::ServerConfig::builder() .with_safe_defaults() .with_no_client_auth() @@ -291,7 +297,12 @@ mod tests { ClientConfig { config, hostname } }; - Ok((client_config, server_config)) + let tls_config = TlsConfig { + config: tls_config, + common_name: Some(common_name.to_string()), + }; + + Ok((client_config, tls_config)) } #[async_trait] @@ -346,7 +357,7 @@ mod tests { auth: impl TestAuth + Send, ) -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let (mut stream, _creds) = handshake(client, tls, &cancel_map) + let (mut stream, _creds) = handshake(client, tls.as_ref(), &cancel_map) .await? .context("handshake failed")?; @@ -365,7 +376,8 @@ mod tests { async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (_, server_config) = generate_tls_config("localhost")?; + let (_, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let client_err = tokio_postgres::Config::new() @@ -393,7 +405,8 @@ mod tests { async fn handshake_tls() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); let (_client, _conn) = tokio_postgres::Config::new() @@ -415,6 +428,7 @@ mod tests { let (_client, _conn) = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") + 
.options("project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; @@ -476,7 +490,8 @@ mod tests { async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), @@ -498,7 +513,8 @@ mod tests { async fn scram_auth_mock() -> anyhow::Result<()> { let (client, server) = tokio::io::duplex(1024); - let (client_config, server_config) = generate_tls_config("localhost")?; + let (client_config, server_config) = + generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index cd9032bfb9..689fca6049 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,6 +10,7 @@ mod channel_binding; mod messages; mod stream; +use crate::error::UserFacingError; use std::io; use thiserror::Error; @@ -36,6 +37,20 @@ pub enum Error { Io(#[from] io::Error), } +impl UserFacingError for Error { + fn to_string_client(&self) -> String { + use Error::*; + match self { + // This constructor contains the reason why auth has failed. + AuthenticationFailed(s) => s.to_string(), + // TODO: add support for channel binding + ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), + ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), + _ => "authentication protocol violation".to_string(), + } + } +} + /// A convenient result type for SASL exchange. 
pub type Result = std::result::Result; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index cad77e15f5..fca5585b25 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -106,7 +106,9 @@ impl sasl::Mechanism for Exchange<'_> { } if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::AuthenticationFailed("bad nonce")); + return Err(SaslError::AuthenticationFailed( + "combined nonce doesn't match", + )); } let signature_builder = SignatureBuilder { @@ -120,7 +122,7 @@ impl sasl::Mechanism for Exchange<'_> { .derive_client_key(&client_final_message.proof); if client_key.sha256() != self.secret.stored_key { - return Err(SaslError::AuthenticationFailed("keys don't match")); + return Err(SaslError::AuthenticationFailed("password doesn't match")); } let msg = client_final_message diff --git a/proxy/src/url.rs b/proxy/src/url.rs new file mode 100644 index 0000000000..76d6ad0e66 --- /dev/null +++ b/proxy/src/url.rs @@ -0,0 +1,82 @@ +use anyhow::bail; +use url::form_urlencoded::Serializer; + +/// A [url](url::Url) type with additional guarantees. +#[derive(Debug, Clone)] +pub struct ApiUrl(url::Url); + +impl ApiUrl { + /// Consume the wrapper and return inner [url](url::Url). + pub fn into_inner(self) -> url::Url { + self.0 + } + + /// See [`url::Url::query_pairs_mut`]. + pub fn query_pairs_mut(&mut self) -> Serializer<'_, url::UrlQuery<'_>> { + self.0.query_pairs_mut() + } + + /// See [`url::Url::path_segments_mut`]. + pub fn path_segments_mut(&mut self) -> url::PathSegmentsMut { + // We've already verified that it works during construction. + self.0.path_segments_mut().expect("bad API url") + } +} + +/// This instance imposes additional requirements on the url. +impl std::str::FromStr for ApiUrl { + type Err = anyhow::Error; + + fn from_str(s: &str) -> anyhow::Result { + let mut url: url::Url = s.parse()?; + + // Make sure that we can build upon this URL. 
+ if url.path_segments_mut().is_err() { + bail!("bad API url provided"); + } + + Ok(Self(url)) + } +} + +/// This instance is safe because it doesn't allow us to modify the object. +impl std::ops::Deref for ApiUrl { + type Target = url::Url; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for ApiUrl { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bad_url() { + let url = "test:foobar"; + url.parse::().expect("unexpected parsing failure"); + let _ = url.parse::().expect_err("should not parse"); + } + + #[test] + fn good_url() { + let url = "test://foobar"; + let mut a = url.parse::().expect("unexpected parsing failure"); + let mut b = url.parse::().expect("unexpected parsing failure"); + + a.path_segments_mut().unwrap().push("method"); + a.query_pairs_mut().append_pair("key", "value"); + + b.path_segments_mut().push("method"); + b.query_pairs_mut().append_pair("key", "value"); + + assert_eq!(a, b.into_inner()); + } +} diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 799d45a165..bba5494cfe 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -115,7 +115,7 @@ mod tests { Ok(()) }); - let () = waiter.await?; + waiter.await?; notifier.await? 
} } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 417cf58cd5..373108c61b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,10 @@ const_format = "0.2.21" tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } tokio-util = { version = "0.7", features = ["io"] } git-version = "0.3.5" +async-trait = "0.1" +once_cell = "1.10.0" +futures = "0.3.13" +toml_edit = { version = "0.13", features = ["easy"] } postgres_ffi = { path = "../libs/postgres_ffi" } metrics = { path = "../libs/metrics" } diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla index 993edfcf23..e5f0bb270f 100644 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ b/safekeeper/spec/ProposerAcceptorConsensus.tla @@ -88,7 +88,7 @@ TypeOk == \* in campaign proposer sends RequestVote and waits for acks; \* in leader he is elected /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbouned model, but TLC won't + \* 0..max_term should be actually Nat in the unbounded model, but TLC won't \* swallow it /\ prop_state[p].term \in 0..max_term \* votes received diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index a5ffc013e2..6c9c59c76b 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -6,25 +6,32 @@ use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; +use remote_storage::RemoteStorageConfig; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::thread; use tokio::sync::mpsc; +use toml_edit::Document; use tracing::*; use url::{ParseError, Url}; -use safekeeper::control_file::{self}; -use safekeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR}; +use safekeeper::broker; +use safekeeper::control_file; +use 
safekeeper::defaults::{ + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, +}; +use safekeeper::http; use safekeeper::remove_wal; use safekeeper::timeline::GlobalTimelines; +use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::SafeKeeperConf; -use safekeeper::{broker, callmemaybe}; -use safekeeper::{http, s3_offload}; +use utils::auth::JwtAuth; use utils::{ http::endpoint, logging, project_git_version, shutdown::exit_now, signals, tcp_listener, - zid::ZNodeId, + zid::NodeId, }; const LOCK_FILE_NAME: &str = "safekeeper.lock"; @@ -71,12 +78,6 @@ fn main() -> anyhow::Result<()> { .long("pageserver") .takes_value(true), ) - .arg( - Arg::new("ttl") - .long("ttl") - .takes_value(true) - .help("interval for keeping WAL at safekeeper node, after which them will be uploaded to S3 and removed locally"), - ) .arg( Arg::new("recall") .long("recall") @@ -101,7 +102,7 @@ fn main() -> anyhow::Result<()> { Arg::new("dump-control-file") .long("dump-control-file") .takes_value(true) - .help("Dump control file at path specifed by this argument and exit"), + .help("Dump control file at path specified by this argument and exit"), ) .arg( Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") @@ -118,12 +119,26 @@ fn main() -> anyhow::Result<()> { .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), ) .arg( - Arg::new("enable-s3-offload") - .long("enable-s3-offload") + Arg::new("wal-backup-threads").long("backup-threads").takes_value(true).help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), + ).arg( + Arg::new("remote-storage") + .long("remote-storage") + .takes_value(true) + .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. 
{\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") + ) + .arg( + Arg::new("enable-wal-backup") + .long("enable-wal-backup") .takes_value(true) .default_value("true") .default_missing_value("true") - .help("Enable/disable s3 offloading. When disabled, safekeeper removes WAL ignoring s3 WAL horizon."), + .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), + ) + .arg( + Arg::new("auth-validation-public-key-path") + .long("auth-validation-public-key-path") + .takes_value(true) + .help("Path to an RSA .pem public key which is used to check JWT tokens") ) .get_matches(); @@ -157,17 +172,13 @@ fn main() -> anyhow::Result<()> { conf.listen_http_addr = addr.to_owned(); } - if let Some(ttl) = arg_matches.value_of("ttl") { - conf.ttl = Some(humantime::parse_duration(ttl)?); - } - if let Some(recall) = arg_matches.value_of("recall") { conf.recall_period = humantime::parse_duration(recall)?; } let mut given_id = None; if let Some(given_id_str) = arg_matches.value_of("id") { - given_id = Some(ZNodeId( + given_id = Some(NodeId( given_id_str .parse() .context("failed to parse safekeeper id")?, @@ -182,17 +193,33 @@ fn main() -> anyhow::Result<()> { conf.broker_etcd_prefix = prefix.to_string(); } + if let Some(backup_threads) = arg_matches.value_of("wal-backup-threads") { + conf.backup_runtime_threads = backup_threads + .parse() + .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; + } + if let Some(storage_conf) = arg_matches.value_of("remote-storage") { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {}", storage_conf); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = 
parsed_toml.iter().next().unwrap(); // and strip key off again + conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); + } // Seems like there is no better way to accept bool values explicitly in clap. - conf.s3_offload_enabled = arg_matches - .value_of("enable-s3-offload") + conf.wal_backup_enabled = arg_matches + .value_of("enable-wal-backup") .unwrap() .parse() .context("failed to parse bool enable-s3-offload bool")?; + conf.auth_validation_public_key_path = arg_matches + .value_of("auth-validation-public-key-path") + .map(PathBuf::from); + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {GIT_VERSION}"); @@ -224,6 +251,19 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b e })?; + let auth = match conf.auth_validation_public_key_path.as_ref() { + None => { + info!("Auth is disabled"); + None + } + Some(path) => { + info!("Loading JWT auth key from {}", path.display()); + Some(Arc::new( + JwtAuth::from_key_path(path).context("failed to load the auth key")?, + )) + } + }; + // XXX: Don't spawn any threads before daemonizing! if conf.daemonize { info!("daemonizing..."); @@ -249,18 +289,23 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b } } + // Register metrics collector for active timelines. It's important to do this + // after daemonizing, otherwise process collector will be upset. 
+ let registry = metrics::default_registry(); + let timeline_collector = safekeeper::metrics::TimelineCollector::new(); + registry.register(Box::new(timeline_collector))?; + let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; - let (callmemaybe_tx, callmemaybe_rx) = mpsc::unbounded_channel(); - GlobalTimelines::set_callmemaybe_tx(callmemaybe_tx); + let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + GlobalTimelines::init(wal_backup_launcher_tx); let conf_ = conf.clone(); threads.push( thread::Builder::new() .name("http_endpoint_thread".into()) .spawn(|| { - // TODO authentication - let router = http::make_router(conf_); + let router = http::make_router(conf_, auth); endpoint::serve_thread_main( router, http_listener, @@ -270,50 +315,26 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b })?, ); - if conf.ttl.is_some() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("S3 offload thread".into()) - .spawn(|| { - s3_offload::thread_main(conf_); - })?, - ); - } - let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() .name("Safekeeper thread".into()) .spawn(|| { - // thread code - let thread_result = wal_service::thread_main(conf_cloned, pg_listener); - if let Err(e) = thread_result { - info!("safekeeper thread terminated: {}", e); + // TODO: add auth + if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener) { + info!("safekeeper thread terminated: {e}"); } }) .unwrap(); threads.push(safekeeper_thread); - let conf_cloned = conf.clone(); - let callmemaybe_thread = thread::Builder::new() - .name("callmemaybe thread".into()) - .spawn(|| { - // thread code - let thread_result = callmemaybe::thread_main(conf_cloned, callmemaybe_rx); - if let Err(e) = thread_result { - error!("callmemaybe thread terminated: {}", e); - } - }) - .unwrap(); - threads.push(callmemaybe_thread); - if !conf.broker_endpoints.is_empty() { let conf_ = conf.clone(); 
threads.push( thread::Builder::new() .name("broker thread".into()) .spawn(|| { + // TODO: add auth? broker::thread_main(conf_); })?, ); @@ -326,10 +347,21 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b thread::Builder::new() .name("WAL removal thread".into()) .spawn(|| { + // TODO: add auth? remove_wal::thread_main(conf_); })?, ); + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("wal backup launcher thread".into()) + .spawn(move || { + // TODO: add auth? + wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + })?, + ); + // TODO: put more thoughts into handling of failed threads // We probably should restart them. @@ -345,14 +377,14 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: b } /// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { let id_file_path = conf.workdir.join(ID_FILE_NAME); - let my_id: ZNodeId; + let my_id: NodeId; // If ID exists, read it in; otherwise set one passed match fs::read(&id_file_path) { Ok(id_serialized) => { - my_id = ZNodeId( + my_id = NodeId( std::str::from_utf8(&id_serialized) .context("failed to parse safekeeper id")? .parse() diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index d7217be20a..ce66131700 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,22 +1,32 @@ //! Communication with etcd, providing safekeeper peers and pageserver coordination. 
+use anyhow::anyhow; use anyhow::Context; use anyhow::Error; use anyhow::Result; -use etcd_broker::Client; -use etcd_broker::PutOptions; -use etcd_broker::SkTimelineSubscriptionKind; +use etcd_broker::subscription_value::SkTimelineInfo; +use etcd_broker::LeaseKeepAliveStream; +use etcd_broker::LeaseKeeper; + +use std::collections::hash_map::Entry; +use std::collections::HashMap; use std::time::Duration; +use tokio::spawn; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use url::Url; use crate::{timeline::GlobalTimelines, SafeKeeperConf}; -use utils::zid::{ZNodeId, ZTenantTimelineId}; +use etcd_broker::{ + subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, + Client, PutOptions, +}; +use utils::zid::{NodeId, ZTenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -const LEASE_TTL_SEC: i64 = 5; +const LEASE_TTL_SEC: i64 = 10; pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -36,21 +46,163 @@ pub fn thread_main(conf: SafeKeeperConf) { fn timeline_safekeeper_path( broker_etcd_prefix: String, zttid: ZTenantTimelineId, - sk_id: ZNodeId, + sk_id: NodeId, ) -> String { format!( "{}/{sk_id}", - SkTimelineSubscriptionKind::timeline(broker_etcd_prefix, zttid).watch_key() + SubscriptionKey::sk_timeline_info(broker_etcd_prefix, zttid).watch_key() ) } +pub struct Election { + pub election_name: String, + pub candidate_name: String, + pub broker_endpoints: Vec, +} + +impl Election { + pub fn new(election_name: String, candidate_name: String, broker_endpoints: Vec) -> Self { + Self { + election_name, + candidate_name, + broker_endpoints, + } + } +} + +pub struct ElectionLeader { + client: Client, + keep_alive: JoinHandle>, +} + +impl ElectionLeader { + pub async fn check_am_i( + &mut self, + election_name: String, + candidate_name: String, + ) -> Result { + let resp = self.client.leader(election_name).await?; + + let kv = resp + .kv() + 
.ok_or_else(|| anyhow!("failed to get leader response"))?; + let leader = kv.value_str()?; + + Ok(leader == candidate_name) + } + + pub async fn give_up(self) { + self.keep_alive.abort(); + // TODO: it'll be wise to resign here but it'll happen after lease expiration anyway + // should we await for keep alive termination? + let _ = self.keep_alive.await; + } +} + +pub async fn get_leader(req: &Election, leader: &mut Option) -> Result<()> { + let mut client = Client::connect(req.broker_endpoints.clone(), None) + .await + .context("Could not connect to etcd")?; + + let lease = client + .lease_grant(LEASE_TTL_SEC, None) + .await + .context("Could not acquire a lease"); + + let lease_id = lease.map(|l| l.id()).unwrap(); + + // kill previous keepalive, if any + if let Some(l) = leader.take() { + l.give_up().await; + } + + let keep_alive = spawn::<_>(lease_keep_alive(client.clone(), lease_id)); + // immediately save handle to kill task if we get canceled below + *leader = Some(ElectionLeader { + client: client.clone(), + keep_alive, + }); + + client + .campaign( + req.election_name.clone(), + req.candidate_name.clone(), + lease_id, + ) + .await?; + + Ok(()) +} + +async fn lease_keep_alive(mut client: Client, lease_id: i64) -> Result<()> { + let (mut keeper, mut ka_stream) = client + .lease_keep_alive(lease_id) + .await + .context("failed to create keepalive stream")?; + + loop { + let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); + + keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + + ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + sleep(push_interval).await; + } +} + +pub fn get_candiate_name(system_id: NodeId) -> String { + format!("id_{system_id}") +} + +async fn push_sk_info( + zttid: ZTenantTimelineId, + mut client: Client, + key: String, + sk_info: SkTimelineInfo, + mut lease: Lease, +) -> anyhow::Result<(ZTenantTimelineId, Lease)> { + let put_opts = 
PutOptions::new().with_lease(lease.id); + client + .put( + key.clone(), + serde_json::to_string(&sk_info)?, + Some(put_opts), + ) + .await + .with_context(|| format!("failed to push safekeeper info to {}", key))?; + + // revive the lease + lease + .keeper + .keep_alive() + .await + .context("failed to send LeaseKeepAliveRequest")?; + lease + .ka_stream + .message() + .await + .context("failed to receive LeaseKeepAliveResponse")?; + + Ok((zttid, lease)) +} + +struct Lease { + id: i64, + keeper: LeaseKeeper, + ka_stream: LeaseKeepAliveStream, +} + /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - - // Get and maintain lease to automatically delete obsolete data - let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; - let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?; + let mut leases: HashMap = HashMap::new(); let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); loop { @@ -58,33 +210,46 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // is under plain mutex. That's ok, all this code is not performance // sensitive and there is no risk of deadlock as we don't await while // lock is held. - for zttid in GlobalTimelines::get_active_timelines() { - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - let sk_info = tli.get_public_info(&conf)?; - let put_opts = PutOptions::new().with_lease(lease.id()); - client - .put( - timeline_safekeeper_path( - conf.broker_etcd_prefix.clone(), - zttid, - conf.my_id, - ), - serde_json::to_string(&sk_info)?, - Some(put_opts), - ) - .await - .context("failed to push safekeeper info")?; + let active_tlis = GlobalTimelines::get_active_timelines(); + + // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. 
+ for zttid in active_tlis.iter() { + if let Entry::Vacant(v) = leases.entry(*zttid) { + let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; + let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; + v.insert(Lease { + id: lease.id(), + keeper, + ka_stream, + }); } } - // revive the lease - keeper - .keep_alive() - .await - .context("failed to send LeaseKeepAliveRequest")?; - ka_stream - .message() - .await - .context("failed to receive LeaseKeepAliveResponse")?; + leases.retain(|zttid, _| active_tlis.contains(zttid)); + + // Push data concurrently to not suffer from latency, with many timelines it can be slow. + let handles = active_tlis + .iter() + .filter_map(|zttid| GlobalTimelines::get_loaded(*zttid)) + .map(|tli| { + let sk_info = tli.get_public_info(&conf); + let key = timeline_safekeeper_path( + conf.broker_etcd_prefix.clone(), + tli.zttid, + conf.my_id, + ); + let lease = leases.remove(&tli.zttid).unwrap(); + tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease)) + }) + .collect::>(); + for h in handles { + let (zttid, lease) = h.await??; + // It is ugly to pull leases from hash and then put it back, but + // otherwise we have to resort to long living per tli tasks (which + // would generate a lot of errors when etcd is down) as task wants to + // have 'static objects, we can't borrow to it. 
+ leases.insert(zttid, lease); + } + sleep(push_interval).await; } } @@ -93,25 +258,34 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut subscription = etcd_broker::subscribe_to_safekeeper_timeline_updates( + let mut subscription = etcd_broker::subscribe_for_values( &mut client, - SkTimelineSubscriptionKind::all(conf.broker_etcd_prefix.clone()), + SubscriptionKey::all(conf.broker_etcd_prefix.clone()), + |full_key, value_str| { + if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) { + match serde_json::from_str::(value_str) { + Ok(new_info) => return Some(new_info), + Err(e) => { + error!("Failed to parse timeline info from value str '{value_str}': {e}") + } + } + } + None + }, ) .await .context("failed to subscribe for safekeeper info")?; loop { - match subscription.fetch_data().await { + match subscription.value_updates.recv().await { Some(new_info) => { - for (zttid, sk_info) in new_info { - // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(&conf, zttid, false) { - for (safekeeper_id, info) in sk_info { - tli.record_safekeeper_info(&info, safekeeper_id)? - } - } + // note: there are blocking operations below, but it's considered fine for now + if let Ok(tli) = GlobalTimelines::get(&conf, new_info.key.id, false) { + tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) + .await? } } None => { + // XXX it means we lost connection with etcd, error is consumed inside sub object debug!("timeline updates sender closed, aborting the pull loop"); return Ok(()); } @@ -142,11 +316,12 @@ async fn main_loop(conf: SafeKeeperConf) { }, res = async { pull_handle.as_mut().unwrap().await }, if pull_handle.is_some() => { // was it panic or normal error? 
- let err = match res { - Ok(res_internal) => res_internal.unwrap_err(), - Err(err_outer) => err_outer.into(), + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("pull task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("pull task panicked: {:?}", err_outer) } }; - warn!("pull task failed: {:?}", err); pull_handle = None; }, _ = ticker.tick() => { diff --git a/safekeeper/src/callmemaybe.rs b/safekeeper/src/callmemaybe.rs deleted file mode 100644 index 8c3fbe26ba..0000000000 --- a/safekeeper/src/callmemaybe.rs +++ /dev/null @@ -1,305 +0,0 @@ -//! -//! Callmemaybe module is responsible for periodically requesting -//! pageserver to initiate wal streaming. -//! -//! Other threads can use CallmeEvent messages to subscribe or unsubscribe -//! from the call list. -//! -use crate::SafeKeeperConf; -use anyhow::{Context, Result}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::sync::Mutex; -use std::time::{Duration, Instant}; -use tokio::runtime; -use tokio::sync::mpsc::UnboundedReceiver; -use tokio::task; -use tokio_postgres::NoTls; -use tracing::*; -use utils::{ - connstring::connection_host_port, - zid::{ZTenantId, ZTimelineId}, -}; - -async fn request_callback( - pageserver_connstr: String, - listen_pg_addr_str: String, - timelineid: ZTimelineId, - tenantid: ZTenantId, -) -> Result<()> { - info!( - "callmemaybe request_callback Connecting to pageserver {}", - &pageserver_connstr - ); - let (client, connection) = tokio_postgres::connect(&pageserver_connstr, NoTls).await?; - - tokio::spawn(async move { - if let Err(e) = connection.await { - error!("connection error: {}", e); - } - }); - - // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses - let me_connstr = format!("postgresql://no_user@{}/no_db", listen_pg_addr_str); - let me_conf: postgres::config::Config = me_connstr.parse().unwrap(); - let (host, port) = connection_host_port(&me_conf); - - 
// pageserver connstr is needed to be able to distinguish between different pageservers - // it is required to correctly manage callmemaybe subscriptions when more than one pageserver is involved - // TODO it is better to use some sort of a unique id instead of connection string, see https://github.com/zenithdb/zenith/issues/1105 - let callme = format!( - "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={} pageserver_connstr={}'", - tenantid, timelineid, host, port, timelineid, tenantid, pageserver_connstr, - ); - - let _ = client.simple_query(&callme).await?; - - Ok(()) -} - -pub fn thread_main(conf: SafeKeeperConf, rx: UnboundedReceiver) -> Result<()> { - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - runtime.block_on(main_loop(conf, rx)) -} - -#[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub struct SubscriptionStateKey { - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl SubscriptionStateKey { - pub fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId, pageserver_connstr: String) -> Self { - Self { - tenant_id, - timeline_id, - pageserver_connstr, - } - } -} - -/// Messages to the callmemaybe thread -#[derive(Debug)] -pub enum CallmeEvent { - // add new subscription to the list - Subscribe(SubscriptionStateKey), - // remove the subscription from the list - Unsubscribe(SubscriptionStateKey), - // don't serve this subscription, but keep it in the list - Pause(SubscriptionStateKey), - // resume this subscription, if it exists, - // but don't create a new one if it is gone - Resume(SubscriptionStateKey), - // TODO how do we delete from subscriptions? 
-} - -#[derive(Debug)] -struct SubscriptionState { - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - handle: Option>, - last_call_time: Instant, - paused: bool, -} - -impl SubscriptionState { - fn new( - tenantid: ZTenantId, - timelineid: ZTimelineId, - pageserver_connstr: String, - ) -> SubscriptionState { - SubscriptionState { - tenantid, - timelineid, - pageserver_connstr, - handle: None, - last_call_time: Instant::now(), - paused: false, - } - } - - fn pause(&mut self) { - self.paused = true; - self.abort_handle(); - } - - fn resume(&mut self) { - self.paused = false; - } - - // Most likely, the task have already successfully completed - // and abort() won't have any effect. - fn abort_handle(&mut self) { - if let Some(handle) = self.handle.take() { - handle.abort(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - tokio::spawn(async move { - if let Err(err) = handle.await { - if err.is_cancelled() { - warn!("callback task for timelineid={} tenantid={} was cancelled before spawning a new one", - timelineid, tenantid); - } else { - error!( - "callback task for timelineid={} tenantid={} pageserver_connstr={} failed: {}", - timelineid, tenantid, pageserver_connstr, err - ); - } - } - }); - } - } - - fn call(&mut self, recall_period: Duration, listen_pg_addr: String) { - // Ignore call request if this subscription is paused - if self.paused { - debug!( - "ignore call request for paused subscription \ - tenantid: {}, timelineid: {}", - self.tenantid, self.timelineid - ); - return; - } - - // Check if it too early to recall - if self.handle.is_some() && self.last_call_time.elapsed() < recall_period { - debug!( - "too early to recall. 
self.last_call_time.elapsed: {:?}, recall_period: {:?} \ - tenantid: {}, timelineid: {}", - self.last_call_time, recall_period, self.tenantid, self.timelineid - ); - return; - } - - // If previous task didn't complete in recall_period, it must be hanging, - // so don't wait for it forever, just abort it and try again. - self.abort_handle(); - - let timelineid = self.timelineid; - let tenantid = self.tenantid; - let pageserver_connstr = self.pageserver_connstr.clone(); - self.handle = Some(tokio::spawn(async move { - request_callback(pageserver_connstr, listen_pg_addr, timelineid, tenantid) - .await - .unwrap_or_else(|e| { - error!( - "callback task for timelineid={} tenantid={} failed: {}", - timelineid, tenantid, e - ) - }); - })); - - // Update last_call_time - self.last_call_time = Instant::now(); - info!( - "new call spawned. last call time {:?} tenantid: {}, timelineid: {}", - self.last_call_time, self.tenantid, self.timelineid - ); - } -} - -impl Drop for SubscriptionState { - fn drop(&mut self) { - self.abort_handle(); - } -} - -pub async fn main_loop(conf: SafeKeeperConf, mut rx: UnboundedReceiver) -> Result<()> { - let subscriptions: Mutex> = - Mutex::new(HashMap::new()); - - let mut ticker = tokio::time::interval(conf.recall_period); - loop { - tokio::select! { - request = rx.recv() => - { - match request.context("done")? - { - CallmeEvent::Subscribe(key) => - { - let _enter = info_span!("callmemaybe: subscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // XXX this clone is ugly, is there a way to use the trick with Borrow trait with entry API? 
- // when we switch to node id instead of the connection string key will be Copy and there will be no need to clone - match subscriptions.entry(key.clone()) { - Entry::Occupied(_) => { - // Do nothing if subscription already exists - // If it is paused it means that there is already established replication connection. - // If it is not paused it will be polled with other subscriptions when timeout expires. - // This can occur when replication channel is established before subscription is added. - info!( - "subscription already exists", - ); - } - Entry::Vacant(entry) => { - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - } - }, - CallmeEvent::Unsubscribe(key) => { - let _enter = debug_span!("callmemaybe: unsubscribe", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - debug!("unsubscribe"); - let mut subscriptions = subscriptions.lock().unwrap(); - subscriptions.remove(&key); - - }, - CallmeEvent::Pause(key) => { - let _enter = debug_span!("callmemaybe: pause", timelineid = %key.timeline_id, tenantid = %key.tenant_id, pageserver_connstr=%key.pageserver_connstr.clone()).entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - // If pause received when no corresponding subscription exists it means that someone started replication - // without using callmemaybe. So we create subscription and pause it. - // In tenant relocation scenario subscribe call will be executed after pause when compute is restarted. - // In that case there is no need to create new/unpause existing subscription. 
- match subscriptions.entry(key.clone()) { - Entry::Occupied(mut sub) => { - debug!("pause existing"); - sub.get_mut().pause(); - } - Entry::Vacant(entry) => { - debug!("create paused"); - let subscription = entry.insert(SubscriptionState::new( - key.tenant_id, - key.timeline_id, - key.pageserver_connstr, - )); - subscription.pause(); - } - } - }, - CallmeEvent::Resume(key) => { - debug!( - "callmemaybe. thread_main. resume callback request for timelineid={} tenantid={} pageserver_connstr={}", - key.timeline_id, key.tenant_id, key.pageserver_connstr, - ); - let mut subscriptions = subscriptions.lock().unwrap(); - if let Some(sub) = subscriptions.get_mut(&key) - { - sub.resume(); - }; - }, - } - }, - _ = ticker.tick() => { - let _enter = debug_span!("callmemaybe: tick").entered(); - let mut subscriptions = subscriptions.lock().unwrap(); - - for (_, state) in subscriptions.iter_mut() { - state.call(conf.recall_period, conf.listen_pg_addr.clone()); - } - }, - }; - } -} diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 22716de1a0..5e749796dd 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -27,7 +27,7 @@ struct SafeKeeperStateV1 { acceptor_state: AcceptorStateV1, /// information about server server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -57,7 +57,7 @@ pub struct SafeKeeperStateV2 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV2, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. 
pub proposer_uuid: PgUuid, /// part of WAL acknowledged by quorum and available locally @@ -89,7 +89,7 @@ pub struct SafeKeeperStateV3 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfoV3, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -114,7 +114,7 @@ pub struct SafeKeeperStateV4 { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -165,7 +165,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -188,7 +188,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -211,7 +211,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn(0), peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), @@ -234,11 +234,24 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: oldstate.commit_lsn, - s3_wal_lsn: Lsn(0), + backup_lsn: 
Lsn::INVALID, peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), peers: Peers(vec![]), }); + } else if version == 5 { + info!("reading safekeeper control file version {}", version); + let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + if oldstate.timeline_start_lsn != Lsn(0) { + return Ok(oldstate); + } + + // set special timeline_start_lsn because we don't know the real one + info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)"); + oldstate.timeline_start_lsn = Lsn(1); + oldstate.local_start_lsn = Lsn(1); + + return Ok(oldstate); } bail!("unsupported safekeeper control file version {}", version) } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 9af78661f9..a8121e829e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -29,12 +29,11 @@ pub struct SafekeeperPostgresHandler { pub ztenantid: Option, pub ztimelineid: Option, pub timeline: Option>, - pageserver_connstr: Option, } /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush { pageserver_connstr: Option }, + StartWalPush, StartReplication { start_lsn: Lsn }, IdentifySystem, JSONCtrl { cmd: AppendLogicalMessage }, @@ -42,11 +41,7 @@ enum SafekeeperPostgresCommand { fn parse_cmd(cmd: &str) -> Result { if cmd.starts_with("START_WAL_PUSH") { - let re = Regex::new(r"START_WAL_PUSH(?: (.+))?").unwrap(); - - let caps = re.captures(cmd).unwrap(); - let pageserver_connstr = caps.get(1).map(|m| m.as_str().to_owned()); - Ok(SafekeeperPostgresCommand::StartWalPush { pageserver_connstr }) + Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new(r"START_REPLICATION(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)").unwrap(); @@ -86,8 +81,6 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { self.appname = Some(app_name.clone()); } - self.pageserver_connstr = params.get("pageserver_connstr").cloned(); - Ok(()) } else { bail!("Safekeeper received unexpected initial message: {:?}", sm); @@ -113,14 +106,14 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { } match cmd { - SafekeeperPostgresCommand::StartWalPush { pageserver_connstr } => { - ReceiveWalConn::new(pgb, pageserver_connstr) + SafekeeperPostgresCommand::StartWalPush => { + ReceiveWalConn::new(pgb) .run(self) .context("failed to run ReceiveWalConn")?; } SafekeeperPostgresCommand::StartReplication { start_lsn } => { ReplicationConn::new(pgb) - .run(self, pgb, start_lsn, self.pageserver_connstr.clone()) + .run(self, pgb, start_lsn) .context("failed to run ReplicationConn")?; } SafekeeperPostgresCommand::IdentifySystem => { @@ -142,7 +135,6 @@ impl SafekeeperPostgresHandler { ztenantid: None, ztimelineid: None, timeline: None, - pageserver_connstr: None, } } diff --git a/safekeeper/src/http/models.rs b/safekeeper/src/http/models.rs index ca18e64096..77efc0cc21 100644 --- a/safekeeper/src/http/models.rs +++ b/safekeeper/src/http/models.rs @@ -1,9 +1,9 @@ use serde::{Deserialize, Serialize}; -use utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; +use utils::zid::{NodeId, ZTenantId, ZTimelineId}; #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: ZTenantId, pub timeline_id: ZTimelineId, - pub peer_ids: Vec, + pub peer_ids: Vec, } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 62fbd2ff2f..33581c6c31 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,9 +1,9 @@ -use etcd_broker::SkTimelineInfo; -use hyper::{Body, Request, Response, StatusCode}; +use hyper::{Body, Request, Response, StatusCode, Uri}; +use once_cell::sync::Lazy; use serde::Serialize; use serde::Serializer; 
-use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; @@ -11,27 +11,30 @@ use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::{GlobalTimelines, TimelineDeleteForceResult}; use crate::SafeKeeperConf; +use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ + auth::JwtAuth, http::{ - endpoint, + endpoint::{self, auth_middleware, check_permission}, error::ApiError, json::{json_request, json_response}, request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, lsn::Lsn, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; use super::models::TimelineCreateRequest; #[derive(Debug, Serialize)] struct SafekeeperStatus { - id: ZNodeId, + id: NodeId, } /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; let conf = get_conf(&request); let status = SafekeeperStatus { id: conf.my_id }; json_response(StatusCode::OK, status) @@ -70,19 +73,19 @@ struct TimelineStatus { timeline_id: ZTimelineId, acceptor_state: AcceptorStateStatus, #[serde(serialize_with = "display_serialize")] + flush_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] timeline_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] local_start_lsn: Lsn, #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - s3_wal_lsn: Lsn, + backup_lsn: Lsn, #[serde(serialize_with = "display_serialize")] peer_horizon_lsn: Lsn, #[serde(serialize_with = "display_serialize")] remote_consistent_lsn: Lsn, - #[serde(serialize_with = "display_serialize")] - flush_lsn: Lsn, } /// Report info about timeline. 
@@ -91,6 +94,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ) -> Result, ApiError> { let tenant_id = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; json_response( StatusCode::OK, GlobalTimelines::delete_force_all_for_tenant(get_conf(&request), &tenant_id) + .await .map_err(ApiError::from_err)? .iter() .map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp)) @@ -175,24 +185,44 @@ async fn record_safekeeper_info(mut request: Request) -> Result RouterBuilder { - let router = endpoint::make_router(); +pub fn make_router( + conf: SafeKeeperConf, + auth: Option>, +) -> RouterBuilder { + let mut router = endpoint::make_router(); + if auth.is_some() { + router = router.middleware(auth_middleware(|request| { + #[allow(clippy::mutable_key_type)] + static ALLOWLIST_ROUTES: Lazy> = + Lazy::new(|| ["/v1/status"].iter().map(|v| v.parse().unwrap()).collect()); + if ALLOWLIST_ROUTES.contains(request.uri()) { + None + } else { + // Option> is always provided as data below, hence unwrap(). 
+ request.data::>>().unwrap().as_deref() + } + })) + } router .data(Arc::new(conf)) + .data(auth) .get("/v1/status", status_handler) .get( "/v1/timeline/:tenant_id/:timeline_id", timeline_status_handler, ) + // Will be used in the future instead of implicit timeline creation .post("/v1/timeline", timeline_create_handler) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 43514997d4..97fb3654d2 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -124,7 +124,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L term, start_streaming_at: lsn, term_history: history, - timeline_start_lsn: Lsn(0), + timeline_start_lsn: lsn, }); spg.timeline.get().process_msg(&proposer_elected_request)?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index a87e5da686..0335d61d3f 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,23 +1,25 @@ +use defaults::DEFAULT_WAL_BACKUP_RUNTIME_THREADS; // +use remote_storage::RemoteStorageConfig; use std::path::PathBuf; use std::time::Duration; use url::Url; -use utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId}; +use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId}; pub mod broker; -pub mod callmemaybe; pub mod control_file; pub mod control_file_upgrade; pub mod handler; pub mod http; pub mod json_ctrl; +pub mod metrics; pub mod receive_wal; pub mod remove_wal; -pub mod s3_offload; pub mod safekeeper; pub mod send_wal; pub mod timeline; +pub mod wal_backup; pub mod wal_service; pub mod wal_storage; @@ -31,6 +33,7 @@ pub mod defaults { pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); pub const DEFAULT_RECALL_PERIOD: Duration = Duration::from_secs(10); + pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8; } #[derive(Debug, Clone)] @@ -47,12 +50,14 @@ pub struct SafeKeeperConf { pub 
no_sync: bool, pub listen_pg_addr: String, pub listen_http_addr: String, - pub ttl: Option, pub recall_period: Duration, - pub my_id: ZNodeId, + pub remote_storage: Option, + pub backup_runtime_threads: usize, + pub wal_backup_enabled: bool, + pub my_id: NodeId, pub broker_endpoints: Vec, pub broker_etcd_prefix: String, - pub s3_offload_enabled: bool, + pub auth_validation_public_key_path: Option, } impl SafeKeeperConf { @@ -77,12 +82,14 @@ impl Default for SafeKeeperConf { no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - ttl: None, + remote_storage: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, - my_id: ZNodeId(0), + my_id: NodeId(0), broker_endpoints: Vec::new(), broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), - s3_offload_enabled: true, + backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + wal_backup_enabled: true, + auth_validation_public_key_path: None, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs new file mode 100644 index 0000000000..fe4f9d231c --- /dev/null +++ b/safekeeper/src/metrics.rs @@ -0,0 +1,336 @@ +//! This module exports metrics for all active timelines. 
+ +use std::time::{Instant, SystemTime}; + +use metrics::{ + core::{AtomicU64, Collector, Desc, GenericGaugeVec, Opts}, + proto::MetricFamily, + Gauge, IntGaugeVec, +}; +use postgres_ffi::xlog_utils::XLogSegNo; +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::{ + safekeeper::{SafeKeeperState, SafekeeperMemState}, + timeline::{GlobalTimelines, ReplicaState}, +}; + +pub struct FullTimelineInfo { + pub zttid: ZTenantTimelineId, + pub replicas: Vec, + pub wal_backup_active: bool, + pub timeline_is_active: bool, + pub num_computes: u32, + pub last_removed_segno: XLogSegNo, + + pub epoch_start_lsn: Lsn, + pub mem_state: SafekeeperMemState, + pub persisted_state: SafeKeeperState, + + pub flush_lsn: Lsn, +} + +pub struct TimelineCollector { + descs: Vec, + commit_lsn: GenericGaugeVec, + backup_lsn: GenericGaugeVec, + flush_lsn: GenericGaugeVec, + epoch_start_lsn: GenericGaugeVec, + peer_horizon_lsn: GenericGaugeVec, + remote_consistent_lsn: GenericGaugeVec, + feedback_ps_write_lsn: GenericGaugeVec, + feedback_last_time_seconds: GenericGaugeVec, + timeline_active: GenericGaugeVec, + wal_backup_active: GenericGaugeVec, + connected_computes: IntGaugeVec, + disk_usage: GenericGaugeVec, + acceptor_term: GenericGaugeVec, + collect_timeline_metrics: Gauge, +} + +impl Default for TimelineCollector { + fn default() -> Self { + Self::new() + } +} + +impl TimelineCollector { + pub fn new() -> TimelineCollector { + let mut descs = Vec::new(); + + let commit_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_commit_lsn", + "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(commit_lsn.desc().into_iter().cloned()); + + let backup_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_backup_lsn", + "Current backup_lsn, up to which WAL is backed up, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + 
descs.extend(backup_lsn.desc().into_iter().cloned()); + + let flush_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_flush_lsn", + "Current flush_lsn, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(flush_lsn.desc().into_iter().cloned()); + + let epoch_start_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_epoch_start_lsn", + "Point since which compute generates new WAL in the current consensus term", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(epoch_start_lsn.desc().into_iter().cloned()); + + let peer_horizon_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_peer_horizon_lsn", + "LSN of the most lagging safekeeper", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(peer_horizon_lsn.desc().into_iter().cloned()); + + let remote_consistent_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_remote_consistent_lsn", + "LSN which is persisted to the remote storage in pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); + + let feedback_ps_write_lsn = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_ps_write_lsn", + "Last LSN received by the pageserver, acknowledged in the feedback", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + + let feedback_last_time_seconds = GenericGaugeVec::new( + Opts::new( + "safekeeper_feedback_last_time_seconds", + "Timestamp of the last feedback from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + + let timeline_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_timeline_active", + "Reports 1 for active timelines, 0 for inactive", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(timeline_active.desc().into_iter().cloned()); + + let 
wal_backup_active = GenericGaugeVec::new( + Opts::new( + "safekeeper_wal_backup_active", + "Reports 1 for timelines with active WAL backup, 0 otherwise", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(wal_backup_active.desc().into_iter().cloned()); + + let connected_computes = IntGaugeVec::new( + Opts::new( + "safekeeper_connected_computes", + "Number of active compute connections", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(connected_computes.desc().into_iter().cloned()); + + let disk_usage = GenericGaugeVec::new( + Opts::new( + "safekeeper_disk_usage_bytes", + "Estimated disk space used to store WAL segments", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(disk_usage.desc().into_iter().cloned()); + + let acceptor_term = GenericGaugeVec::new( + Opts::new("safekeeper_acceptor_term", "Current consensus term"), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(acceptor_term.desc().into_iter().cloned()); + + let collect_timeline_metrics = Gauge::new( + "safekeeper_collect_timeline_metrics_seconds", + "Time spent collecting timeline metrics, including obtaining mutex lock for all timelines", + ) + .unwrap(); + descs.extend(collect_timeline_metrics.desc().into_iter().cloned()); + + TimelineCollector { + descs, + commit_lsn, + backup_lsn, + flush_lsn, + epoch_start_lsn, + peer_horizon_lsn, + remote_consistent_lsn, + feedback_ps_write_lsn, + feedback_last_time_seconds, + timeline_active, + wal_backup_active, + connected_computes, + disk_usage, + acceptor_term, + collect_timeline_metrics, + } + } +} + +impl Collector for TimelineCollector { + fn desc(&self) -> Vec<&Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let start_collecting = Instant::now(); + + // reset all metrics to clean up inactive timelines + self.commit_lsn.reset(); + self.backup_lsn.reset(); + self.flush_lsn.reset(); + self.epoch_start_lsn.reset(); + self.peer_horizon_lsn.reset(); + 
self.remote_consistent_lsn.reset(); + self.feedback_ps_write_lsn.reset(); + self.feedback_last_time_seconds.reset(); + self.timeline_active.reset(); + self.wal_backup_active.reset(); + self.connected_computes.reset(); + self.disk_usage.reset(); + self.acceptor_term.reset(); + + let timelines = GlobalTimelines::active_timelines_metrics(); + + for tli in timelines { + let tenant_id = tli.zttid.tenant_id.to_string(); + let timeline_id = tli.zttid.timeline_id.to_string(); + let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + + let mut most_advanced: Option = None; + for replica in tli.replicas.iter() { + if let Some(replica_feedback) = replica.pageserver_feedback { + if let Some(current) = most_advanced { + if current.ps_writelsn < replica_feedback.ps_writelsn { + most_advanced = Some(replica_feedback); + } + } else { + most_advanced = Some(replica_feedback); + } + } + } + + self.commit_lsn + .with_label_values(labels) + .set(tli.mem_state.commit_lsn.into()); + self.backup_lsn + .with_label_values(labels) + .set(tli.mem_state.backup_lsn.into()); + self.flush_lsn + .with_label_values(labels) + .set(tli.flush_lsn.into()); + self.epoch_start_lsn + .with_label_values(labels) + .set(tli.epoch_start_lsn.into()); + self.peer_horizon_lsn + .with_label_values(labels) + .set(tli.mem_state.peer_horizon_lsn.into()); + self.remote_consistent_lsn + .with_label_values(labels) + .set(tli.mem_state.remote_consistent_lsn.into()); + self.timeline_active + .with_label_values(labels) + .set(tli.timeline_is_active as u64); + self.wal_backup_active + .with_label_values(labels) + .set(tli.wal_backup_active as u64); + self.connected_computes + .with_label_values(labels) + .set(tli.num_computes as i64); + self.acceptor_term + .with_label_values(labels) + .set(tli.persisted_state.acceptor_state.term as u64); + + if let Some(feedback) = most_advanced { + self.feedback_ps_write_lsn + .with_label_values(labels) + .set(feedback.ps_writelsn); + if let Ok(unix_time) = 
feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) + { + self.feedback_last_time_seconds + .with_label_values(labels) + .set(unix_time.as_secs()); + } + } + + if tli.last_removed_segno != 0 { + let segno_count = tli + .flush_lsn + .segment_number(tli.persisted_state.server.wal_seg_size as usize) + - tli.last_removed_segno; + let disk_usage_bytes = segno_count * tli.persisted_state.server.wal_seg_size as u64; + self.disk_usage + .with_label_values(labels) + .set(disk_usage_bytes); + } + } + + // collect MetricFamilys. + let mut mfs = Vec::new(); + mfs.extend(self.commit_lsn.collect()); + mfs.extend(self.backup_lsn.collect()); + mfs.extend(self.flush_lsn.collect()); + mfs.extend(self.epoch_start_lsn.collect()); + mfs.extend(self.peer_horizon_lsn.collect()); + mfs.extend(self.remote_consistent_lsn.collect()); + mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.timeline_active.collect()); + mfs.extend(self.wal_backup_active.collect()); + mfs.extend(self.connected_computes.collect()); + mfs.extend(self.disk_usage.collect()); + mfs.extend(self.acceptor_term.collect()); + + // report time it took to collect all info + let elapsed = start_collecting.elapsed().as_secs_f64(); + self.collect_timeline_metrics.set(elapsed); + mfs.extend(self.collect_timeline_metrics.collect()); + + mfs + } +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 0ef335c9ed..af4cfb6ba4 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -32,22 +32,14 @@ pub struct ReceiveWalConn<'pg> { pg_backend: &'pg mut PostgresBackend, /// The cached result of `pg_backend.socket().peer_addr()` (roughly) peer_addr: SocketAddr, - /// Pageserver connection string forwarded from compute - /// NOTE that it is allowed to operate without a pageserver. - /// So if compute has no pageserver configured do not use it. 
- pageserver_connstr: Option, } impl<'pg> ReceiveWalConn<'pg> { - pub fn new( - pg: &'pg mut PostgresBackend, - pageserver_connstr: Option, - ) -> ReceiveWalConn<'pg> { + pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> { let peer_addr = *pg.get_peer_addr(); ReceiveWalConn { pg_backend: pg, peer_addr, - pageserver_connstr, } } @@ -85,16 +77,10 @@ impl<'pg> ReceiveWalConn<'pg> { _ => bail!("unexpected message {:?} instead of greeting", next_msg), } - // Register the connection and defer unregister. - spg.timeline - .get() - .on_compute_connect(self.pageserver_connstr.as_ref())?; - let _guard = ComputeConnectionGuard { - timeline: Arc::clone(spg.timeline.get()), - }; - let mut next_msg = Some(next_msg); + let mut first_time_through = true; + let mut _guard: Option = None; loop { if matches!(next_msg, Some(ProposerAcceptorMessage::AppendRequest(_))) { // poll AppendRequest's without blocking and write WAL to disk without flushing, @@ -122,6 +108,16 @@ impl<'pg> ReceiveWalConn<'pg> { self.write_msg(&reply)?; } } + if first_time_through { + // Register the connection and defer unregister. Do that only + // after processing first message, as it sets wal_seg_size, + // wanted by many. 
+ spg.timeline.get().on_compute_connect()?; + _guard = Some(ComputeConnectionGuard { + timeline: Arc::clone(spg.timeline.get()), + }); + first_time_through = false; + } // blocking wait for the next message if next_msg.is_none() { diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3278d51bd3..004c0243f9 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -12,7 +12,7 @@ pub fn thread_main(conf: SafeKeeperConf) { let active_tlis = GlobalTimelines::get_active_timelines(); for zttid in &active_tlis { if let Ok(tli) = GlobalTimelines::get(&conf, *zttid, false) { - if let Err(e) = tli.remove_old_wal(conf.s3_offload_enabled) { + if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) { warn!( "failed to remove WAL for tenant {} timeline {}: {}", tli.zttid.tenant_id, tli.zttid.timeline_id, e diff --git a/safekeeper/src/s3_offload.rs b/safekeeper/src/s3_offload.rs deleted file mode 100644 index 2851c0b8a0..0000000000 --- a/safekeeper/src/s3_offload.rs +++ /dev/null @@ -1,107 +0,0 @@ -// -// Offload old WAL segments to S3 and remove them locally -// Needs `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to be set -// if no IAM bucket access is used. -// - -use anyhow::{bail, Context}; -use postgres_ffi::xlog_utils::*; -use remote_storage::{ - GenericRemoteStorage, RemoteStorage, RemoteStorageConfig, S3Bucket, S3Config, S3ObjectKey, -}; -use std::collections::HashSet; -use std::env; -use std::num::{NonZeroU32, NonZeroUsize}; -use std::path::Path; -use std::time::SystemTime; -use tokio::fs::{self, File}; -use tokio::io::BufReader; -use tokio::runtime; -use tokio::time::sleep; -use tracing::*; -use walkdir::WalkDir; - -use crate::SafeKeeperConf; - -pub fn thread_main(conf: SafeKeeperConf) { - // Create a new thread pool - // - // FIXME: keep it single-threaded for now, make it easier to debug with gdb, - // and we're not concerned with performance yet. 
- //let runtime = runtime::Runtime::new().unwrap(); - let runtime = runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - - info!("Starting S3 offload task"); - - runtime.block_on(async { - main_loop(&conf).await.unwrap(); - }); -} - -async fn offload_files( - remote_storage: &S3Bucket, - listing: &HashSet, - dir_path: &Path, - conf: &SafeKeeperConf, -) -> anyhow::Result { - let horizon = SystemTime::now() - conf.ttl.unwrap(); - let mut n: u64 = 0; - for entry in WalkDir::new(dir_path) { - let entry = entry?; - let path = entry.path(); - - if path.is_file() - && IsXLogFileName(entry.file_name().to_str().unwrap()) - && entry.metadata().unwrap().created().unwrap() <= horizon - { - let remote_path = remote_storage.remote_object_id(path)?; - if !listing.contains(&remote_path) { - let file = File::open(&path).await?; - let file_length = file.metadata().await?.len() as usize; - remote_storage - .upload(BufReader::new(file), file_length, &remote_path, None) - .await?; - - fs::remove_file(&path).await?; - n += 1; - } - } - } - Ok(n) -} - -async fn main_loop(conf: &SafeKeeperConf) -> anyhow::Result<()> { - let remote_storage = match GenericRemoteStorage::new( - conf.workdir.clone(), - &RemoteStorageConfig { - max_concurrent_syncs: NonZeroUsize::new(10).unwrap(), - max_sync_errors: NonZeroU32::new(1).unwrap(), - storage: remote_storage::RemoteStorageKind::AwsS3(S3Config { - bucket_name: "zenith-testbucket".to_string(), - bucket_region: env::var("S3_REGION").context("S3_REGION env var is not set")?, - prefix_in_bucket: Some("walarchive/".to_string()), - endpoint: Some(env::var("S3_ENDPOINT").context("S3_ENDPOINT env var is not set")?), - concurrency_limit: NonZeroUsize::new(20).unwrap(), - }), - }, - )? { - GenericRemoteStorage::Local(_) => { - bail!("Unexpected: got local storage for the remote config") - } - GenericRemoteStorage::S3(remote_storage) => remote_storage, - }; - - loop { - let listing = remote_storage - .list() - .await? 
- .into_iter() - .collect::>(); - let n = offload_files(&remote_storage, &listing, &conf.workdir, conf).await?; - info!("Offload {n} files to S3"); - sleep(conf.ttl.unwrap()).await; - } -} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index fff1c269b6..fd4761505d 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use etcd_broker::SkTimelineInfo; +use etcd_broker::subscription_value::SkTimelineInfo; use postgres_ffi::xlog_utils::TimeLineID; use postgres_ffi::xlog_utils::XLogSegNo; @@ -15,22 +15,20 @@ use std::fmt; use std::io::Read; use tracing::*; -use lazy_static::lazy_static; - use crate::control_file; use crate::send_wal::HotStandbyFeedback; + use crate::wal_storage; -use metrics::{register_gauge_vec, Gauge, GaugeVec}; use postgres_ffi::xlog_utils::MAX_SEND_SIZE; use utils::{ bin_ser::LeSer, lsn::Lsn, - pq_proto::{SystemId, ZenithFeedback}, - zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, + pq_proto::{ReplicationFeedback, SystemId}, + zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}, }; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 5; +pub const SK_FORMAT_VERSION: u32 = 6; const SK_PROTOCOL_VERSION: u32 = 2; const UNKNOWN_SERVER_VERSION: u32 = 0; @@ -141,7 +139,7 @@ pub struct ServerInfo { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. - s3_wal_lsn: Lsn, + backup_lsn: Lsn, /// Term of the last entry. term: Term, /// LSN of the last record. 
@@ -153,7 +151,7 @@ pub struct PeerInfo { impl PeerInfo { fn new() -> Self { Self { - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, term: INVALID_TERM, flush_lsn: Lsn(0), commit_lsn: Lsn(0), @@ -164,7 +162,7 @@ impl PeerInfo { // vector-based node id -> peer state map with very limited functionality we // need/ #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); +pub struct Peers(pub Vec<(NodeId, PeerInfo)>); /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. @@ -179,7 +177,7 @@ pub struct SafeKeeperState { pub acceptor_state: AcceptorState, /// information about server pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealed with. Not needed + /// Unique id of the last *elected* proposer we dealt with. Not needed /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, @@ -193,9 +191,9 @@ pub struct SafeKeeperState { /// Part of WAL acknowledged by quorum and available locally. Always points /// to record boundary. pub commit_lsn: Lsn, - /// First LSN not yet offloaded to s3. Useful to persist to avoid finding - /// out offloading progress on boot. - pub s3_wal_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn /// of last record streamed to everyone). Persisting it helps skipping /// recovery in walproposer, generally we compute it from peers. In @@ -217,14 +215,14 @@ pub struct SafeKeeperState { // are not flushed yet. 
pub struct SafekeeperMemState { pub commit_lsn: Lsn, - pub s3_wal_lsn: Lsn, // TODO: keep only persistent version + pub backup_lsn: Lsn, pub peer_horizon_lsn: Lsn, pub remote_consistent_lsn: Lsn, pub proposer_uuid: PgUuid, } impl SafeKeeperState { - pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { tenant_id: zttid.tenant_id, timeline_id: zttid.timeline_id, @@ -241,7 +239,7 @@ impl SafeKeeperState { timeline_start_lsn: Lsn(0), local_start_lsn: Lsn(0), commit_lsn: Lsn(0), - s3_wal_lsn: Lsn(0), + backup_lsn: Lsn::INVALID, peer_horizon_lsn: Lsn(0), remote_consistent_lsn: Lsn(0), peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), @@ -277,7 +275,7 @@ pub struct ProposerGreeting { #[derive(Debug, Serialize)] pub struct AcceptorGreeting { term: u64, - node_id: ZNodeId, + node_id: NodeId, } /// Vote request sent from proposer to safekeepers @@ -350,7 +348,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub zenith_feedback: ZenithFeedback, + pub pageserver_feedback: ReplicationFeedback, } impl AppendResponse { @@ -360,7 +358,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), } } } @@ -478,7 +476,7 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.zenith_feedback.serialize(buf)? + msg.pageserver_feedback.serialize(buf)? } } @@ -486,52 +484,23 @@ impl AcceptorProposerMessage { } } -lazy_static! { - // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). - // i64 is faster than f64, so update to u64 when available. 
- static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_commit_lsn", - "Current commit_lsn (not necessarily persisted to disk), grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_commit_lsn gauge vec"); -} - -struct SafeKeeperMetrics { - commit_lsn: Gauge, - // WAL-related metrics are in WalStorageMetrics -} - -impl SafeKeeperMetrics { - fn new(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> Self { - let tenant_id = tenant_id.to_string(); - let timeline_id = timeline_id.to_string(); - Self { - commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), - } - } -} - /// SafeKeeper which consumes events (messages from compute) and provides /// replies. pub struct SafeKeeper { - // Cached metrics so we don't have to recompute labels on each update. - metrics: SafeKeeperMetrics, - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. /// Note: be careful to set only if we are sure our WAL (term history) matches /// committed one. pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. 
- epoch_start_lsn: Lsn, + pub epoch_start_lsn: Lsn, pub inmem: SafekeeperMemState, // in memory part pub state: CTRL, // persistent state storage pub wal_store: WAL, - node_id: ZNodeId, // safekeeper's node id + node_id: NodeId, // safekeeper's node id } impl SafeKeeper @@ -544,7 +513,7 @@ where ztli: ZTimelineId, state: CTRL, mut wal_store: WAL, - node_id: ZNodeId, + node_id: NodeId, ) -> Result> { if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { bail!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); @@ -554,12 +523,11 @@ where wal_store.init_storage(&state)?; Ok(SafeKeeper { - metrics: SafeKeeperMetrics::new(state.tenant_id, ztli), global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, - s3_wal_lsn: state.s3_wal_lsn, + backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, proposer_uuid: state.proposer_uuid, @@ -575,13 +543,16 @@ where self.state .acceptor_state .term_history - .up_to(self.wal_store.flush_lsn()) + .up_to(self.flush_lsn()) } pub fn get_epoch(&self) -> Term { - self.state - .acceptor_state - .get_epoch(self.wal_store.flush_lsn()) + self.state.acceptor_state.get_epoch(self.flush_lsn()) + } + + /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. + fn flush_lsn(&self) -> Lsn { + max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn) } /// Process message from proposer and possibly form reply. 
Concurrent @@ -649,7 +620,6 @@ where self.state.persist(&state)?; } - // pass wal_seg_size to read WAL and find flush_lsn self.wal_store.init_storage(&self.state)?; info!( @@ -667,11 +637,22 @@ where &mut self, msg: &VoteRequest, ) -> Result> { + // Once voted, we won't accept data from older proposers; flush + // everything we've already received so that new proposer starts + // streaming at end of our WAL, without overlap. Currently we truncate + // WAL at streaming point, so this avoids truncating already committed + // WAL. + // + // TODO: it would be smoother to not truncate committed piece at + // handle_elected instead. Currently not a big deal, as proposer is the + // only source of WAL; with peer2peer recovery it would be more + // important. + self.wal_store.flush_wal()?; // initialize with refusal let mut resp = VoteResponse { term: self.state.acceptor_state.term, vote_given: false as u64, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), truncate_lsn: self.state.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, @@ -703,11 +684,11 @@ where fn append_response(&self) -> AppendResponse { let ar = AppendResponse { term: self.state.acceptor_state.term, - flush_lsn: self.wal_store.flush_lsn(), + flush_lsn: self.flush_lsn(), commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - zenith_feedback: ZenithFeedback::empty(), + pageserver_feedback: ReplicationFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar @@ -731,24 +712,35 @@ where { let mut state = self.state.clone(); - // Remeber point where WAL begins globally, if not yet. + // Here we learn initial LSN for the first time, set fields + // interested in that. + if state.timeline_start_lsn == Lsn(0) { + // Remember point where WAL begins globally. 
state.timeline_start_lsn = msg.timeline_start_lsn; info!( "setting timeline_start_lsn to {:?}", state.timeline_start_lsn ); - } - // Remember point where WAL begins locally, if not yet. (I doubt the - // second condition is ever possible) - if state.local_start_lsn == Lsn(0) || state.local_start_lsn >= msg.start_streaming_at { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); } + // Initializing commit_lsn before acking first flushed record is + // important to let find_end_of_wal skip the whole in the beginning + // of the first segment. + // + // NB: on new clusters, this happens at the same time as + // timeline_start_lsn initialization, it is taken outside to provide + // upgrade. + self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); + self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + + // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. + self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.state.persist(&state)?; + self.persist_control_file(state)?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -758,11 +750,10 @@ where /// Advance commit_lsn taking into account what we have locally pub fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.wal_store.flush_lsn()); + let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); assert!(commit_lsn >= self.inmem.commit_lsn); self.inmem.commit_lsn = commit_lsn; - self.metrics.commit_lsn.set(self.inmem.commit_lsn.0 as f64); // If new commit_lsn reached epoch switch, force sync of control // file: walproposer in sync mode is very interested when this @@ -772,25 +763,16 @@ where // that we receive new epoch_start_lsn, and we still need to sync // control file in this case. 
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn { - self.persist_control_file()?; - } - - // We got our first commit_lsn, which means we should sync - // everything to disk, to initialize the state. - if self.state.commit_lsn == Lsn(0) && commit_lsn > Lsn(0) { - self.wal_store.flush_wal()?; - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } - /// Persist in-memory state to the disk. - fn persist_control_file(&mut self) -> Result<()> { - let mut state = self.state.clone(); - + /// Persist in-memory state to the disk, taking other data from state. + fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { state.commit_lsn = self.inmem.commit_lsn; - state.s3_wal_lsn = self.inmem.s3_wal_lsn; + state.backup_lsn = self.inmem.backup_lsn; state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; state.remote_consistent_lsn = self.inmem.remote_consistent_lsn; state.proposer_uuid = self.inmem.proposer_uuid; @@ -823,13 +805,6 @@ where // do the job if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - - // If this was the first record we ever received, initialize - // commit_lsn to help find_end_of_wal skip the hole in the - // beginning. 
- if self.global_commit_lsn == Lsn(0) { - self.global_commit_lsn = msg.h.begin_lsn; - } } // flush wal to the disk, if required @@ -852,7 +827,7 @@ where if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) < self.inmem.peer_horizon_lsn { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } trace!( @@ -898,11 +873,11 @@ where self.update_commit_lsn()?; } } - if let Some(s3_wal_lsn) = sk_info.s3_wal_lsn { - let new_s3_wal_lsn = max(s3_wal_lsn, self.inmem.s3_wal_lsn); + if let Some(backup_lsn) = sk_info.backup_lsn { + let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); sync_control_file |= - self.state.s3_wal_lsn + (self.state.server.wal_seg_size as u64) < new_s3_wal_lsn; - self.inmem.s3_wal_lsn = new_s3_wal_lsn; + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; } if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { let new_remote_consistent_lsn = @@ -920,7 +895,7 @@ where self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; } if sync_control_file { - self.persist_control_file()?; + self.persist_control_file(self.state.clone())?; } Ok(()) } @@ -930,29 +905,23 @@ where /// offloading. /// While it is safe to use inmem values for determining horizon, /// we use persistent to make possible normal states less surprising. 
- pub fn get_horizon_segno(&self, s3_offload_enabled: bool) -> XLogSegNo { - let s3_offload_horizon = if s3_offload_enabled { - self.state.s3_wal_lsn - } else { - Lsn(u64::MAX) - }; - let horizon_lsn = min( - min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ), - s3_offload_horizon, + pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { + let mut horizon_lsn = min( + self.state.remote_consistent_lsn, + self.state.peer_horizon_lsn, ); + if wal_backup_enabled { + horizon_lsn = min(horizon_lsn, self.state.backup_lsn); + } horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) } } #[cfg(test)] mod tests { - use std::ops::Deref; - use super::*; use crate::wal_storage::Storage; + use std::ops::Deref; // fake storage for tests struct InMemoryState { @@ -1013,7 +982,8 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -1028,7 +998,8 @@ mod tests { let storage = InMemoryState { persisted_state: state, }; - sk = SafeKeeper::new(ztli, storage, sk.wal_store, ZNodeId(0)).unwrap(); + + sk = SafeKeeper::new(ztli, storage, sk.wal_store, NodeId(0)).unwrap(); // and ensure voting second time for 1 is not ok vote_resp = sk.process_msg(&vote_request); @@ -1045,7 +1016,8 @@ mod tests { }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, ZNodeId(0)).unwrap(); + + let mut sk = SafeKeeper::new(ztli, storage, wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index d52dd6ea57..7439d6a8f6 100644 --- 
a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -8,31 +8,30 @@ use anyhow::{bail, Context, Result}; use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, MAX_SEND_SIZE}; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; use bytes::Bytes; use serde::{Deserialize, Serialize}; use std::cmp::min; use std::net::Shutdown; use std::sync::Arc; -use std::thread::sleep; use std::time::Duration; use std::{str, thread}; -use tokio::sync::mpsc::UnboundedSender; + +use tokio::sync::watch::Receiver; +use tokio::time::timeout; use tracing::*; use utils::{ bin_ser::BeSer, lsn::Lsn, postgres_backend::PostgresBackend, - pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody, ZenithFeedback}, + pq_proto::{BeMessage, FeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}, sock_split::ReadStream, - zid::{ZTenantId, ZTimelineId}, }; // See: https://www.postgresql.org/docs/13/protocol-replication.html const HOT_STANDBY_FEEDBACK_TAG_BYTE: u8 = b'h'; const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // zenith extension of replication protocol -const ZENITH_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; +const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; type FullTransactionId = u64; @@ -83,40 +82,6 @@ impl Drop for ReplicationConnGuard { } } -// XXX: Naming is a bit messy here. -// This ReplicationStreamGuard lives as long as ReplicationConn -// and current ReplicationConnGuard is tied to the background thread -// that receives feedback. -struct ReplicationStreamGuard { - tx: UnboundedSender, - tenant_id: ZTenantId, - timeline_id: ZTimelineId, - pageserver_connstr: String, -} - -impl Drop for ReplicationStreamGuard { - fn drop(&mut self) { - // the connection with pageserver is lost, - // resume callback subscription - debug!( - "Connection to pageserver is gone. Resume callmemaybe subsciption if necessary. 
tenantid {} timelineid {}", - self.tenant_id, self.timeline_id, - ); - - let subscription_key = SubscriptionStateKey::new( - self.tenant_id, - self.timeline_id, - self.pageserver_connstr.to_owned(), - ); - - self.tx - .send(CallmeEvent::Resume(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Resume request to callmemaybe thread {}", e); - }); - } -} - impl ReplicationConn { /// Create a new `ReplicationConn` pub fn new(pgb: &mut PostgresBackend) -> Self { @@ -159,15 +124,15 @@ impl ReplicationConn { warn!("unexpected StandbyReply. Read-only postgres replicas are not supported in safekeepers yet."); // timeline.update_replica_state(replica_id, Some(state)); } - Some(ZENITH_STATUS_UPDATE_TAG_BYTE) => { + Some(NEON_STATUS_UPDATE_TAG_BYTE) => { // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&m[9..]); - let reply = ZenithFeedback::parse(buf); + let reply = ReplicationFeedback::parse(buf); - trace!("ZenithFeedback is {:?}", reply); - // Only pageserver sends ZenithFeedback, so set the flag. + trace!("ReplicationFeedback is {:?}", reply); + // Only pageserver sends ReplicationFeedback, so set the flag. // This replica is the source of information to resend to compute. - state.zenith_feedback = Some(reply); + state.pageserver_feedback = Some(reply); timeline.update_replica_state(replica_id, state); } @@ -199,9 +164,8 @@ impl ReplicationConn { spg: &mut SafekeeperPostgresHandler, pgb: &mut PostgresBackend, mut start_pos: Lsn, - pageserver_connstr: Option, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap(), pageserver_connstr = %pageserver_connstr.as_deref().unwrap_or_default()).entered(); + let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered(); // spawn the background thread which receives HotStandbyFeedback messages. 
let bg_timeline = Arc::clone(spg.timeline.get()); @@ -229,130 +193,142 @@ impl ReplicationConn { } })?; - let mut wal_seg_size: usize; - loop { - wal_seg_size = spg.timeline.get().get_state().1.server.wal_seg_size as usize; - if wal_seg_size == 0 { - error!("Cannot start replication before connecting to wal_proposer"); - sleep(Duration::from_secs(1)); + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + let (_, persisted_state) = spg.timeline.get().get_state(); + // add persisted_state.timeline_start_lsn == Lsn(0) check + if persisted_state.server.wal_seg_size == 0 { + bail!("Cannot start replication before connecting to walproposer"); + } + + let wal_end = spg.timeline.get().get_end_of_wal(); + // Walproposer gets special handling: safekeeper must give proposer all + // local WAL till the end, whether committed or not (walproposer will + // hang otherwise). That's because walproposer runs the consensus and + // synchronizes safekeepers on the most advanced one. + // + // There is a small risk of this WAL getting concurrently garbaged if + // another compute rises which collects majority and starts fixing log + // on this safekeeper itself. That's ok as (old) proposer will never be + // able to commit such WAL. 
+ let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) + { + Some(wal_end) } else { + None + }; + + info!("Start replication from {:?} till {:?}", start_pos, stop_pos); + + // switch to copy + pgb.write_message(&BeMessage::CopyBothResponse)?; + + let mut end_pos = Lsn(0); + + let mut wal_reader = WalReader::new( + spg.conf.timeline_dir(&spg.timeline.get().zttid), + &persisted_state, + start_pos, + spg.conf.wal_backup_enabled, + )?; + + // buffer for wal sending, limited by MAX_SEND_SIZE + let mut send_buf = vec![0u8; MAX_SEND_SIZE]; + + // watcher for commit_lsn updates + let mut commit_lsn_watch_rx = spg.timeline.get().get_commit_lsn_watch_rx(); + + loop { + if let Some(stop_pos) = stop_pos { + if start_pos >= stop_pos { + break; /* recovery finished */ + } + end_pos = stop_pos; + } else { + /* Wait until we have some data to stream */ + let lsn = wait_for_lsn(&mut commit_lsn_watch_rx, start_pos).await?; + + if let Some(lsn) = lsn { + end_pos = lsn; + } else { + // TODO: also check once in a while whether we are walsender + // to right pageserver. + if spg.timeline.get().stop_walsender(replica_id)? { + // Shut down, timeline is suspended. + // TODO create proper error type for this + bail!("end streaming to {:?}", spg.appname); + } + + // timeout expired: request pageserver status + pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { + sent_ptr: end_pos.0, + timestamp: get_current_timestamp(), + request_reply: true, + })) + .context("Failed to send KeepAlive message")?; + continue; + } + } + + let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; + let send_size = min(send_size, send_buf.len()); + + let send_buf = &mut send_buf[..send_size]; + + // read wal into buffer + let send_size = wal_reader.read(send_buf).await?; + let send_buf = &send_buf[..send_size]; + + // Write some data to the network socket. 
+ pgb.write_message(&BeMessage::XLogData(XLogDataBody { + wal_start: start_pos.0, + wal_end: end_pos.0, + timestamp: get_current_timestamp(), + data: send_buf, + })) + .context("Failed to send XLogData")?; + + start_pos += send_size as u64; + trace!("sent WAL up to {}", start_pos); + } + + Ok(()) + }) + } +} + +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + +// Wait until we have commit_lsn > lsn or timeout expires. Returns latest commit_lsn. +async fn wait_for_lsn(rx: &mut Receiver, lsn: Lsn) -> Result> { + let commit_lsn: Lsn = *rx.borrow(); + if commit_lsn > lsn { + return Ok(Some(commit_lsn)); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + let mut commit_lsn; + loop { + rx.changed().await?; + commit_lsn = *rx.borrow(); + if commit_lsn > lsn { break; } } - let wal_end = spg.timeline.get().get_end_of_wal(); - // Walproposer gets special handling: safekeeper must give proposer all - // local WAL till the end, whether committed or not (walproposer will - // hang otherwise). That's because walproposer runs the consensus and - // synchronizes safekeepers on the most advanced one. - // - // There is a small risk of this WAL getting concurrently garbaged if - // another compute rises which collects majority and starts fixing log - // on this safekeeper itself. That's ok as (old) proposer will never be - // able to commit such WAL. - let stop_pos: Option = if spg.appname == Some("wal_proposer_recovery".to_string()) { - Some(wal_end) - } else { - None - }; - info!("Start replication from {:?} till {:?}", start_pos, stop_pos); - // Don't spam pageserver with callmemaybe queries - // when replication connection with pageserver is already established. 
- let _guard = { - if spg.appname == Some("wal_proposer_recovery".to_string()) { - None - } else { - let pageserver_connstr = pageserver_connstr.expect("there should be a pageserver connection string since this is not a wal_proposer_recovery"); - let zttid = spg.timeline.get().zttid; - let tx_clone = spg.timeline.get().callmemaybe_tx.clone(); - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.clone(), - ); - tx_clone - .send(CallmeEvent::Pause(subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); + Ok(commit_lsn) + }) + .await; - // create a guard to subscribe callback again, when this connection will exit - Some(ReplicationStreamGuard { - tx: tx_clone, - tenant_id: zttid.tenant_id, - timeline_id: zttid.timeline_id, - pageserver_connstr, - }) - } - }; - - // switch to copy - pgb.write_message(&BeMessage::CopyBothResponse)?; - - let mut end_pos = Lsn(0); - - let mut wal_reader = WalReader::new( - spg.conf.timeline_dir(&spg.timeline.get().zttid), - wal_seg_size, - start_pos, - ); - - // buffer for wal sending, limited by MAX_SEND_SIZE - let mut send_buf = vec![0u8; MAX_SEND_SIZE]; - - loop { - if let Some(stop_pos) = stop_pos { - if start_pos >= stop_pos { - break; /* recovery finished */ - } - end_pos = stop_pos; - } else { - /* Wait until we have some data to stream */ - let lsn = spg.timeline.get().wait_for_lsn(start_pos); - - if let Some(lsn) = lsn { - end_pos = lsn; - } else { - // TODO: also check once in a while whether we are walsender - // to right pageserver. - if spg.timeline.get().check_deactivate(replica_id)? { - // Shut down, timeline is suspended. 
- // TODO create proper error type for this - bail!("end streaming to {:?}", spg.appname); - } - - // timeout expired: request pageserver status - pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive { - sent_ptr: end_pos.0, - timestamp: get_current_timestamp(), - request_reply: true, - })) - .context("Failed to send KeepAlive message")?; - continue; - } - } - - let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize; - let send_size = min(send_size, send_buf.len()); - - let send_buf = &mut send_buf[..send_size]; - - // read wal into buffer - let send_size = wal_reader.read(send_buf)?; - let send_buf = &send_buf[..send_size]; - - // Write some data to the network socket. - pgb.write_message(&BeMessage::XLogData(XLogDataBody { - wal_start: start_pos.0, - wal_end: end_pos.0, - timestamp: get_current_timestamp(), - data: send_buf, - })) - .context("Failed to send XLogData")?; - - start_pos += send_size as u64; - trace!("sent WAL up to {}", start_pos); - } - Ok(()) + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2bb7771aac..bed6e447d7 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,41 +3,40 @@ use anyhow::{bail, Context, Result}; -use etcd_broker::SkTimelineInfo; +use etcd_broker::subscription_value::SkTimelineInfo; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::XLogSegNo; use serde::Serialize; +use tokio::sync::watch; use std::cmp::{max, min}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::{self}; -use std::sync::{Arc, Condvar, Mutex, MutexGuard}; -use std::time::Duration; -use tokio::sync::mpsc::UnboundedSender; +use std::sync::{Arc, Mutex, MutexGuard}; + +use tokio::sync::mpsc::Sender; use tracing::*; use utils::{ lsn::Lsn, - pq_proto::ZenithFeedback, - zid::{ZNodeId, ZTenantId, 
ZTenantTimelineId}, + pq_proto::ReplicationFeedback, + zid::{NodeId, ZTenantId, ZTenantTimelineId}, }; -use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; - use crate::control_file; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, }; use crate::send_wal::HotStandbyFeedback; + +use crate::metrics::FullTimelineInfo; use crate::wal_storage; use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - /// Replica status update + hot standby feedback #[derive(Debug, Clone, Copy)] pub struct ReplicaState { @@ -47,8 +46,8 @@ pub struct ReplicaState { pub remote_consistent_lsn: Lsn, /// combined hot standby feedback from all replicas pub hs_feedback: HotStandbyFeedback, - /// Zenith specific feedback received from pageserver, if any - pub zenith_feedback: Option, + /// Replication specific feedback received from pageserver, if any + pub pageserver_feedback: Option, } impl Default for ReplicaState { @@ -67,7 +66,7 @@ impl ReplicaState { xmin: u64::MAX, catalog_xmin: u64::MAX, }, - zenith_feedback: None, + pageserver_feedback: None, } } } @@ -76,21 +75,21 @@ impl ReplicaState { struct SharedState { /// Safekeeper object sk: SafeKeeper, - /// For receiving-sending wal cooperation - /// quorum commit LSN we've notified walsenders about - notified_commit_lsn: Lsn, /// State of replicas replicas: Vec>, - /// Inactive clusters shouldn't occupy any resources, so timeline is - /// activated whenever there is a compute connection or pageserver is not - /// caughtup (it must have latest WAL for new compute start) and suspended - /// otherwise. + /// True when WAL backup launcher oversees the timeline, making sure WAL is + /// offloaded, allows to bother launcher less. 
+ wal_backup_active: bool, + /// True whenever there is at least some pending activity on timeline: live + /// compute connection, pageserver is not caughtup (it must have latest WAL + /// for new compute start) or WAL backuping is not finished. Practically it + /// means safekeepers broadcast info to peers about the timeline, old WAL is + /// trimmed. /// /// TODO: it might be better to remove tli completely from GlobalTimelines /// when tli is inactive instead of having this flag. active: bool, num_computes: u32, - pageserver_connstr: Option, last_removed_segno: XLogSegNo, } @@ -99,20 +98,20 @@ impl SharedState { fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result { let state = SafeKeeperState::new(zttid, peer_ids); let control_store = control_file::FileStorage::create_new(zttid, conf, state)?; + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); let sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?; Ok(Self { - notified_commit_lsn: Lsn(0), sk, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, - pageserver_connstr: None, last_removed_segno: 0, }) } @@ -126,93 +125,73 @@ impl SharedState { info!("timeline {} restored", zttid.timeline_id); Ok(Self { - notified_commit_lsn: Lsn(0), sk: SafeKeeper::new(zttid.timeline_id, control_store, wal_store, conf.my_id)?, replicas: Vec::new(), + wal_backup_active: false, active: false, num_computes: 0, - pageserver_connstr: None, last_removed_segno: 0, }) } - - /// Activate the timeline: start/change walsender (via callmemaybe). - fn activate( - &mut self, - zttid: &ZTenantTimelineId, - pageserver_connstr: Option<&String>, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - // unsub old sub. 
xxx: callmemaybe is going out - let old_subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(old_subscription_key)) - .unwrap_or_else(|e| { - error!("failed to send Pause request to callmemaybe thread {}", e); - }); - } - if let Some(pageserver_connstr) = pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - // xx: sending to channel under lock is not very cool, but - // shouldn't be a problem here. If it is, we can grab a counter - // here and later augment channel messages with it. - callmemaybe_tx - .send(CallmeEvent::Subscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Subscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is subscribed to callmemaybe to {}", - zttid.timeline_id, pageserver_connstr - ); - } - self.pageserver_connstr = pageserver_connstr.map(|c| c.to_owned()); - self.active = true; - Ok(()) + fn is_active(&self) -> bool { + self.is_wal_backup_required() + // FIXME: add tracking of relevant pageservers and check them here individually, + // otherwise migration won't work (we suspend too early). + || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn } - /// Deactivate the timeline: stop callmemaybe. 
- fn deactivate( - &mut self, - zttid: &ZTenantTimelineId, - callmemaybe_tx: &UnboundedSender, - ) -> Result<()> { - if self.active { - if let Some(ref pageserver_connstr) = self.pageserver_connstr { - let subscription_key = SubscriptionStateKey::new( - zttid.tenant_id, - zttid.timeline_id, - pageserver_connstr.to_owned(), - ); - callmemaybe_tx - .send(CallmeEvent::Unsubscribe(subscription_key)) - .unwrap_or_else(|e| { - error!( - "failed to send Unsubscribe request to callmemaybe thread {}", - e - ); - }); - info!( - "timeline {} is unsubscribed from callmemaybe to {}", - zttid.timeline_id, - self.pageserver_connstr.as_ref().unwrap() - ); - } - self.active = false; + /// Mark timeline active/inactive and return whether s3 offloading requires + /// start/stop action. + fn update_status(&mut self, ttid: ZTenantTimelineId) -> bool { + let is_active = self.is_active(); + if self.active != is_active { + info!("timeline {} active={} now", ttid, is_active); } - Ok(()) + self.active = is_active; + self.is_wal_backup_action_pending() + } + + /// Should we run s3 offloading in current state? + fn is_wal_backup_required(&self) -> bool { + let seg_size = self.get_wal_seg_size(); + self.num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (self.sk.inmem.commit_lsn.segment_number(seg_size) > + self.sk.inmem.backup_lsn.segment_number(seg_size)) + } + + /// Is current state of s3 offloading is not what it ought to be? 
+ fn is_wal_backup_action_pending(&self) -> bool { + let res = self.wal_backup_active != self.is_wal_backup_required(); + if res { + let action_pending = if self.is_wal_backup_required() { + "start" + } else { + "stop" + }; + trace!( + "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", + self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn + ); + } + res + } + + /// Returns whether s3 offloading is required and sets current status as + /// matching. + fn wal_backup_attend(&mut self) -> bool { + self.wal_backup_active = self.is_wal_backup_required(); + self.wal_backup_active + } + + // Can this safekeeper offload to s3? Recently joined safekeepers might not + // have necessary WAL. + fn can_wal_backup(&self) -> bool { + self.sk.state.local_start_lsn <= self.sk.inmem.backup_lsn + } + + fn get_wal_seg_size(&self) -> usize { + self.sk.state.server.wal_seg_size as usize } /// Get combined state of all alive replicas @@ -235,25 +214,25 @@ impl SharedState { // we need to know which pageserver compute node considers to be main. // See https://github.com/zenithdb/zenith/issues/1171 // - if let Some(zenith_feedback) = state.zenith_feedback { - if let Some(acc_feedback) = acc.zenith_feedback { - if acc_feedback.ps_writelsn < zenith_feedback.ps_writelsn { + if let Some(pageserver_feedback) = state.pageserver_feedback { + if let Some(acc_feedback) = acc.pageserver_feedback { + if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { warn!("More than one pageserver is streaming WAL for the timeline. 
Feedback resolving is not fully supported yet."); - acc.zenith_feedback = Some(zenith_feedback); + acc.pageserver_feedback = Some(pageserver_feedback); } } else { - acc.zenith_feedback = Some(zenith_feedback); + acc.pageserver_feedback = Some(pageserver_feedback); } // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. // See https://github.com/zenithdb/zenith/issues/1171 - acc.last_received_lsn = Lsn::from(zenith_feedback.ps_writelsn); + acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, // safekeeper is free to delete it, so choose max of all pageservers. acc.remote_consistent_lsn = max( - Lsn::from(zenith_feedback.ps_applylsn), + Lsn::from(pageserver_feedback.ps_applylsn), acc.remote_consistent_lsn, ); } @@ -277,37 +256,47 @@ impl SharedState { /// Database instance (tenant) pub struct Timeline { pub zttid: ZTenantTimelineId, - pub callmemaybe_tx: UnboundedSender, + /// Sending here asks for wal backup launcher attention (start/stop + /// offloading). Sending zttid instead of concrete command allows to do + /// sending without timeline lock. + wal_backup_launcher_tx: Sender, + commit_lsn_watch_tx: watch::Sender, + /// For breeding receivers. 
+ commit_lsn_watch_rx: watch::Receiver, mutex: Mutex, - /// conditional variable used to notify wal senders - cond: Condvar, } impl Timeline { fn new( zttid: ZTenantTimelineId, - callmemaybe_tx: UnboundedSender, + wal_backup_launcher_tx: Sender, shared_state: SharedState, ) -> Timeline { + let (commit_lsn_watch_tx, commit_lsn_watch_rx) = + watch::channel(shared_state.sk.inmem.commit_lsn); Timeline { zttid, - callmemaybe_tx, + wal_backup_launcher_tx, + commit_lsn_watch_tx, + commit_lsn_watch_rx, mutex: Mutex::new(shared_state), - cond: Condvar::new(), } } /// Register compute connection, starting timeline-related activity if it is /// not running yet. /// Can fail only if channel to a static thread got closed, which is not normal at all. - pub fn on_compute_connect(&self, pageserver_connstr: Option<&String>) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes += 1; - // FIXME: currently we always adopt latest pageserver connstr, but we - // should have kind of generations assigned by compute to distinguish - // the latest one or even pass it through consensus to reliably deliver - // to all safekeepers. - shared_state.activate(&self.zttid, pageserver_connstr, &self.callmemaybe_tx)?; + pub fn on_compute_connect(&self) -> Result<()> { + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes += 1; + is_wal_backup_action_pending = shared_state.update_status(self.zttid); + } + // Wake up wal backup launcher, if offloading not started yet. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; + } Ok(()) } @@ -315,49 +304,62 @@ impl Timeline { /// pageserver doesn't need catchup. /// Can fail only if channel to a static thread got closed, which is not normal at all. 
pub fn on_compute_disconnect(&self) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.num_computes -= 1; - // If there is no pageserver, can suspend right away; otherwise let - // walsender do that. - if shared_state.num_computes == 0 && shared_state.pageserver_connstr.is_none() { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let is_wal_backup_action_pending: bool; + { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.num_computes -= 1; + is_wal_backup_action_pending = shared_state.update_status(self.zttid); + } + // Wake up wal backup launcher, if it is time to stop the offloading. + if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.blocking_send(self.zttid)?; } Ok(()) } - /// Deactivate tenant if there is no computes and pageserver is caughtup, - /// assuming the pageserver status is in replica_id. - /// Returns true if deactivated. - pub fn check_deactivate(&self, replica_id: usize) -> Result { + /// Whether we still need this walsender running? + /// TODO: check this pageserver is actually interested in this timeline. + pub fn stop_walsender(&self, replica_id: usize) -> Result { let mut shared_state = self.mutex.lock().unwrap(); - if !shared_state.active { - // already suspended - return Ok(true); - } if shared_state.num_computes == 0 { let replica_state = shared_state.replicas[replica_id].unwrap(); - let deactivate = shared_state.notified_commit_lsn == Lsn(0) || // no data at all yet - (replica_state.last_received_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. - replica_state.last_received_lsn >= shared_state.sk.inmem.commit_lsn); - if deactivate { - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet + (replica_state.remote_consistent_lsn != Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. 
+ replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn); + if stop { + shared_state.update_status(self.zttid); return Ok(true); } } Ok(false) } + /// Returns whether s3 offloading is required and sets current status as + /// matching it. + pub fn wal_backup_attend(&self) -> bool { + let mut shared_state = self.mutex.lock().unwrap(); + shared_state.wal_backup_attend() + } + + // Can this safekeeper offload to s3? Recently joined safekeepers might not + // have necessary WAL. + pub fn can_wal_backup(&self) -> bool { + self.mutex.lock().unwrap().can_wal_backup() + } + /// Deactivates the timeline, assuming it is being deleted. /// Returns whether the timeline was already active. /// - /// The callmemaybe thread is stopped by the deactivation message. We assume all other threads - /// will stop by themselves eventually (possibly with errors, but no panics). There should be no - /// compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but + /// We assume all threads will stop by themselves eventually (possibly with errors, but no panics). + /// There should be no compute threads (as we're deleting the timeline), actually. Some WAL may be left unsent, but /// we're deleting the timeline anyway. - pub fn deactivate_for_delete(&self) -> Result { - let mut shared_state = self.mutex.lock().unwrap(); - let was_active = shared_state.active; - shared_state.deactivate(&self.zttid, &self.callmemaybe_tx)?; + pub async fn deactivate_for_delete(&self) -> Result { + let was_active: bool; + { + let shared_state = self.mutex.lock().unwrap(); + was_active = shared_state.active; + } + self.wal_backup_launcher_tx.send(self.zttid).await?; Ok(was_active) } @@ -366,36 +368,35 @@ impl Timeline { shared_state.active } - /// Timed wait for an LSN to be committed. - /// - /// Returns the last committed LSN, which will be at least - /// as high as the LSN waited for, or None if timeout expired. 
- /// - pub fn wait_for_lsn(&self, lsn: Lsn) -> Option { - let mut shared_state = self.mutex.lock().unwrap(); - loop { - let commit_lsn = shared_state.notified_commit_lsn; - // This must be `>`, not `>=`. - if commit_lsn > lsn { - return Some(commit_lsn); - } - let result = self - .cond - .wait_timeout(shared_state, POLL_STATE_TIMEOUT) - .unwrap(); - if result.1.timed_out() { - return None; - } - shared_state = result.0 + /// Returns full timeline info, required for the metrics. + /// If the timeline is not active, returns None instead. + pub fn info_for_metrics(&self) -> Option { + let shared_state = self.mutex.lock().unwrap(); + if !shared_state.active { + return None; } + + Some(FullTimelineInfo { + zttid: self.zttid, + replicas: shared_state + .replicas + .iter() + .filter_map(|r| r.as_ref()) + .copied() + .collect(), + wal_backup_active: shared_state.wal_backup_active, + timeline_is_active: shared_state.active, + num_computes: shared_state.num_computes, + last_removed_segno: shared_state.last_removed_segno, + epoch_start_lsn: shared_state.sk.epoch_start_lsn, + mem_state: shared_state.sk.inmem.clone(), + persisted_state: shared_state.sk.state.clone(), + flush_lsn: shared_state.sk.wal_store.flush_lsn(), + }) } - // Notify caught-up WAL senders about new WAL data received - fn notify_wal_senders(&self, shared_state: &mut MutexGuard) { - if shared_state.notified_commit_lsn < shared_state.sk.inmem.commit_lsn { - shared_state.notified_commit_lsn = shared_state.sk.inmem.commit_lsn; - self.cond.notify_all(); - } + pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { + self.commit_lsn_watch_rx.clone() } /// Pass arrived message to the safekeeper. 
@@ -404,6 +405,7 @@ impl Timeline { msg: &ProposerAcceptorMessage, ) -> Result> { let mut rmsg: Option; + let commit_lsn: Lsn; { let mut shared_state = self.mutex.lock().unwrap(); rmsg = shared_state.sk.process_msg(msg)?; @@ -412,46 +414,79 @@ impl Timeline { if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { let state = shared_state.get_replicas_state(); resp.hs_feedback = state.hs_feedback; - if let Some(zenith_feedback) = state.zenith_feedback { - resp.zenith_feedback = zenith_feedback; + if let Some(pageserver_feedback) = state.pageserver_feedback { + resp.pageserver_feedback = pageserver_feedback; } } - // Ping wal sender that new data might be available. - self.notify_wal_senders(&mut shared_state); + commit_lsn = shared_state.sk.inmem.commit_lsn; } + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } + pub fn get_wal_seg_size(&self) -> usize { + self.mutex.lock().unwrap().get_wal_seg_size() + } + pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { let shared_state = self.mutex.lock().unwrap(); (shared_state.sk.inmem.clone(), shared_state.sk.state.clone()) } + pub fn get_wal_backup_lsn(&self) -> Lsn { + self.mutex.lock().unwrap().sk.inmem.backup_lsn + } + + pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) { + self.mutex.lock().unwrap().sk.inmem.backup_lsn = backup_lsn; + // we should check whether to shut down offloader, but this will be done + // soon by peer communication anyway. + } + /// Prepare public safekeeper info for reporting. 
- pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result { + pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { let shared_state = self.mutex.lock().unwrap(); - Ok(SkTimelineInfo { + SkTimelineInfo { last_log_term: Some(shared_state.sk.get_epoch()), flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()), // note: this value is not flushed to control file yet and can be lost commit_lsn: Some(shared_state.sk.inmem.commit_lsn), - s3_wal_lsn: Some(shared_state.sk.inmem.s3_wal_lsn), // TODO: rework feedbacks to avoid max here remote_consistent_lsn: Some(max( shared_state.get_replicas_state().remote_consistent_lsn, shared_state.sk.inmem.remote_consistent_lsn, )), peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn), - safekeeper_connection_string: Some(conf.listen_pg_addr.clone()), - }) + safekeeper_connstr: Some(conf.listen_pg_addr.clone()), + backup_lsn: Some(shared_state.sk.inmem.backup_lsn), + } } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&self, sk_info: &SkTimelineInfo, _sk_id: ZNodeId) -> Result<()> { - let mut shared_state = self.mutex.lock().unwrap(); - shared_state.sk.record_safekeeper_info(sk_info)?; - self.notify_wal_senders(&mut shared_state); + pub async fn record_safekeeper_info( + &self, + sk_info: &SkTimelineInfo, + _sk_id: NodeId, + ) -> Result<()> { + let is_wal_backup_action_pending: bool; + let commit_lsn: Lsn; + { + let mut shared_state = self.mutex.lock().unwrap(); + // WAL seg size not initialized yet (no message from compute ever + // received), can't do much without it. + if shared_state.get_wal_seg_size() == 0 { + return Ok(()); + } + shared_state.sk.record_safekeeper_info(sk_info)?; + is_wal_backup_action_pending = shared_state.update_status(self.zttid); + commit_lsn = shared_state.sk.inmem.commit_lsn; + } + self.commit_lsn_watch_tx.send(commit_lsn)?; + // Wake up wal backup launcher, if it is time to stop the offloading. 
+ if is_wal_backup_action_pending { + self.wal_backup_launcher_tx.send(self.zttid).await?; + } Ok(()) } @@ -476,16 +511,16 @@ impl Timeline { shared_state.sk.wal_store.flush_lsn() } - pub fn remove_old_wal(&self, s3_offload_enabled: bool) -> Result<()> { + pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { let horizon_segno: XLogSegNo; let remover: Box Result<(), anyhow::Error>>; { let shared_state = self.mutex.lock().unwrap(); // WAL seg size not initialized yet, no WAL exists. - if shared_state.sk.state.server.wal_seg_size == 0 { + if shared_state.get_wal_seg_size() == 0 { return Ok(()); } - horizon_segno = shared_state.sk.get_horizon_segno(s3_offload_enabled); + horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); remover = shared_state.sk.wal_store.remove_up_to(); if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { return Ok(()); @@ -521,13 +556,13 @@ impl TimelineTools for Option> { struct GlobalTimelinesState { timelines: HashMap>, - callmemaybe_tx: Option>, + wal_backup_launcher_tx: Option>, } lazy_static! 
{ static ref TIMELINES_STATE: Mutex = Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - callmemaybe_tx: None + wal_backup_launcher_tx: None, }); } @@ -541,17 +576,17 @@ pub struct TimelineDeleteForceResult { pub struct GlobalTimelines; impl GlobalTimelines { - pub fn set_callmemaybe_tx(callmemaybe_tx: UnboundedSender) { + pub fn init(wal_backup_launcher_tx: Sender) { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.callmemaybe_tx.is_none()); - state.callmemaybe_tx = Some(callmemaybe_tx); + assert!(state.wal_backup_launcher_tx.is_none()); + state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); } fn create_internal( mut state: MutexGuard, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { match state.timelines.get(&zttid) { Some(_) => bail!("timeline {} already exists", zttid), @@ -559,12 +594,13 @@ impl GlobalTimelines { // TODO: check directory existence let dir = conf.timeline_dir(&zttid); fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) .context("failed to create shared state")?; let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -576,7 +612,7 @@ impl GlobalTimelines { pub fn create( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - peer_ids: Vec, + peer_ids: Vec, ) -> Result> { let state = TIMELINES_STATE.lock().unwrap(); GlobalTimelines::create_internal(state, conf, zttid, peer_ids) @@ -589,13 +625,14 @@ impl GlobalTimelines { zttid: ZTenantTimelineId, create: bool, ) -> Result> { + let _enter = info_span!("", timeline = %zttid.tenant_id).entered(); + let mut state = TIMELINES_STATE.lock().unwrap(); match state.timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - let shared_state = - SharedState::restore(conf, &zttid).context("failed to 
restore shared state"); + let shared_state = SharedState::restore(conf, &zttid); let shared_state = match shared_state { Ok(shared_state) => shared_state, @@ -616,7 +653,7 @@ impl GlobalTimelines { let new_tli = Arc::new(Timeline::new( zttid, - state.callmemaybe_tx.as_ref().unwrap().clone(), + state.wal_backup_launcher_tx.as_ref().unwrap().clone(), shared_state, )); state.timelines.insert(zttid, Arc::clone(&new_tli)); @@ -625,8 +662,14 @@ impl GlobalTimelines { } } + /// Get loaded timeline, if it exists. + pub fn get_loaded(zttid: ZTenantTimelineId) -> Option> { + let state = TIMELINES_STATE.lock().unwrap(); + state.timelines.get(&zttid).map(Arc::clone) + } + /// Get ZTenantTimelineIDs of all active timelines. - pub fn get_active_timelines() -> Vec { + pub fn get_active_timelines() -> HashSet { let state = TIMELINES_STATE.lock().unwrap(); state .timelines @@ -636,6 +679,16 @@ impl GlobalTimelines { .collect() } + /// Return FullTimelineInfo for all active timelines. + pub fn active_timelines_metrics() -> Vec { + let state = TIMELINES_STATE.lock().unwrap(); + state + .timelines + .iter() + .filter_map(|(_, tli)| tli.info_for_metrics()) + .collect() + } + fn delete_force_internal( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, @@ -665,22 +718,23 @@ impl GlobalTimelines { /// b) an HTTP GET request about the timeline is made and it's able to restore the current state, or /// c) an HTTP POST request for timeline creation is made after the timeline is already deleted. /// TODO: ensure all of the above never happens. 
- pub fn delete_force( + pub async fn delete_force( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, ) -> Result { info!("deleting timeline {}", zttid); - let was_active = match TIMELINES_STATE.lock().unwrap().timelines.remove(zttid) { - None => false, - Some(tli) => tli.deactivate_for_delete()?, - }; + let timeline = TIMELINES_STATE.lock().unwrap().timelines.remove(zttid); + let mut was_active = false; + if let Some(tli) = timeline { + was_active = tli.deactivate_for_delete().await?; + } GlobalTimelines::delete_force_internal(conf, zttid, was_active) } /// Deactivates and deletes all timelines for the tenant, see `delete()`. /// Returns map of all timelines which the tenant had, `true` if a timeline was active. /// There may be a race if new timelines are created simultaneously. - pub fn delete_force_all_for_tenant( + pub async fn delete_force_all_for_tenant( conf: &SafeKeeperConf, tenant_id: &ZTenantId, ) -> Result> { @@ -691,14 +745,15 @@ impl GlobalTimelines { let timelines = &mut TIMELINES_STATE.lock().unwrap().timelines; for (&zttid, tli) in timelines.iter() { if zttid.tenant_id == *tenant_id { - to_delete.insert(zttid, tli.deactivate_for_delete()?); + to_delete.insert(zttid, tli.clone()); } } // TODO: test that the correct subset of timelines is removed. It's complicated because they are implicitly created currently. 
timelines.retain(|zttid, _| !to_delete.contains_key(zttid)); } let mut deleted = HashMap::new(); - for (zttid, was_active) in to_delete { + for (zttid, timeline) in to_delete { + let was_active = timeline.deactivate_for_delete().await?; deleted.insert( zttid, GlobalTimelines::delete_force_internal(conf, &zttid, was_active)?, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs new file mode 100644 index 0000000000..b2f9d8d4f3 --- /dev/null +++ b/safekeeper/src/wal_backup.rs @@ -0,0 +1,490 @@ +use anyhow::{Context, Result}; +use etcd_broker::subscription_key::{ + NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind, +}; +use tokio::task::JoinHandle; + +use std::cmp::min; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Duration; + +use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI}; +use remote_storage::{GenericRemoteStorage, RemoteStorage}; +use tokio::fs::File; +use tokio::runtime::Builder; + +use tokio::select; +use tokio::sync::mpsc::{self, Receiver, Sender}; +use tokio::sync::watch; +use tokio::time::sleep; +use tracing::*; + +use utils::{lsn::Lsn, zid::ZTenantTimelineId}; + +use crate::broker::{Election, ElectionLeader}; +use crate::timeline::{GlobalTimelines, Timeline}; +use crate::{broker, SafeKeeperConf}; + +use once_cell::sync::OnceCell; + +const BROKER_CONNECTION_RETRY_DELAY_MS: u64 = 1000; + +const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10; +const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; + +pub fn wal_backup_launcher_thread_main( + conf: SafeKeeperConf, + wal_backup_launcher_rx: Receiver, +) { + let rt = Builder::new_multi_thread() + .worker_threads(conf.backup_runtime_threads) + .enable_all() + .build() + .expect("failed to create wal backup runtime"); + + rt.block_on(async { + wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await; + }); +} + +/// Check whether wal backup is required for 
timeline. If yes, mark that launcher is +/// aware of current status and return the timeline. +fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option> { + GlobalTimelines::get_loaded(zttid).filter(|t| t.wal_backup_attend()) +} + +struct WalBackupTaskHandle { + shutdown_tx: Sender<()>, + handle: JoinHandle<()>, +} + +struct WalBackupTimelineEntry { + timeline: Arc, + handle: Option, +} + +/// Start per timeline task, if it makes sense for this safekeeper to offload. +fn consider_start_task( + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + task: &mut WalBackupTimelineEntry, +) { + if !task.timeline.can_wal_backup() { + return; + } + info!("starting WAL backup task for {}", zttid); + + // TODO: decide who should offload right here by simply checking current + // state instead of running elections in offloading task. + let election_name = SubscriptionKey { + cluster_prefix: conf.broker_etcd_prefix.clone(), + kind: SubscriptionKind::Operation( + zttid, + NodeKind::Safekeeper, + OperationKind::Safekeeper(SkOperationKind::WalBackup), + ), + } + .watch_key(); + let my_candidate_name = broker::get_candiate_name(conf.my_id); + let election = broker::Election::new( + election_name, + my_candidate_name, + conf.broker_endpoints.clone(), + ); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + let timeline_dir = conf.timeline_dir(&zttid); + + let handle = tokio::spawn( + backup_task_main(zttid, timeline_dir, shutdown_rx, election) + .instrument(info_span!("WAL backup task", zttid = %zttid)), + ); + + task.handle = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); +} + +const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; + +/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup +/// tasks. Having this in separate task simplifies locking, allows to reap +/// panics and separate elections from offloading itself. 
+async fn wal_backup_launcher_main_loop( + conf: SafeKeeperConf, + mut wal_backup_launcher_rx: Receiver, +) { + info!( + "WAL backup launcher started, remote config {:?}", + conf.remote_storage + ); + + let conf_ = conf.clone(); + REMOTE_STORAGE.get_or_init(|| { + conf_.remote_storage.as_ref().map(|c| { + GenericRemoteStorage::new(conf_.workdir, c).expect("failed to create remote storage") + }) + }); + + // Presense in this map means launcher is aware s3 offloading is needed for + // the timeline, but task is started only if it makes sense for to offload + // from this safekeeper. + let mut tasks: HashMap = HashMap::new(); + + let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); + loop { + tokio::select! { + zttid = wal_backup_launcher_rx.recv() => { + // channel is never expected to get closed + let zttid = zttid.unwrap(); + if conf.remote_storage.is_none() || !conf.wal_backup_enabled { + continue; /* just drain the channel and do nothing */ + } + let timeline = is_wal_backup_required(zttid); + // do we need to do anything at all? + if timeline.is_some() != tasks.contains_key(&zttid) { + if let Some(timeline) = timeline { + // need to start the task + let entry = tasks.entry(zttid).or_insert(WalBackupTimelineEntry { + timeline, + handle: None, + }); + consider_start_task(&conf, zttid, entry); + } else { + // need to stop the task + info!("stopping WAL backup task for {}", zttid); + + let entry = tasks.remove(&zttid).unwrap(); + if let Some(wb_handle) = entry.handle { + // Tell the task to shutdown. Error means task exited earlier, that's ok. + let _ = wb_handle.shutdown_tx.send(()).await; + // Await the task itself. TODO: restart panicked tasks earlier. + if let Err(e) = wb_handle.handle.await { + warn!("WAL backup task for {} panicked: {}", zttid, e); + } + } + } + } + } + // Start known tasks, if needed and possible. 
+ _ = ticker.tick() => { + for (zttid, entry) in tasks.iter_mut().filter(|(_, entry)| entry.handle.is_none()) { + consider_start_task(&conf, *zttid, entry); + } + } + } + } +} + +struct WalBackupTask { + timeline: Arc, + timeline_dir: PathBuf, + wal_seg_size: usize, + commit_lsn_watch_rx: watch::Receiver, + leader: Option, + election: Election, +} + +/// Offload single timeline. +async fn backup_task_main( + zttid: ZTenantTimelineId, + timeline_dir: PathBuf, + mut shutdown_rx: Receiver<()>, + election: Election, +) { + info!("started"); + let timeline: Arc = if let Some(tli) = GlobalTimelines::get_loaded(zttid) { + tli + } else { + /* Timeline could get deleted while task was starting, just exit then. */ + info!("no timeline, exiting"); + return; + }; + + let mut wb = WalBackupTask { + wal_seg_size: timeline.get_wal_seg_size(), + commit_lsn_watch_rx: timeline.get_commit_lsn_watch_rx(), + timeline, + timeline_dir, + leader: None, + election, + }; + + // task is spinned up only when wal_seg_size already initialized + assert!(wb.wal_seg_size > 0); + + let mut canceled = false; + select! { + _ = wb.run() => {} + _ = shutdown_rx.recv() => { + canceled = true; + } + } + if let Some(l) = wb.leader { + l.give_up().await; + } + info!("task {}", if canceled { "canceled" } else { "terminated" }); +} + +impl WalBackupTask { + async fn run(&mut self) { + let mut backup_lsn = Lsn(0); + + // election loop + loop { + let mut retry_attempt = 0u32; + + info!("acquiring leadership"); + if let Err(e) = broker::get_leader(&self.election, &mut self.leader).await { + error!("error during leader election {:?}", e); + sleep(Duration::from_millis(BROKER_CONNECTION_RETRY_DELAY_MS)).await; + continue; + } + info!("acquired leadership"); + + // offload loop + loop { + if retry_attempt == 0 { + // wait for new WAL to arrive + if let Err(e) = self.commit_lsn_watch_rx.changed().await { + // should never happen, as we hold Arc to timeline. 
+ error!("commit_lsn watch shut down: {:?}", e); + return; + } + } else { + // or just sleep if we errored previously + let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS; + if let Some(backoff_delay) = + UPLOAD_FAILURE_RETRY_MIN_MS.checked_shl(retry_attempt) + { + retry_delay = min(retry_delay, backoff_delay); + } + sleep(Duration::from_millis(retry_delay)).await; + } + + let commit_lsn = *self.commit_lsn_watch_rx.borrow(); + + // Note that backup_lsn can be higher than commit_lsn if we + // don't have much local WAL and others already uploaded + // segments we don't even have. + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + continue; /* nothing to do, common case as we wake up on every commit_lsn bump */ + } + // Perhaps peers advanced the position, check shmem value. + backup_lsn = self.timeline.get_wal_backup_lsn(); + if backup_lsn.segment_number(self.wal_seg_size) + >= commit_lsn.segment_number(self.wal_seg_size) + { + continue; + } + + if let Some(l) = self.leader.as_mut() { + // Optimization idea for later: + // Avoid checking election leader every time by returning current lease grant expiration time + // Re-check leadership only after expiration time, + // such approach would reduce overhead on write-intensive workloads + + match l + .check_am_i( + self.election.election_name.clone(), + self.election.candidate_name.clone(), + ) + .await + { + Ok(leader) => { + if !leader { + info!("lost leadership"); + break; + } + } + Err(e) => { + warn!("error validating leader, {:?}", e); + break; + } + } + } + + match backup_lsn_range( + backup_lsn, + commit_lsn, + self.wal_seg_size, + &self.timeline_dir, + ) + .await + { + Ok(backup_lsn_result) => { + backup_lsn = backup_lsn_result; + self.timeline.set_wal_backup_lsn(backup_lsn_result); + retry_attempt = 0; + } + Err(e) => { + error!( + "failed while offloading range {}-{}: {:?}", + backup_lsn, commit_lsn, e + ); + + retry_attempt = min(retry_attempt + 1, 
u32::MAX); + } + } + } + } + } +} + +pub async fn backup_lsn_range( + start_lsn: Lsn, + end_lsn: Lsn, + wal_seg_size: usize, + timeline_dir: &Path, +) -> Result { + let mut res = start_lsn; + let segments = get_segments(start_lsn, end_lsn, wal_seg_size); + for s in &segments { + backup_single_segment(s, timeline_dir) + .await + .with_context(|| format!("offloading segno {}", s.seg_no))?; + + res = s.end_lsn; + } + info!( + "offloaded segnos {:?} up to {}, previous backup_lsn {}", + segments.iter().map(|&s| s.seg_no).collect::>(), + end_lsn, + start_lsn, + ); + Ok(res) +} + +async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> { + let segment_file_name = seg.file_path(timeline_dir)?; + + backup_object(&segment_file_name, seg.size()).await?; + debug!("Backup of {} done", segment_file_name.display()); + + Ok(()) +} + +#[derive(Debug, Copy, Clone)] +pub struct Segment { + seg_no: XLogSegNo, + start_lsn: Lsn, + end_lsn: Lsn, +} + +impl Segment { + pub fn new(seg_no: u64, start_lsn: Lsn, end_lsn: Lsn) -> Self { + Self { + seg_no, + start_lsn, + end_lsn, + } + } + + pub fn object_name(self) -> String { + XLogFileName(PG_TLI, self.seg_no, self.size()) + } + + pub fn file_path(self, timeline_dir: &Path) -> Result { + Ok(timeline_dir.join(self.object_name())) + } + + pub fn size(self) -> usize { + (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize + } +} + +fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { + let first_seg = start.segment_number(seg_size); + let last_seg = end.segment_number(seg_size); + + let res: Vec = (first_seg..last_seg) + .map(|s| { + let start_lsn = XLogSegNoOffsetToRecPtr(s, 0, seg_size); + let end_lsn = XLogSegNoOffsetToRecPtr(s + 1, 0, seg_size); + Segment::new(s, Lsn::from(start_lsn), Lsn::from(end_lsn)) + }) + .collect(); + res +} + +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); + +async fn backup_object(source_file: &Path, size: usize) -> Result<()> { + let storage = 
REMOTE_STORAGE.get().expect("failed to get remote storage"); + + let file = File::open(&source_file).await?; + + // Storage is initialized by launcher at this point. + match storage.as_ref().unwrap() { + GenericRemoteStorage::Local(local_storage) => { + let destination = local_storage.remote_object_id(source_file)?; + + debug!( + "local upload about to start from {} to {}", + source_file.display(), + destination.display() + ); + local_storage.upload(file, size, &destination, None).await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(source_file)?; + + debug!( + "S3 upload about to start from {} to {:?}", + source_file.display(), + s3key + ); + s3_storage.upload(file, size, &s3key, None).await + } + }?; + + Ok(()) +} + +pub async fn read_object( + file_path: PathBuf, + offset: u64, +) -> anyhow::Result>> { + let download = match REMOTE_STORAGE + .get() + .context("Failed to get remote storage")? + .as_ref() + .context("No remote storage configured")? + { + GenericRemoteStorage::Local(local_storage) => { + let source = local_storage.remote_object_id(&file_path)?; + + info!( + "local download about to start from {} at offset {}", + source.display(), + offset + ); + local_storage + .download_byte_range(&source, offset, None) + .await + } + GenericRemoteStorage::S3(s3_storage) => { + let s3key = s3_storage.remote_object_id(&file_path)?; + + info!( + "S3 download about to start from {:?} at offset {}", + s3key, offset + ); + s3_storage.download_byte_range(&s3key, offset, None).await + } + } + .with_context(|| { + format!( + "Failed to open WAL segment download stream for local storage path {}", + file_path.display() + ) + })?; + + Ok(download.download_stream) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 503bd7c543..9b23e2189c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,7 +8,9 @@ //! 
Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{anyhow, bail, Context, Result}; -use std::io::{Read, Seek, SeekFrom}; +use std::io::{self, Seek, SeekFrom}; +use std::pin::Pin; +use tokio::io::AsyncRead; use lazy_static::lazy_static; use postgres_ffi::xlog_utils::{ @@ -26,25 +28,19 @@ use utils::{lsn::Lsn, zid::ZTenantTimelineId}; use crate::safekeeper::SafeKeeperState; +use crate::wal_backup::read_object; use crate::SafeKeeperConf; use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ}; use postgres_ffi::waldecoder::WalStreamDecoder; -use metrics::{ - register_gauge_vec, register_histogram_vec, Gauge, GaugeVec, Histogram, HistogramVec, - DISK_WRITE_SECONDS_BUCKETS, -}; +use metrics::{register_histogram_vec, Histogram, HistogramVec, DISK_WRITE_SECONDS_BUCKETS}; + +use tokio::io::{AsyncReadExt, AsyncSeekExt}; lazy_static! { // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`). // i64 is faster than f64, so update to u64 when available. - static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!( - "safekeeper_flush_lsn", - "Current flush_lsn, grouped by timeline", - &["tenant_id", "timeline_id"] - ) - .expect("Failed to register safekeeper_flush_lsn gauge vec"); static ref WRITE_WAL_BYTES: HistogramVec = register_histogram_vec!( "safekeeper_write_wal_bytes", "Bytes written to WAL in a single request, grouped by timeline", @@ -69,7 +65,6 @@ lazy_static! 
{ } struct WalStorageMetrics { - flush_lsn: Gauge, write_wal_bytes: Histogram, write_wal_seconds: Histogram, flush_wal_seconds: Histogram, @@ -80,7 +75,6 @@ impl WalStorageMetrics { let tenant_id = zttid.tenant_id.to_string(); let timeline_id = zttid.timeline_id.to_string(); Self { - flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&tenant_id, &timeline_id]), write_wal_bytes: WRITE_WAL_BYTES.with_label_values(&[&tenant_id, &timeline_id]), write_wal_seconds: WRITE_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), flush_wal_seconds: FLUSH_WAL_SECONDS.with_label_values(&[&tenant_id, &timeline_id]), @@ -126,7 +120,7 @@ pub struct PhysicalStorage { conf: SafeKeeperConf, // fields below are filled upon initialization - /// None if unitialized, Some(usize) if storage is initialized. + /// None if uninitialized, Some(usize) if storage is initialized. wal_seg_size: Option, /// Written to disk, but possibly still in the cache and not fully persisted. @@ -171,7 +165,6 @@ impl PhysicalStorage { /// Wrapper for flush_lsn updates that also updates metrics. fn update_flush_lsn(&mut self) { self.flush_record_lsn = self.write_record_lsn; - self.metrics.flush_lsn.set(self.flush_record_lsn.0 as f64); } /// Call fdatasync if config requires so. 
@@ -456,7 +449,7 @@ impl Storage for PhysicalStorage { segno += 1; let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, wal_seg_size)?; - // TODO: better use fs::try_exists which is currenty avaialble only in nightly build + // TODO: better use fs::try_exists which is currently available only in nightly build if wal_file_path.exists() { fs::remove_file(&wal_file_path)?; } else if wal_file_partial_path.exists() { @@ -516,69 +509,122 @@ pub struct WalReader { timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn, - file: Option, + wal_segment: Option>>, + + enable_remote_read: bool, + // S3 will be used to read WAL if LSN is not available locally + local_start_lsn: Lsn, } impl WalReader { - pub fn new(timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn) -> Self { - Self { - timeline_dir, - wal_seg_size, - pos, - file: None, + pub fn new( + timeline_dir: PathBuf, + state: &SafeKeeperState, + start_pos: Lsn, + enable_remote_read: bool, + ) -> Result { + if start_pos < state.timeline_start_lsn { + bail!( + "Requested streaming from {}, which is before the start of the timeline {}", + start_pos, + state.timeline_start_lsn + ); } + + // TODO: add state.timeline_start_lsn == Lsn(0) check + if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { + bail!("state uninitialized, no data to read"); + } + + Ok(Self { + timeline_dir, + wal_seg_size: state.server.wal_seg_size as usize, + pos: start_pos, + wal_segment: None, + enable_remote_read, + local_start_lsn: state.local_start_lsn, + }) } - pub fn read(&mut self, buf: &mut [u8]) -> Result { - // Take the `File` from `wal_file`, or open a new file. - let mut file = match self.file.take() { - Some(file) => file, - None => { - // Open a new file. - let segno = self.pos.segment_number(self.wal_seg_size); - let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); - Self::open_wal_file(&wal_file_path)? 
- } + pub async fn read(&mut self, buf: &mut [u8]) -> Result { + let mut wal_segment = match self.wal_segment.take() { + Some(reader) => reader, + None => self.open_segment().await?, }; - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; - // How much to read and send in message? We cannot cross the WAL file // boundary, and we don't want send more than provided buffer. + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; let send_size = min(buf.len(), self.wal_seg_size - xlogoff); // Read some data from the file. let buf = &mut buf[0..send_size]; - file.seek(SeekFrom::Start(xlogoff as u64)) - .and_then(|_| file.read_exact(buf)) - .context("Failed to read data from WAL file")?; - + let send_size = wal_segment.read_exact(buf).await?; self.pos += send_size as u64; - // Decide whether to reuse this file. If we don't set wal_file here - // a new file will be opened next time. + // Decide whether to reuse this file. If we don't set wal_segment here + // a new reader will be opened next time. if self.pos.segment_offset(self.wal_seg_size) != 0 { - self.file = Some(file); + self.wal_segment = Some(wal_segment); } Ok(send_size) } + /// Open WAL segment at the current position of the reader. 
+ async fn open_segment(&self) -> Result>> { + let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let segno = self.pos.segment_number(self.wal_seg_size); + let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); + let wal_file_path = self.timeline_dir.join(wal_file_name); + + // Try to open local file, if we may have WAL locally + if self.pos >= self.local_start_lsn { + let res = Self::open_wal_file(&wal_file_path).await; + match res { + Ok(mut file) => { + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + return Ok(Box::pin(file)); + } + Err(e) => { + let is_not_found = e.chain().any(|e| { + if let Some(e) = e.downcast_ref::() { + e.kind() == io::ErrorKind::NotFound + } else { + false + } + }); + if !is_not_found { + return Err(e); + } + // NotFound is expected, fall through to remote read + } + }; + } + + // Try to open remote file, if remote reads are enabled + if self.enable_remote_read { + return read_object(wal_file_path, xlogoff as u64).await; + } + + bail!("WAL segment is not found") + } + /// Helper function for opening a wal file. - fn open_wal_file(wal_file_path: &Path) -> Result { + async fn open_wal_file(wal_file_path: &Path) -> Result { // First try to open the .partial file. let mut partial_path = wal_file_path.to_owned(); partial_path.set_extension("partial"); - if let Ok(opened_file) = File::open(&partial_path) { + if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await { return Ok(opened_file); } // If that failed, try it without the .partial extension. 
- File::open(&wal_file_path) + tokio::fs::File::open(&wal_file_path) + .await .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) .map_err(|e| { - error!("{}", e); + warn!("{}", e); e }) } diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py index a15d04e056..23fa4b76a3 100755 --- a/scripts/generate_perf_report_page.py +++ b/scripts/generate_perf_report_page.py @@ -26,6 +26,7 @@ KEY_EXCLUDE_FIELDS = frozenset({ }) NEGATIVE_COLOR = 'negative' POSITIVE_COLOR = 'positive' +EPS = 1e-6 @dataclass @@ -120,7 +121,8 @@ def get_row_values(columns: List[str], run_result: SuitRun, # this might happen when new metric is added and there is no value for it in previous run # let this be here, TODO add proper handling when this actually happens raise ValueError(f'{column} not found in previous result') - ratio = float(value) / float(prev_value['value']) - 1 + # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero + ratio = (float(value) + EPS) / (float(prev_value['value']) + EPS) - 1 ratio_display, color = format_ratio(ratio, current_value['report']) row_values.append(RowValue(value, color, ratio_display)) return row_values diff --git a/setup.cfg b/setup.cfg index b3b39fadd7..d1a2f9a359 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,10 @@ strict = true # There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577 ignore_missing_imports = true +[mypy-pg8000.*] +# Used only in testing clients +ignore_missing_imports = true + [mypy-cached_property.*] ignore_missing_imports = true diff --git a/test_runner/README.md b/test_runner/README.md index 059bbb83cc..4b54c45175 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -1,14 +1,14 @@ -## Zenith test runner +## Neon test runner This directory contains integration tests. 
Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) -- Zenith and Postgres binaries +- Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - Tests can be run from the git tree; or see the environment variables below to run from other directories. -- The zenith git repo, including the postgres submodule +- The neon git repo, including the postgres submodule (for some tests, e.g. `pg_regress`) - Some tests (involving storage nodes coordination) require etcd installed. Follow [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. @@ -45,14 +45,14 @@ If you want to run all tests that have the string "bench" in their names: Useful environment variables: -`ZENITH_BIN`: The directory where zenith binaries can be found. +`NEON_BIN`: The directory where neon binaries can be found. `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. `TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `ZENITH_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`--pageserver-config-override=${value}` parameter values when zenith cli is invoked -`RUST_LOG`: logging configuration to pass into Zenith CLI +`--pageserver-config-override=${value}` parameter values when neon_local cli is invoked +`RUST_LOG`: logging configuration to pass into Neon CLI Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` @@ -65,32 +65,32 @@ Exit after the first test failure: ### Writing a test -Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment +Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and compute Postgres nodes. 
The connections between them can be configured to use JWT authentication tokens, and some other configuration options can be tweaked too. -The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env` +The easiest way to get access to a Neon Environment is by using the `neon_simple_env` fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes or make other destructive changes in that environment. Also don't assume that there are no tenants or branches or data in the cluster. For convenience, there is a branch called `empty`, though. The convention is to create a test-specific branch of that and load any test data there, instead of the 'main' branch. -For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env` +For more complicated cases, you can build a custom Neon Environment, with the `neon_env` fixture: ```python -def test_foobar(zenith_env_builder: ZenithEnvBuilder): +def test_foobar(neon_env_builder: NeonEnvBuilder): # Prescribe the environment. # We want to have 3 safekeeper nodes, and use JWT authentication in the # connections to the page server - zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.set_pageserver_auth(True) + neon_env_builder.num_safekeepers = 3 + neon_env_builder.set_pageserver_auth(True) # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() # Run the test ... 
diff --git a/test_runner/batch_others/test_ancestor_branch.py b/test_runner/batch_others/test_ancestor_branch.py index 5dbd6d2e26..3e7ba22184 100644 --- a/test_runner/batch_others/test_ancestor_branch.py +++ b/test_runner/batch_others/test_ancestor_branch.py @@ -1,20 +1,17 @@ -from contextlib import closing - -import psycopg2.extras import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverApiException +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException # # Create ancestor branches off the main branch. # -def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Override defaults, 1M gc_horizon and 4M checkpoint_distance. # Extend compaction_period and gc_period to disable background compaction and gc. - tenant, _ = env.zenith_cli.create_tenant( + tenant, _ = env.neon_cli.create_tenant( conf={ 'gc_period': '10 m', 'gc_horizon': '1048576', @@ -24,13 +21,11 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): 'compaction_target_size': '4194304', }) - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - pscur.execute("failpoints flush-frozen=sleep(10000)") + env.pageserver.safe_psql("failpoints flush-frozen-before-sync=sleep(10000)") pg_branch0 = env.postgres.create_start('main', tenant_id=tenant) branch0_cur = pg_branch0.connect().cursor() - branch0_cur.execute("SHOW zenith.zenith_timeline") + branch0_cur.execute("SHOW neon.timeline_id") branch0_timeline = branch0_cur.fetchone()[0] log.info(f"b0 timeline {branch0_timeline}") @@ -50,12 +45,12 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 100k rows: {lsn_100}') # Create branch1. 
- env.zenith_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) + env.neon_cli.create_branch('branch1', 'main', tenant_id=tenant, ancestor_start_lsn=lsn_100) pg_branch1 = env.postgres.create_start('branch1', tenant_id=tenant) log.info("postgres is running on 'branch1' branch") branch1_cur = pg_branch1.connect().cursor() - branch1_cur.execute("SHOW zenith.zenith_timeline") + branch1_cur.execute("SHOW neon.timeline_id") branch1_timeline = branch1_cur.fetchone()[0] log.info(f"b1 timeline {branch1_timeline}") @@ -74,12 +69,12 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200k rows: {lsn_200}') # Create branch2. - env.zenith_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) + env.neon_cli.create_branch('branch2', 'branch1', tenant_id=tenant, ancestor_start_lsn=lsn_200) pg_branch2 = env.postgres.create_start('branch2', tenant_id=tenant) log.info("postgres is running on 'branch2' branch") branch2_cur = pg_branch2.connect().cursor() - branch2_cur.execute("SHOW zenith.zenith_timeline") + branch2_cur.execute("SHOW neon.timeline_id") branch2_timeline = branch2_cur.fetchone()[0] log.info(f"b2 timeline {branch2_timeline}") @@ -110,17 +105,3 @@ def test_ancestor_branch(zenith_env_builder: ZenithEnvBuilder): branch2_cur.execute('SELECT count(*) FROM foo') assert branch2_cur.fetchone() == (300000, ) - - -def test_ancestor_branch_detach(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - - parent_timeline_id = env.zenith_cli.create_branch("test_ancestor_branch_detach_parent", "empty") - - env.zenith_cli.create_branch("test_ancestor_branch_detach_branch1", - "test_ancestor_branch_detach_parent") - - ps_http = env.pageserver.http_client() - with pytest.raises(ZenithPageserverApiException, - match="Failed to detach inmem tenant timeline"): - ps_http.timeline_detach(env.initial_tenant, parent_timeline_id) diff --git a/test_runner/batch_others/test_auth.py 
b/test_runner/batch_others/test_auth.py index a8ad384f27..0fd0a5d7e3 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,14 +1,12 @@ from contextlib import closing -from typing import Iterator -from uuid import UUID, uuid4 -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException -from requests.exceptions import HTTPError +from uuid import uuid4 +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException import pytest -def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init_start() +def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() ps = env.pageserver @@ -25,8 +23,8 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) - new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth', - tenant_id=env.initial_tenant) + new_timeline_id = env.neon_cli.create_branch('test_pageserver_auth', + tenant_id=env.initial_tenant) # tenant can create branches tenant_http_client.timeline_create(tenant_id=env.initial_tenant, @@ -36,7 +34,7 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id - with pytest.raises(ZenithPageserverApiException, + with pytest.raises(NeonPageserverApiException, match='Forbidden: Tenant id mismatch. 
Permission denied'): invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, ancestor_timeline_id=new_timeline_id) @@ -46,21 +44,21 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): # fail to create tenant using tenant token with pytest.raises( - ZenithPageserverApiException, + NeonPageserverApiException, match='Forbidden: Attempt to access management api with tenant scope. Permission denied' ): tenant_http_client.tenant_create() @pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): - zenith_env_builder.pageserver_auth_enabled = True +def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): + neon_env_builder.auth_enabled = True if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() branch = f'test_compute_auth_to_pageserver{with_safekeepers}' - env.zenith_cli.create_branch(branch) + env.neon_cli.create_branch(branch) pg = env.postgres.create_start(branch) with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 81f45b749b..4ca03b102b 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -1,15 +1,13 @@ from contextlib import closing, contextmanager import psycopg2.extras import pytest -from fixtures.zenith_fixtures import PgProtocol, ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -import os import time -import asyncpg -from fixtures.zenith_fixtures import Postgres +from fixtures.neon_fixtures import Postgres import threading -pytest_plugins = ("fixtures.zenith_fixtures") +pytest_plugins = ("fixtures.neon_fixtures") @contextmanager @@ -26,7 +24,7 @@ def 
check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv log.info("checks started") with pg_cur(pg) as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))") res = cur.fetchone() @@ -93,10 +91,10 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv @pytest.mark.skip("See https://github.com/neondatabase/neon/issues/1587") -def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch('test_backpressure') + env.neon_cli.create_branch('test_backpressure') pg = env.postgres.create_start('test_backpressure', config_lines=['max_replication_write_lag=30MB']) diff --git a/test_runner/batch_others/test_basebackup_error.py b/test_runner/batch_others/test_basebackup_error.py new file mode 100644 index 0000000000..0909ed98a7 --- /dev/null +++ b/test_runner/batch_others/test_basebackup_error.py @@ -0,0 +1,18 @@ +import pytest + +from fixtures.neon_fixtures import NeonEnv + + +# +# Test error handling, if the 'basebackup' command fails in the middle +# of building the tar archive. 
+# +def test_basebackup_error(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_basebackup_error", "empty") + + # Introduce failpoint + env.pageserver.safe_psql(f"failpoints basebackup-before-control-file=return") + + with pytest.raises(Exception, match="basebackup-before-control-file"): + pg = env.postgres.create_start('test_basebackup_error') diff --git a/test_runner/batch_others/test_branch_and_gc.py b/test_runner/batch_others/test_branch_and_gc.py new file mode 100644 index 0000000000..a6210b9176 --- /dev/null +++ b/test_runner/batch_others/test_branch_and_gc.py @@ -0,0 +1,101 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import lsn_from_hex + + +# Test the GC implementation when running with branching. +# This test reproduces the issue https://github.com/neondatabase/neon/issues/707. +# +# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows: +# ... +# p -> has an image layer xx_p with p < lsn1 +# ... +# lsn1 +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# Consider running a GC iteration such that the GC horizon is between p and lsn1 +# ... +# p -> has an image layer xx_p with p < lsn1 +# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end') +# ... +# GC_h -> is a gc horizon such that p < GC_h < lsn1 +# ... +# lsn1 +# ... +# D_end -> is a delta layer D's end +# ... +# q -> has an image layer yy_q with lsn1 < q < lsn2 +# ... +# lsn2 +# +# As described in the issue #707, the image layer xx_p will be deleted as +# its range is below the GC horizon and there exists a newer image layer yy_q (q > p). +# However, removing xx_p will corrupt any delta layers that depend on xx_p that +# are not deleted by GC. For example, the delta layer D is corrupted in the +# above example because D depends on the image layer xx_p for value reconstruction. 
+# +# Because the delta layer D covering lsn1 is corrupted, creating a branch +# starting from lsn1 should return an error as follows: +# could not find data for key ... at LSN ..., for request at LSN ... +def test_branch_and_gc(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + 'gc_period': '10 m', + 'gc_horizon': f'{10 * 1024 ** 3}', + + # small checkpoint distance to create more delta layer files + 'checkpoint_distance': f'{1024 ** 2}', + + # set the target size to be large to allow the image layer to cover the whole key space + 'compaction_target_size': f'{1024 ** 3}', + + # tweak the default settings to allow quickly create image layers and L1 layers + 'compaction_period': '1 s', + 'compaction_threshold': '2', + 'image_creation_threshold': '1', + + # set PITR interval to be small, so we can do GC + 'pitr_interval': '1 s' + }) + + timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant) + pg_main = env.postgres.create_start('test_main', tenant_id=tenant) + + main_cur = pg_main.connect().cursor() + + main_cur.execute( + "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn1 = main_cur.fetchone()[0] + log.info(f'LSN1: {lsn1}') + + main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + main_cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn2 = main_cur.fetchone()[0] + log.info(f'LSN2: {lsn2}') + + # Set the GC horizon so that lsn1 is inside the horizon, which means + # we can create a new branch starting from lsn1. 
+ env.pageserver.safe_psql( + f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}''' + ) + + env.neon_cli.create_branch('test_branch', + 'test_main', + tenant_id=tenant, + ancestor_start_lsn=lsn1) + pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant) + + branch_cur = pg_branch.connect().cursor() + branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)') + + branch_cur.execute('SELECT count(*) FROM foo') + assert branch_cur.fetchone() == (200000, ) diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index fc84af5283..0274c6c1e0 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,37 +1,36 @@ -import subprocess from contextlib import closing import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # # Create a couple of branches off the main branch, at a historical point in time. # -def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): +def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Use safekeeper in this test to avoid a subtle race condition. # Without safekeeper, walreceiver reconnection can stuck # because of IO deadlock. 
# - # See https://github.com/zenithdb/zenith/issues/1068 - zenith_env_builder.num_safekeepers = 1 + # See https://github.com/neondatabase/neon/issues/1068 + neon_env_builder.num_safekeepers = 1 # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind') + env.neon_cli.create_branch('test_branch_behind') pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table, and insert the first 100 rows @@ -61,9 +60,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind_hundred', - 'test_branch_behind', - ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch('test_branch_behind_hundred', + 'test_branch_behind', + ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. 
main_cur.execute(''' @@ -78,9 +77,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 400100 rows: {lsn_c}') # Branch at the point where only 200100 rows were inserted - env.zenith_cli.create_branch('test_branch_behind_more', - 'test_branch_behind', - ancestor_start_lsn=lsn_b) + env.neon_cli.create_branch('test_branch_behind_more', + 'test_branch_behind', + ancestor_start_lsn=lsn_b) pg_hundred = env.postgres.create_start('test_branch_behind_hundred') pg_more = env.postgres.create_start('test_branch_behind_more') @@ -104,9 +103,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # Check bad lsn's for branching # branch at segment boundary - env.zenith_cli.create_branch('test_branch_segment_boundary', - 'test_branch_behind', - ancestor_start_lsn="0/3000000") + env.neon_cli.create_branch('test_branch_segment_boundary', + 'test_branch_behind', + ancestor_start_lsn="0/3000000") pg = env.postgres.create_start('test_branch_segment_boundary') cur = pg.connect().cursor() cur.execute('SELECT 1') @@ -114,13 +113,13 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # branch at pre-initdb lsn with pytest.raises(Exception, match="invalid branch start lsn"): - env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") + env.neon_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42") # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.zenith_cli.create_branch('test_branch_preinitdb', - 'test_branch_behind', - ancestor_start_lsn="0/42") + env.neon_cli.create_branch('test_branch_preinitdb', + 'test_branch_behind', + ancestor_start_lsn="0/42") # check that we cannot create branch based on garbage collected data with closing(env.pageserver.connect()) as psconn: @@ -132,9 +131,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="invalid branch start lsn"): # this gced_lsn is 
pretty random, so if gc is disabled this woudln't fail - env.zenith_cli.create_branch('test_branch_create_fail', - 'test_branch_behind', - ancestor_start_lsn=gced_lsn) + env.neon_cli.create_branch('test_branch_create_fail', + 'test_branch_behind', + ancestor_start_lsn=gced_lsn) # check that after gc everything is still there hundred_cur.execute('SELECT count(*) FROM foo') diff --git a/test_runner/batch_others/test_branching.py b/test_runner/batch_others/test_branching.py new file mode 100644 index 0000000000..c61bac7a58 --- /dev/null +++ b/test_runner/batch_others/test_branching.py @@ -0,0 +1,89 @@ +from typing import List +import threading +import pytest +from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres +import time +import random +from fixtures.log_helper import log +from performance.test_perf_pgbench import get_scales_matrix + + +# Test branch creation +# +# This test spawns pgbench in a thread in the background, and creates a branch while +# pgbench is running. Then it launches pgbench on the new branch, and creates another branch. +# Repeat `n_branches` times. +# +# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end +# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat', +# each branch is created from the root. 
+@pytest.mark.parametrize("n_branches", [10]) +@pytest.mark.parametrize("scale", get_scales_matrix(1)) +@pytest.mark.parametrize("ty", ["cascade", "flat"]) +def test_branching_with_pgbench(neon_simple_env: NeonEnv, + pg_bin: PgBin, + n_branches: int, + scale: int, + ty: str): + env = neon_simple_env + + # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test + tenant, _ = env.neon_cli.create_tenant( + conf={ + 'gc_period': '5 s', + 'gc_horizon': f'{1024 ** 2}', + 'checkpoint_distance': f'{1024 ** 2}', + 'compaction_target_size': f'{1024 ** 2}', + # set PITR interval to be small, so we can do GC + 'pitr_interval': '5 s' + }) + + def run_pgbench(pg: Postgres): + connstr = pg.connstr() + + log.info(f"Start a pgbench workload on pg {connstr}") + + pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr]) + pg_bin.run_capture(['pgbench', '-T15', connstr]) + + env.neon_cli.create_branch('b0', tenant_id=tenant) + pgs: List[Postgres] = [] + pgs.append(env.postgres.create_start('b0', tenant_id=tenant)) + + threads: List[threading.Thread] = [] + threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True)) + threads[-1].start() + + thread_limit = 4 + + for i in range(n_branches): + # random a delay between [0, 5] + delay = random.random() * 5 + time.sleep(delay) + log.info(f"Sleep {delay}s") + + # If the number of concurrent threads exceeds a threshold, + # wait for all the threads to finish before spawning a new one. + # Because tests defined in `batch_others` are run concurrently in CI, + # we want to avoid the situation that one test exhausts resources for other tests. 
+ if len(threads) >= thread_limit: + for thread in threads: + thread.join() + threads = [] + + if ty == "cascade": + env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant) + else: + env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant) + + pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant)) + + threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True)) + threads[-1].start() + + for thread in threads: + thread.join() + + for pg in pgs: + res = pg.safe_psql('SELECT count(*) from pgbench_accounts') + assert res[0] == (100000 * scale, ) diff --git a/test_runner/batch_others/test_broken_timeline.py b/test_runner/batch_others/test_broken_timeline.py index 17eadb33b4..675236fbd7 100644 --- a/test_runner/batch_others/test_broken_timeline.py +++ b/test_runner/batch_others/test_broken_timeline.py @@ -1,21 +1,22 @@ import pytest +import concurrent.futures from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv from fixtures.log_helper import log import os # Test restarting page server, while safekeeper and compute node keep # running. -def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): +def test_broken_timeline(neon_env_builder: NeonEnvBuilder): # One safekeeper is enough for this test. 
- zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() tenant_timelines = [] for n in range(4): - tenant_id_uuid, timeline_id_uuid = env.zenith_cli.create_tenant() + tenant_id_uuid, timeline_id_uuid = env.neon_cli.create_tenant() tenant_id = tenant_id_uuid.hex timeline_id = timeline_id_uuid.hex @@ -25,7 +26,7 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): cur.execute("CREATE TABLE t(key int primary key, value text)") cur.execute("INSERT INTO t SELECT generate_series(1,100), 'payload'") - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timeline_id") timeline_id = cur.fetchone()[0] pg.stop() tenant_timelines.append((tenant_id, timeline_id, pg)) @@ -78,3 +79,37 @@ def test_broken_timeline(zenith_env_builder: ZenithEnvBuilder): with pytest.raises(Exception, match="Cannot load local timeline") as err: pg.start() log.info(f'compute startup failed as expected: {err}') + + +def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant_id, _ = env.neon_cli.create_tenant() + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [ + executor.submit(env.neon_cli.create_timeline, + f"test-create-multiple-timelines-{i}", + tenant_id) for i in range(4) + ] + for future in futures: + future.result() + + +def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant_id, _ = env.neon_cli.create_tenant() + + # Introduce failpoint when creating a new timeline + env.pageserver.safe_psql(f"failpoints before-checkpoint-new-timeline=return") + with pytest.raises(Exception, match="before-checkpoint-new-timeline"): + _ = env.neon_cli.create_timeline("test_fix_broken_timelines", tenant_id) + + # Restart the page server + env.neon_cli.pageserver_stop(immediate=True) + env.neon_cli.pageserver_start() + + # Check that tenant with "broken" 
timeline is not loaded. + with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"): + env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py index b7eeedb23e..cbf55e9fc1 100644 --- a/test_runner/batch_others/test_clog_truncate.py +++ b/test_runner/batch_others/test_clog_truncate.py @@ -3,18 +3,18 @@ import os from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test compute node start after clog truncation # -def test_clog_truncate(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_clog_truncate', 'empty') +def test_clog_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_clog_truncate', 'empty') - # set agressive autovacuum to make sure that truncation will happen + # set aggressive autovacuum to make sure that truncation will happen config = [ 'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0', @@ -29,7 +29,7 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): log.info('postgres is running on test_clog_truncate branch') # Install extension containing function needed for test - pg.safe_psql('CREATE EXTENSION zenith_test_utils') + pg.safe_psql('CREATE EXTENSION neon_test_utils') # Consume many xids to advance clog with closing(pg.connect()) as conn: @@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv): # create new branch after clog truncation and start a compute node on it log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}') - env.zenith_cli.create_branch('test_clog_truncate_new', - 'test_clog_truncate', - ancestor_start_lsn=lsn_after_truncation) + env.neon_cli.create_branch('test_clog_truncate_new', + 'test_clog_truncate', + ancestor_start_lsn=lsn_after_truncation) pg2 = 
env.postgres.create_start('test_clog_truncate_new') log.info('postgres is running on test_clog_truncate_new branch') diff --git a/test_runner/batch_others/test_close_fds.py b/test_runner/batch_others/test_close_fds.py new file mode 100644 index 0000000000..9521b1bb4a --- /dev/null +++ b/test_runner/batch_others/test_close_fds.py @@ -0,0 +1,51 @@ +from contextlib import closing +import shutil +import time +import subprocess +import os.path + +from cached_property import threading +from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log + + +def lsof_path() -> str: + path_output = shutil.which("lsof") + if path_output is None: + raise RuntimeError('lsof not found in PATH') + else: + return path_output + + +# Makes sure that `pageserver.pid` is only held by `pageserve` command, not other commands. +# This is to test the changes in https://github.com/neondatabase/neon/pull/1834. +def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): + env = neon_simple_env + + def start_workload(): + env.neon_cli.create_branch("test_lsof_pageserver_pid") + pg = env.postgres.create_start("test_lsof_pageserver_pid") + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x") + cur.execute("update foo set x=x+1") + + workload_thread = threading.Thread(target=start_workload, args=(), daemon=True) + workload_thread.start() + + path = os.path.join(env.repo_dir, "pageserver.pid") + lsof = lsof_path() + while workload_thread.is_alive(): + res = subprocess.run([lsof, path], + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + # parse the `lsof` command's output to get only the list of commands + commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]] + if len(commands) > 0: + log.info(f"lsof commands: {commands}") + assert commands == ['pageserve'] + + time.sleep(1.0) diff --git 
a/test_runner/batch_others/test_config.py b/test_runner/batch_others/test_config.py index fd2b3b4e99..51deeebeed 100644 --- a/test_runner/batch_others/test_config.py +++ b/test_runner/batch_others/test_config.py @@ -1,15 +1,15 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test starting Postgres with custom options # -def test_config(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_config", "empty") +def test_config(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") # change config pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1']) diff --git a/test_runner/batch_others/test_crafted_wal_end.py b/test_runner/batch_others/test_crafted_wal_end.py new file mode 100644 index 0000000000..d1c46fc73a --- /dev/null +++ b/test_runner/batch_others/test_crafted_wal_end.py @@ -0,0 +1,63 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft +from fixtures.log_helper import log +import pytest + +# Restart nodes with WAL end having specially crafted shape, like last record +# crossing segment boundary, to test decoding issues. 
+ + +@pytest.mark.parametrize('wal_type', + [ + 'simple', + 'last_wal_record_xlog_switch', + 'last_wal_record_xlog_switch_ends_on_page_boundary', + 'last_wal_record_crossing_segment', + 'wal_record_crossing_segment_followed_by_small_one', + ]) +def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_crafted_wal_end') + + pg = env.postgres.create('test_crafted_wal_end') + wal_craft = WalCraft(env) + pg.config(wal_craft.postgres_config()) + pg.start() + res = pg.safe_psql_many(queries=[ + 'CREATE TABLE keys(key int primary key)', + 'INSERT INTO keys SELECT generate_series(1, 100)', + 'SELECT SUM(key) FROM keys' + ]) + assert res[-1][0] == (5050, ) + + wal_craft.in_existing(wal_type, pg.connstr()) + + log.info("Restarting all safekeepers and pageservers") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries") + res = pg.safe_psql_many(queries=[ + 'SELECT SUM(key) FROM keys', + 'INSERT INTO keys SELECT generate_series(101, 200)', + 'SELECT SUM(key) FROM keys', + ]) + assert res[0][0] == (5050, ) + assert res[-1][0] == (20100, ) + + log.info("Restarting all safekeepers and pageservers (again)") + env.pageserver.stop() + env.safekeepers[0].stop() + env.safekeepers[0].start() + env.pageserver.start() + + log.info("Trying more queries (again)") + res = pg.safe_psql_many(queries=[ + 'SELECT SUM(key) FROM keys', + 'INSERT INTO keys SELECT generate_series(201, 300)', + 'SELECT SUM(key) FROM keys', + ]) + assert res[0][0] == (20100, ) + assert res[-1][0] == (45150, ) diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 24898be70a..151ce997ee 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -2,16 +2,16 @@ import os import pathlib from 
contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log # # Test CREATE DATABASE when there have been relmapper changes # -def test_createdb(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_createdb', 'empty') +def test_createdb(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_createdb', 'empty') pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") @@ -27,7 +27,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createdb2') # Test that you can connect to the new database on both branches @@ -35,21 +35,26 @@ def test_createdb(zenith_simple_env: ZenithEnv): with closing(db.connect(dbname='foodb')) as conn: with conn.cursor() as cur: # Check database size in both branches - cur.execute( - 'select pg_size_pretty(pg_database_size(%s)), pg_size_pretty(sum(pg_relation_size(oid))) from pg_class where relisshared is false;', - ('foodb', )) + cur.execute(""" + select pg_size_pretty(pg_database_size('foodb')), + pg_size_pretty( + sum(pg_relation_size(oid, 'main')) + +sum(pg_relation_size(oid, 'vm')) + +sum(pg_relation_size(oid, 'fsm')) + ) FROM pg_class where relisshared is false + """) res = cur.fetchone() # check that dbsize equals sum of all relation sizes, excluding shared ones - # This is how we define dbsize in zenith for now + # This is how we define dbsize in neon for now assert res[0] == res[1] # # Test DROP DATABASE # -def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - 
env.zenith_cli.create_branch('test_dropdb', 'empty') +def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch('test_dropdb', 'empty') pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") @@ -73,14 +78,14 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - env.zenith_cli.create_branch('test_before_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_before_drop) + env.neon_cli.create_branch('test_before_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_before_drop) pg_before = env.postgres.create_start('test_before_dropdb') - env.zenith_cli.create_branch('test_after_dropdb', - 'test_dropdb', - ancestor_start_lsn=lsn_after_drop) + env.neon_cli.create_branch('test_after_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_after_drop) pg_after = env.postgres.create_start('test_after_dropdb') # Test that database exists on the branch before drop diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index f4bbbc8a7a..cbfe496e19 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -1,15 +1,15 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test CREATE USER to check shared catalog restore # -def test_createuser(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_createuser', 'empty') +def test_createuser(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_createuser', 'empty') pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") @@ -24,7 +24,7 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = 
cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user diff --git a/test_runner/batch_others/test_fullbackup.py b/test_runner/batch_others/test_fullbackup.py new file mode 100644 index 0000000000..cd6c40f56b --- /dev/null +++ b/test_runner/batch_others/test_fullbackup.py @@ -0,0 +1,68 @@ +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres +from fixtures.neon_fixtures import pg_distrib_dir +import os +from fixtures.utils import subprocess_capture + +num_rows = 1000 + + +# Ensure that regular postgres can start from fullbackup +def test_fullbackup(neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + port_distributor: PortDistributor): + + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_fullbackup') + pgmain = env.postgres.create_start('test_fullbackup') + log.info("postgres is running on 'test_fullbackup' branch") + + timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g''') + cur.execute("CHECKPOINT") + + cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn = cur.fetchone()[0] + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
+ psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + + # Get and unpack fullbackup from pageserver + restored_dir_path = env.repo_dir / "restored_datadir" + os.mkdir(restored_dir_path, 0o750) + query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + subprocess_capture(str(env.repo_dir), + ["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)]) + + # HACK + # fullbackup returns neon specific pg_control and first WAL segment + # use resetwal to overwrite it + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal') + cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] + pg_bin.run_capture(cmd, env=psql_env) + + # Restore from the backup and find the data we inserted + port = port_distributor.get_port() + with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: + # TODO make port an optional argument + vanilla_pg.configure([ + f"port={port}", + ]) + vanilla_pg.start() + num_rows_found = vanilla_pg.safe_psql('select count(*) from tbl;', user="cloud_admin")[0][0] + assert num_rows == num_rows_found diff --git a/test_runner/batch_others/test_gc_aggressive.py b/test_runner/batch_others/test_gc_aggressive.py index 519a6dda1c..bffeedfdc0 100644 --- a/test_runner/batch_others/test_gc_aggressive.py +++ b/test_runner/batch_others/test_gc_aggressive.py @@ -1,7 +1,7 @@ import asyncio import random -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log # Test configuration @@ -27,7 +27,7 @@ async def update_table(pg: Postgres): # Perform aggressive GC with 0 horizon -async def gc(env: ZenithEnv, timeline: str): +async def gc(env: NeonEnv, timeline: str): psconn = await env.pageserver.connect_async() while 
updates_performed < updates_to_perform: @@ -35,7 +35,7 @@ async def gc(env: ZenithEnv, timeline: str): # At the same time, run UPDATEs and GC -async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): +async def update_and_gc(env: NeonEnv, pg: Postgres, timeline: str): workers = [] for worker_id in range(num_connections): workers.append(asyncio.create_task(update_table(pg))) @@ -48,21 +48,21 @@ async def update_and_gc(env: ZenithEnv, pg: Postgres, timeline: str): # # Aggressively force GC, while running queries. # -# (repro for https://github.com/zenithdb/zenith/issues/1047) +# (repro for https://github.com/neondatabase/neon/issues/1047) # -def test_gc_aggressive(zenith_env_builder: ZenithEnvBuilder): +def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_gc_aggressive", "main") + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_aggressive", "main") pg = env.postgres.create_start('test_gc_aggressive') log.info('postgres is running on test_gc_aggressive branch') conn = pg.connect() cur = conn.cursor() - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # Create table, and insert the first 100 rows diff --git a/test_runner/batch_others/test_import.py b/test_runner/batch_others/test_import.py new file mode 100644 index 0000000000..617d4808cc --- /dev/null +++ b/test_runner/batch_others/test_import.py @@ -0,0 +1,198 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn +from fixtures.utils import lsn_from_hex, lsn_to_hex +from uuid import UUID, uuid4 +import tarfile +import os +import 
shutil +from pathlib import Path +import json +from fixtures.utils import subprocess_capture +from fixtures.log_helper import log +from contextlib import closing +from fixtures.neon_fixtures import pg_distrib_dir + + +@pytest.mark.timeout(600) +def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): + # Put data in vanilla pg + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql('''create table t as select 'long string to consume some space' || g + from generate_series(1,300000) g''') + assert vanilla_pg.safe_psql('select count(*) from t') == [(300000, )] + + # Take basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + vanilla_pg.safe_psql("CHECKPOINT") + pg_bin.run([ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ]) + + # Make corrupt base tar with missing pg_control + unpacked_base = os.path.join(basebackup_dir, "unpacked-base") + corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") + os.mkdir(unpacked_base, 0o750) + subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + os.remove(os.path.join(unpacked_base, "global/pg_control")) + subprocess_capture(str(test_output_dir), + ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), + cwd=unpacked_base) + + # Get start_lsn and end_lsn + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + node_name = "import_from_vanilla" + tenant = uuid4() + timeline = uuid4() + + # Set up pageserver for import + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + 
env.pageserver.http_client().tenant_create(tenant) + + def import_tar(base, wal): + env.neon_cli.raw_cli([ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline.hex, + "--node-name", + node_name, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + ]) + + # Importing corrupt backup fails + with pytest.raises(Exception): + import_tar(corrupt_base_tar, wal_tar) + + # Clean up + # TODO it should clean itself + client = env.pageserver.http_client() + client.timeline_delete(tenant, timeline) + + # Importing correct backup works + import_tar(base_tar, wal_tar) + + # Wait for data to land in s3 + wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(end_lsn)) + wait_for_upload(client, tenant, timeline, lsn_from_hex(end_lsn)) + + # Check it worked + pg = env.postgres.create_start(node_name, tenant_id=tenant) + assert pg.safe_psql('select count(*) from t') == [(300000, )] + + +@pytest.mark.timeout(600) +def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): + + num_rows = 3000 + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_import_from_pageserver') + pgmain = env.postgres.create_start('test_import_from_pageserver') + log.info("postgres is running on 'test_import_from_pageserver' branch") + + timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0] + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute(f'''CREATE TABLE tbl AS SELECT 'long string to consume some space' || g + from generate_series(1,{num_rows}) g''') + cur.execute("CHECKPOINT") + + cur.execute('SELECT pg_current_wal_insert_lsn()') + lsn = cur.fetchone()[0] + log.info(f"start_backup_lsn = {lsn}") + + # Set LD_LIBRARY_PATH 
in the env properly, otherwise we may use the wrong libpq. + # PgBin sets it automatically, but here we need to pipe psql output to the tar command. + psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')} + + # Get a fullbackup from pageserver + query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + tar_output_file = result_basepath + ".stdout" + + # Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + env.pageserver.stop() + + dir_to_clear = Path(env.repo_dir) / 'tenants' + shutil.rmtree(dir_to_clear) + os.mkdir(dir_to_clear) + + #start the pageserver again + env.pageserver.start() + + # Import using another tenantid, because we use the same pageserver. + # TODO Create another pageserver to maeke test more realistic. + tenant = uuid4() + + # Import to pageserver + node_name = "import_from_pageserver" + client = env.pageserver.http_client() + client.tenant_create(tenant) + env.neon_cli.raw_cli([ + "timeline", + "import", + "--tenant-id", + tenant.hex, + "--timeline-id", + timeline, + "--node-name", + node_name, + "--base-lsn", + lsn, + "--base-tarfile", + os.path.join(tar_output_file), + ]) + + # Wait for data to land in s3 + wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn)) + wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn)) + + # Check it worked + pg = env.postgres.create_start(node_name, tenant_id=tenant) + assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )] + + # Take another fullbackup + query = f"fullbackup { tenant.hex} {timeline} {lsn}" + cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query] + result_basepath = pg_bin.run_capture(cmd, env=psql_env) + new_tar_output_file = result_basepath + ".stdout" + + # Check it's the same as the first fullbackup + # TODO pageserver should be checking checksum + 
assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + + # Check that gc works + psconn = env.pageserver.connect() + pscur = psconn.cursor() + pscur.execute(f"do_gc {tenant.hex} {timeline} 0") diff --git a/test_runner/batch_others/test_lsn_mapping.py b/test_runner/batch_others/test_lsn_mapping.py index 37113b46f2..1eca92ed58 100644 --- a/test_runner/batch_others/test_lsn_mapping.py +++ b/test_runner/batch_others/test_lsn_mapping.py @@ -4,7 +4,7 @@ import math from uuid import UUID import psycopg2.extras import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres from fixtures.log_helper import log import time @@ -12,11 +12,11 @@ import time # # Test pageserver get_lsn_by_timestamp API # -def test_lsn_mapping(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init_start() +def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() - new_timeline_id = env.zenith_cli.create_branch('test_lsn_mapping') + new_timeline_id = env.neon_cli.create_branch('test_lsn_mapping') pgmain = env.postgres.create_start("test_lsn_mapping") log.info("postgres is running on 'test_lsn_mapping' branch") diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 7a508a67fb..b17676658b 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log @@ -8,9 +8,9 @@ from fixtures.log_helper import log # it only checks next_multixact_id field in restored pg_control, # since we don't have functions to check multixact internals. 
# -def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch('test_multixact', 'empty') +def test_multixact(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch('test_multixact', 'empty') pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") @@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) + env.neon_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_neon_cli.py similarity index 60% rename from test_runner/batch_others/test_zenith_cli.py rename to test_runner/batch_others/test_neon_cli.py index 103d51aae5..728bc7b894 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_neon_cli.py @@ -1,12 +1,12 @@ import uuid import requests -from fixtures.zenith_fixtures import DEFAULT_BRANCH_NAME, ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient from typing import cast -def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, +def helper_compare_timeline_list(pageserver_http_client: NeonPageserverHttpClient, + env: NeonEnv, initial_tenant: uuid.UUID): """ Compare timelines list returned by CLI and directly via API. 
@@ -17,65 +17,65 @@ def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpCli map(lambda t: cast(str, t['timeline_id']), pageserver_http_client.timeline_list(initial_tenant))) - timelines_cli = env.zenith_cli.list_timelines() - assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant) + timelines_cli = env.neon_cli.list_timelines() + assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) assert timelines_api == cli_timeline_ids -def test_cli_timeline_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_cli_timeline_list(neon_simple_env: NeonEnv): + env = neon_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main') + main_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested', - 'test_cli_branch_list_main') + nested_timeline_id = env.neon_cli.create_branch('test_cli_branch_list_nested', + 'test_cli_branch_list_main') helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()] + timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] assert main_timeline_id.hex in timelines_cli assert nested_timeline_id.hex in timelines_cli -def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): +def helper_compare_tenant_list(pageserver_http_client: NeonPageserverHttpClient, env: NeonEnv): tenants = 
pageserver_http_client.tenant_list() tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) - res = env.zenith_cli.list_tenants() + res = env.neon_cli.list_tenants() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli -def test_cli_tenant_list(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_cli_tenant_list(neon_simple_env: NeonEnv): + env = neon_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1, _ = env.zenith_cli.create_tenant() + tenant1, _ = env.neon_cli.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2, _ = env.zenith_cli.create_tenant() + tenant2, _ = env.neon_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) - res = env.zenith_cli.list_tenants() + res = env.neon_cli.list_tenants() tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert env.initial_tenant.hex in tenants @@ -83,18 +83,18 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): assert tenant2.hex in tenants -def test_cli_tenant_create(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - tenant_id, _ = env.zenith_cli.create_tenant() - timelines = env.zenith_cli.list_timelines(tenant_id) +def test_cli_tenant_create(neon_simple_env: NeonEnv): + env = neon_simple_env + tenant_id, _ = env.neon_cli.create_tenant() + timelines = env.neon_cli.list_timelines(tenant_id) # an initial timeline should be created upon tenant creation assert len(timelines) == 1 assert timelines[0][0] == DEFAULT_BRANCH_NAME -def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_cli_ipv4_listeners(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Connect to sk port on v4 loopback 
res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') @@ -108,17 +108,17 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # assert res.ok -def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Stop default ps/sk - env.zenith_cli.pageserver_stop() - env.zenith_cli.safekeeper_stop() + env.neon_cli.pageserver_stop() + env.neon_cli.safekeeper_stop() # Default start - res = env.zenith_cli.raw_cli(["start"]) + res = env.neon_cli.raw_cli(["start"]) res.check_returncode() # Default stop - res = env.zenith_cli.raw_cli(["stop"]) + res = env.neon_cli.raw_cli(["stop"]) res.check_returncode() diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 1ab1addad3..f8d11a9381 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -1,12 +1,12 @@ import time -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Test restarting page server, while safekeeper and compute node keep # running. 
-def test_next_xid(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_next_xid(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_normal_work.py b/test_runner/batch_others/test_normal_work.py new file mode 100644 index 0000000000..5b25691517 --- /dev/null +++ b/test_runner/batch_others/test_normal_work.py @@ -0,0 +1,50 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient +import pytest + + +def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient): + tenant_id, timeline_id = env.neon_cli.create_tenant() + pg = env.postgres.create_start('main', tenant_id=tenant_id) + # we rely upon autocommit after each statement + res_1 = pg.safe_psql_many(queries=[ + 'CREATE TABLE t(key int primary key, value text)', + 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', + 'SELECT sum(key) FROM t', + ]) + + assert res_1[-1][0] == (5000050000, ) + # TODO check detach on live instance + log.info("stopping compute") + pg.stop() + log.info("compute stopped") + + pg.start() + res_2 = pg.safe_psql('SELECT sum(key) FROM t') + assert res_2[0] == (5000050000, ) + + pg.stop() + pageserver_http.tenant_detach(tenant_id) + + +@pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)]) +def test_normal_work(neon_env_builder: NeonEnvBuilder, num_timelines: int, num_safekeepers: int): + """ + Basic test: + * create new tenant with a timeline + * write some data + * ensure that it was successfully written + * restart compute + * check that the data is there + * stop compute + * detach timeline + + Repeat check for several tenants/timelines. 
+ """ + + neon_env_builder.num_safekeepers = num_safekeepers + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + for _ in range(num_timelines): + check_tenant(env, pageserver_http) diff --git a/test_runner/batch_others/test_old_request_lsn.py b/test_runner/batch_others/test_old_request_lsn.py index cf7fe09b1e..1e96c0a1fa 100644 --- a/test_runner/batch_others/test_old_request_lsn.py +++ b/test_runner/batch_others/test_old_request_lsn.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log from fixtures.utils import print_gc_result import psycopg2.extras @@ -14,11 +14,11 @@ import psycopg2.extras # just a hint that the page hasn't been modified since that LSN, and the page # server should return the latest page version regardless of the LSN. # -def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): +def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_old_request_lsn", "main") + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_old_request_lsn", "main") pg = env.postgres.create_start('test_old_request_lsn') log.info('postgres is running on test_old_request_lsn branch') @@ -26,7 +26,7 @@ def test_old_request_lsn(zenith_env_builder: ZenithEnvBuilder): cur = pg_conn.cursor() # Get the timeline ID of our branch. 
We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] psconn = env.pageserver.connect() diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index 7fe3b4dff5..7f9cb9493d 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,24 +1,27 @@ +from typing import Optional from uuid import uuid4, UUID import pytest -from fixtures.zenith_fixtures import ( +from fixtures.utils import lsn_from_hex +from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, - ZenithEnv, - ZenithEnvBuilder, - ZenithPageserverHttpClient, - ZenithPageserverApiException, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, + NeonPageserverApiException, + wait_until, ) # test that we cannot override node id -def test_pageserver_init_node_id(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init() +def test_pageserver_init_node_id(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init() with pytest.raises( Exception, match="node id can only be set during pageserver init and cannot be overridden"): env.pageserver.start(overrides=['--pageserver-config-override=id=10']) -def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): +def check_client(client: NeonPageserverHttpClient, initial_tenant: UUID): client.check_status() # check initial tenant is there @@ -54,48 +57,65 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): assert local_timeline_details['timeline_state'] == 'Loaded' -def test_pageserver_http_get_wal_receiver_not_found(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() - tenant_id, timeline_id = env.zenith_cli.create_tenant() + tenant_id, timeline_id = 
env.neon_cli.create_tenant() - # no PG compute node is running, so no WAL receiver is running - with pytest.raises(ZenithPageserverApiException) as e: - _ = client.wal_receiver_get(tenant_id, timeline_id) - assert "Not Found" in str(e.value) + empty_response = client.wal_receiver_get(tenant_id, timeline_id) + + assert empty_response.get('wal_producer_connstr') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_lsn') is None, 'Should not be able to connect to WAL streaming without PG compute node running' + assert empty_response.get('last_received_msg_ts') is None, 'Should not be able to connect to WAL streaming without PG compute node running' -def test_pageserver_http_get_wal_receiver_success(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() - tenant_id, timeline_id = env.zenith_cli.create_tenant() + tenant_id, timeline_id = env.neon_cli.create_tenant() pg = env.postgres.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) - res = client.wal_receiver_get(tenant_id, timeline_id) - assert list(res.keys()) == [ - "thread_id", - "wal_producer_connstr", - "last_received_msg_lsn", - "last_received_msg_ts", - ] + def expect_updated_msg_lsn(prev_msg_lsn: Optional[int]) -> int: + res = client.wal_receiver_get(tenant_id, timeline_id) - # make a DB modification then expect getting a new WAL receiver's data + # a successful `wal_receiver_get` response must contain the below fields + assert list(res.keys()) == [ + "wal_producer_connstr", + "last_received_msg_lsn", + "last_received_msg_ts", + ] + + assert res["last_received_msg_lsn"] is not None, "the last received message's LSN is empty" + + last_msg_lsn = lsn_from_hex(res["last_received_msg_lsn"]) + assert prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn, \ + f"the last received message's LSN 
{last_msg_lsn} hasn't been updated \ + compared to the previous message's LSN {prev_msg_lsn}" + + return last_msg_lsn + + # Wait to make sure that we get a latest WAL receiver data. + # We need to wait here because it's possible that we don't have access to + # the latest WAL during the time the `wal_receiver_get` API is called. + # See: https://github.com/neondatabase/neon/issues/1768. + lsn = wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(None)) + + # Make a DB modification then expect getting a new WAL receiver's data. pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - res2 = client.wal_receiver_get(tenant_id, timeline_id) - assert res2["last_received_msg_lsn"] > res["last_received_msg_lsn"] + wait_until(number_of_iterations=5, interval=1, func=lambda: expect_updated_msg_lsn(lsn)) -def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_pageserver_http_api_client(neon_simple_env: NeonEnv): + env = neon_simple_env client = env.pageserver.http_client() check_client(client, env.initial_tenant) -def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init_start() +def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 758b018046..dd24351e17 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -1,15 +1,15 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Test safekeeper sync and pageserver catch up # while initial compute node is down and 
pageserver is lagging behind safekeepers. # Ensure that basebackup after restart of all components is correct # and new compute node contains all data. -def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') + env.neon_cli.create_branch('test_pageserver_catchup_while_compute_down') # Make shared_buffers large to ensure we won't query pageserver while it is down. pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down', config_lines=['shared_buffers=512MB']) diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 69f5ea85ce..403ff7b305 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -1,13 +1,13 @@ -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log # Test restarting page server, while safekeeper and compute node keep # running. 
-def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_pageserver_restart') + env.neon_cli.create_branch('test_pageserver_restart') pg = env.postgres.create_start('test_pageserver_restart') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index a44acecf21..55947fe427 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,6 +1,6 @@ from io import BytesIO import asyncio -from fixtures.zenith_fixtures import ZenithEnv, Postgres +from fixtures.neon_fixtures import NeonEnv, Postgres from fixtures.log_helper import log @@ -38,9 +38,9 @@ async def parallel_load_same_table(pg: Postgres, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5): - env = zenith_simple_env - env.zenith_cli.create_branch("test_parallel_copy", "empty") +def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): + env = neon_simple_env + env.neon_cli.create_branch("test_parallel_copy", "empty") pg = env.postgres.create_start('test_parallel_copy') log.info("postgres is running on 'test_parallel_copy' branch") diff --git a/test_runner/batch_others/test_pitr_gc.py b/test_runner/batch_others/test_pitr_gc.py index ee19bddfe8..161f628429 100644 --- a/test_runner/batch_others/test_pitr_gc.py +++ b/test_runner/batch_others/test_pitr_gc.py @@ -5,27 +5,27 @@ import psycopg2.extras import pytest from fixtures.log_helper import log from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # # Check pitr_interval GC behavior. # Insert some data, run GC and create a branch in the past. 
# -def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): +def test_pitr_gc(neon_env_builder: NeonEnvBuilder): - zenith_env_builder.num_safekeepers = 1 + neon_env_builder.num_safekeepers = 1 # Set pitr interval such that we need to keep the data - zenith_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() pgmain = env.postgres.create_start('main') log.info("postgres is running on 'main' branch") main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() - main_cur.execute("SHOW zenith.zenith_timeline") + main_cur.execute("SHOW neon.timeline_id") timeline = main_cur.fetchone()[0] # Create table @@ -55,14 +55,14 @@ def test_pitr_gc(zenith_env_builder: ZenithEnvBuilder): with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: pscur.execute(f"compact {env.initial_tenant.hex} {timeline}") - # perform agressive GC. Data still should be kept because of the PITR setting. + # perform aggressive GC. Data still should be kept because of the PITR setting. 
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") row = pscur.fetchone() print_gc_result(row) # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting - env.zenith_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) + env.neon_cli.create_branch('test_pitr_gc_hundred', 'main', ancestor_start_lsn=lsn_a) pg_hundred = env.postgres.create_start('test_pitr_gc_hundred') diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index a6f828f829..ebeede8df7 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -2,7 +2,7 @@ import pytest def test_proxy_select_1(static_proxy): - static_proxy.safe_psql("select 1;") + static_proxy.safe_psql("select 1;", options="project=generic-project-name") # Pass extra options to the server. diff --git a/test_runner/batch_others/test_read_validation.py b/test_runner/batch_others/test_read_validation.py index ee41e6511c..6b8a154865 100644 --- a/test_runner/batch_others/test_read_validation.py +++ b/test_runner/batch_others/test_read_validation.py @@ -1,22 +1,22 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log from psycopg2.errors import UndefinedTable from psycopg2.errors import IoError -pytest_plugins = ("fixtures.zenith_fixtures") +pytest_plugins = ("fixtures.neon_fixtures") -extensions = ["pageinspect", "zenith_test_utils", "pg_buffercache"] +extensions = ["pageinspect", "neon_test_utils", "pg_buffercache"] # # Validation of reading different page versions # -def test_read_validation(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_read_validation", "empty") +def test_read_validation(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation", "empty") pg = 
env.postgres.create_start("test_read_validation") log.info("postgres is running on 'test_read_validation' branch") @@ -125,9 +125,9 @@ def test_read_validation(zenith_simple_env: ZenithEnv): log.info("Caught an expected failure: {}".format(e)) -def test_read_validation_neg(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_read_validation_neg", "empty") +def test_read_validation_neg(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_read_validation_neg", "empty") pg = env.postgres.create_start("test_read_validation_neg") log.info("postgres is running on 'test_read_validation_neg' branch") diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index 808ee62def..286c756a5e 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -1,6 +1,6 @@ import pytest from fixtures.log_helper import log -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv # @@ -9,9 +9,9 @@ from fixtures.zenith_fixtures import ZenithEnv # This is very similar to the 'test_branch_behind' test, but instead of # creating branches, creates read-only nodes. 
# -def test_readonly_node(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch('test_readonly_node', 'empty') +def test_readonly_node(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch('test_readonly_node', 'empty') pgmain = env.postgres.create_start('test_readonly_node') log.info("postgres is running on 'test_readonly_node' branch") diff --git a/test_runner/batch_others/test_recovery.py b/test_runner/batch_others/test_recovery.py index eb1747efa5..5ba783b802 100644 --- a/test_runner/batch_others/test_recovery.py +++ b/test_runner/batch_others/test_recovery.py @@ -4,28 +4,28 @@ import psycopg2.extras import json from ast import Assert from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log # # Test pageserver recovery after crash # -def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 +def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 # Override default checkpointer settings to run it more often - zenith_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" + neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - env = zenith_env_builder.init() + env = neon_env_builder.init() # Check if failpoints enables. 
Otherwise the test doesn't make sense - f = env.zenith_cli.pageserver_enabled_features() + f = env.neon_cli.pageserver_enabled_features() assert "failpoints" in f["features"], "Build pageserver with --features=failpoints option to run this test" - zenith_env_builder.start() + neon_env_builder.start() # Create a branch for us - env.zenith_cli.create_branch("test_pageserver_recovery", "main") + env.neon_cli.create_branch("test_pageserver_recovery", "main") pg = env.postgres.create_start('test_pageserver_recovery') log.info("postgres is running on 'test_pageserver_recovery' branch") @@ -45,7 +45,8 @@ def test_pageserver_recovery(zenith_env_builder: ZenithEnvBuilder): # Configure failpoints pscur.execute( - "failpoints checkpoint-before-sync=sleep(2000);checkpoint-after-sync=exit") + "failpoints flush-frozen-before-sync=sleep(2000);checkpoint-after-sync=exit" + ) # Do some updates until pageserver is crashed try: diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index afbe3c55c7..163912690c 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -1,12 +1,12 @@ # It's possible to run any regular test with the local fs remote storage via -# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ...... +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
import shutil, os from contextlib import closing from pathlib import Path import time from uuid import UUID -from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload from fixtures.log_helper import log from fixtures.utils import lsn_from_hex, lsn_to_hex import pytest @@ -30,12 +30,15 @@ import pytest # # The tests are done for all types of remote storage pageserver supports. @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str): - # zenith_env_builder.rust_log_override = 'debug' +def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, storage_type: str): + # Use this test to check more realistic SK ids: some etcd key parsing bugs were related, + # and this test needs SK to write data to pageserver, so it will be visible + neon_env_builder.safekeepers_id_start = 12 + if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') else: raise RuntimeError(f'Unknown storage type: {storage_type}') @@ -43,13 +46,13 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, data_secret = 'very secret secret' ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() pg = env.postgres.create_start('main') client = env.pageserver.http_client() - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show 
zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] checkpoint_numbers = range(1, 3) @@ -88,14 +91,14 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, # Introduce failpoint in download env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return") - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + client.tenant_attach(UUID(tenant_id)) - # is there a better way to assert that fafilpoint triggered? + # is there a better way to assert that failpoint triggered? time.sleep(10) # assert cannot attach timeline that is scheduled for download - with pytest.raises(Exception, match="Timeline download is already in progress"): - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"): + client.tenant_attach(UUID(tenant_id)) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) log.info("Timeline detail with active failpoint: %s", detail) @@ -106,17 +109,17 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, env.pageserver.stop() env.pageserver.start() - client.timeline_attach(UUID(tenant_id), UUID(timeline_id)) + client.tenant_attach(UUID(tenant_id)) log.info("waiting for timeline redownload") wait_until(number_of_iterations=10, interval=1, - func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id))) + func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id))) detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert detail['local'] is not None log.info("Timeline detail after attach completed: %s", detail) - assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 'current db Lsn should shoud not be less than the one stored on remote storage' + assert lsn_from_hex(detail['local']['last_record_lsn']) >= current_lsn, 
'current db Lsn should should not be less than the one stored on remote storage' assert not detail['remote']['awaits_download'] pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py deleted file mode 100644 index d6e7fd9e0d..0000000000 --- a/test_runner/batch_others/test_restart_compute.py +++ /dev/null @@ -1,74 +0,0 @@ -import pytest - -from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.log_helper import log - - -# -# Test restarting and recreating a postgres instance -# -@pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): - zenith_env_builder.pageserver_auth_enabled = True - if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - - env.zenith_cli.create_branch('test_restart_compute') - pg = env.postgres.create_start('test_restart_compute') - log.info("postgres is running on 'test_restart_compute' branch") - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the row - cur.execute('SELECT sum(key) FROM t') - r = cur.fetchone() - assert r == (5000050000, ) - log.info(f"res = {r}") - - # Insert another row - cur.execute("INSERT INTO t VALUES (100001, 'payload2')") - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # Again remove data directory and restart - 
pg.stop_and_destroy().create_start('test_restart_compute') - - # That select causes lots of FPI's and increases probability of wakeepers - # lagging behind after query completion - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") - - # And again remove data directory and restart - pg.stop_and_destroy().create_start('test_restart_compute') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # We can still see the rows - cur.execute('SELECT count(*) FROM t') - - r = cur.fetchone() - assert r == (100001, ) - log.info(f"res = {r}") diff --git a/test_runner/batch_others/test_subxacts.py b/test_runner/batch_others/test_subxacts.py index bed1c4be63..d06877825e 100644 --- a/test_runner/batch_others/test_subxacts.py +++ b/test_runner/batch_others/test_subxacts.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.log_helper import log @@ -6,11 +6,11 @@ from fixtures.log_helper import log # # The pg_subxact SLRU is not preserved on restarts, and doesn't need to be # maintained in the pageserver, so subtransactions are not very exciting for -# Zenith. They are included in the commit record though and updated in the +# Neon. They are included in the commit record though and updated in the # CLOG. 
-def test_subxacts(zenith_simple_env: ZenithEnv, test_output_dir): - env = zenith_simple_env - env.zenith_cli.create_branch("test_subxacts", "empty") +def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): + env = neon_simple_env + env.neon_cli.create_branch("test_subxacts", "empty") pg = env.postgres.create_start('test_subxacts') log.info("postgres is running on 'test_subxacts' branch") diff --git a/test_runner/batch_others/test_tenant_conf.py b/test_runner/batch_others/test_tenant_conf.py index d627d8a6ee..d25aad742e 100644 --- a/test_runner/batch_others/test_tenant_conf.py +++ b/test_runner/batch_others/test_tenant_conf.py @@ -3,25 +3,25 @@ from contextlib import closing import pytest import psycopg2.extras -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log -def test_tenant_config(zenith_env_builder: ZenithEnvBuilder): +def test_tenant_config(neon_env_builder: NeonEnvBuilder): # set some non-default global config - zenith_env_builder.pageserver_config_override = ''' + neon_env_builder.pageserver_config_override = ''' page_cache_size=444; wait_lsn_timeout='111 s'; tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() """Test per tenant configuration""" - tenant, _ = env.zenith_cli.create_tenant(conf={ + tenant, _ = env.neon_cli.create_tenant(conf={ 'checkpoint_distance': '20000', 'gc_period': '30sec', }) - env.zenith_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) + env.neon_cli.create_timeline(f'test_tenant_conf', tenant_id=tenant) pg = env.postgres.create_start( "test_tenant_conf", "main", @@ -66,11 +66,11 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}''' }.items()) # update the config and ensure that it has changed - env.zenith_cli.config_tenant(tenant_id=tenant, - conf={ - 'checkpoint_distance': '15000', - 
'gc_period': '80sec', - }) + env.neon_cli.config_tenant(tenant_id=tenant, + conf={ + 'checkpoint_distance': '15000', + 'gc_period': '80sec', + }) with closing(env.pageserver.connect()) as psconn: with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: diff --git a/test_runner/batch_others/test_tenant_detach.py b/test_runner/batch_others/test_tenant_detach.py new file mode 100644 index 0000000000..2df5409b4f --- /dev/null +++ b/test_runner/batch_others/test_tenant_detach.py @@ -0,0 +1,64 @@ +from threading import Thread +from uuid import uuid4 +import psycopg2 +import pytest + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException + + +def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # first check for non existing tenant + tenant_id = uuid4() + with pytest.raises(expected_exception=NeonPageserverApiException, + match=f'Tenant not found for id {tenant_id.hex}'): + pageserver_http.tenant_detach(tenant_id) + + # create new nenant + tenant_id, timeline_id = env.neon_cli.create_tenant() + + # assert tenant exists on disk + assert (env.repo_dir / "tenants" / tenant_id.hex).exists() + + pg = env.postgres.create_start('main', tenant_id=tenant_id) + # we rely upon autocommit after each statement + pg.safe_psql_many(queries=[ + 'CREATE TABLE t(key int primary key, value text)', + 'INSERT INTO t SELECT generate_series(1,100000), \'payload\'', + ]) + + # gc should not try to even start + with pytest.raises(expected_exception=psycopg2.DatabaseError, + match='gc target timeline does not exist'): + env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {uuid4().hex} 0') + + # try to concurrently run gc and detach + gc_thread = Thread( + target=lambda: env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0'), ) + gc_thread.start() + + last_error = None + for i in range(3): + try: + 
pageserver_http.tenant_detach(tenant_id) + except Exception as e: + last_error = e + log.error(f"try {i} error detaching tenant: {e}") + continue + else: + break + # else is called if the loop finished without reaching "break" + else: + pytest.fail(f"could not detach timeline: {last_error}") + + gc_thread.join(timeout=10) + + # check that nothing is left on disk for deleted tenant + assert not (env.repo_dir / "tenants" / tenant_id.hex).exists() + + with pytest.raises(expected_exception=psycopg2.DatabaseError, + match=f'Tenant {tenant_id.hex} not found'): + env.pageserver.safe_psql(f'do_gc {tenant_id.hex} {timeline_id.hex} 0') diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 91506e120d..73f6f52e72 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -1,17 +1,32 @@ -from contextlib import closing, contextmanager import os import pathlib +import signal import subprocess import threading -import typing +from contextlib import closing, contextmanager +from typing import Any, Dict, Optional, Tuple from uuid import UUID -from fixtures.log_helper import log -from typing import Optional -import signal -import pytest -from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, Etcd, ZenithPageserverHttpClient, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir -from fixtures.utils import lsn_from_hex +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Etcd, + NeonEnv, + NeonEnvBuilder, + NeonPageserverHttpClient, + PageserverPort, + PortDistributor, + Postgres, + assert_no_in_progress_downloads_for_tenant, + assert_timeline_local, + base_dir, + neon_binpath, + pg_distrib_dir, + wait_for_last_record_lsn, + wait_for_upload, + wait_until, +) +from fixtures.utils import lsn_from_hex, subprocess_capture def 
assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -26,7 +41,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path, http_port: int, broker: Optional[Etcd]): """ - cannot use ZenithPageserver yet because it depends on zenith cli + cannot use NeonPageserver yet because it depends on neon cli which currently lacks support for multiple pageservers """ cmd = [ @@ -92,7 +107,7 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve # if we recovered after failure verify that we have correct number of rows log.info("recovering at %s", inserted_ctr) cur.execute("SELECT count(*) FROM load") - # it seems that sometimes transaction gets commited before we can acknowledge + # it seems that sometimes transaction gets committed before we can acknowledge # the result, so sometimes selected value is larger by one than we expect assert cur.fetchone()[0] - inserted_ctr <= 1 log.info("successfully recovered %s", inserted_ctr) @@ -101,73 +116,211 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve log.info('load thread stopped') -@pytest.mark.skip( - reason= - "needs to replace callmemaybe call with better idea how to migrate timelines between pageservers" -) -@pytest.mark.parametrize('with_load', ['with_load', 'without_load']) -def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, - port_distributor: PortDistributor, - with_load: str): - zenith_env_builder.enable_local_fs_remote_storage() +def populate_branch( + pg: Postgres, + tenant_id: UUID, + ps_http: NeonPageserverHttpClient, + create_table: bool, + expected_sum: Optional[int], +) -> Tuple[UUID, int]: + # insert some data + with pg_cur(pg) as cur: + cur.execute("SHOW neon.timeline_id") + timeline_id = UUID(cur.fetchone()[0]) + log.info("timeline to relocate %s", timeline_id.hex) - env = zenith_env_builder.init_start() + cur.execute("SELECT pg_current_wal_flush_lsn()") + log.info("pg_current_wal_flush_lsn() %s", 
lsn_from_hex(cur.fetchone()[0])) + log.info("timeline detail %s", + ps_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)) + + # we rely upon autocommit after each statement + # as waiting for acceptors happens there + if create_table: + cur.execute("CREATE TABLE t(key int, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") + if expected_sum is not None: + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (expected_sum, ) + cur.execute("SELECT pg_current_wal_flush_lsn()") + + current_lsn = lsn_from_hex(cur.fetchone()[0]) + return timeline_id, current_lsn + + +def ensure_checkpoint( + pageserver_cur, + pageserver_http: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + current_lsn: int, +): + # run checkpoint manually to be sure that data landed in remote storage + pageserver_cur.execute(f"checkpoint {tenant_id.hex} {timeline_id.hex}") + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +def check_timeline_attached( + new_pageserver_http_client: NeonPageserverHttpClient, + tenant_id: UUID, + timeline_id: UUID, + old_timeline_detail: Dict[str, Any], + old_current_lsn: int, +): + # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint + new_timeline_detail = assert_timeline_local(new_pageserver_http_client, tenant_id, timeline_id) + + # when load is active these checks can break because lsns are not static + # so lets check with some margin + assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + lsn_from_hex(old_timeline_detail['local']['disk_consistent_lsn']), + 0.03) + + assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), + old_current_lsn, + 0.03) + + +def switch_pg_to_new_pageserver(env: NeonEnv, + pg: Postgres, + 
new_pageserver_port: int, + tenant_id: UUID, + timeline_id: UUID) -> pathlib.Path: + pg.stop() + + pg_config_file_path = pathlib.Path(pg.config_file_path()) + pg_config_file_path.open('a').write( + f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'") + + pg.start() + + timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant_id.hex / 'timelines' / timeline_id.hex + files_before_detach = os.listdir(timeline_to_detach_local_path) + assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ + but got: {files_before_detach}' + assert len(files_before_detach) >= 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ + but got {files_before_detach}' + + return timeline_to_detach_local_path + + +def post_migration_check(pg: Postgres, sum_before_migration: int, old_local_path: pathlib.Path): + with pg_cur(pg) as cur: + # check that data is still there + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (sum_before_migration, ) + # check that we can write new data + cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") + cur.execute("SELECT sum(key) FROM t") + assert cur.fetchone() == (sum_before_migration + 1500500, ) + + assert not os.path.exists(old_local_path), f'After detach, local timeline dir {old_local_path} should be removed' + + +@pytest.mark.parametrize( + 'method', + [ + # A minor migration involves no storage breaking changes. + # It is done by attaching the tenant to a new pageserver. + 'minor', + # A major migration involves exporting a postgres datadir + # basebackup and importing it into the new pageserver. 
+ # This kind of migration can tolerate breaking changes + # to storage format + pytest.param('major', marks=pytest.mark.xfail(reason="Not implemented")), + ]) +@pytest.mark.parametrize('with_load', ['with_load', 'without_load']) +def test_tenant_relocation(neon_env_builder: NeonEnvBuilder, + port_distributor: PortDistributor, + test_output_dir, + method: str, + with_load: str): + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant, _ = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) - log.info("tenant to relocate %s", tenant) - - # attach does not download ancestor branches (should it?), just use root branch for now - env.zenith_cli.create_root_branch('test_tenant_relocation', tenant_id=tenant) - - tenant_pg = env.postgres.create_start(branch_name='test_tenant_relocation', - node_name='test_tenant_relocation', - tenant_id=tenant) - - # insert some data - with closing(tenant_pg.connect()) as conn: - with conn.cursor() as cur: - # save timeline for later gc call - cur.execute("SHOW zenith.zenith_timeline") - timeline = UUID(cur.fetchone()[0]) - log.info("timeline to relocate %s", timeline.hex) - - # we rely upon autocommit after each statement - # as waiting for acceptors happens there - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - cur.execute("SELECT pg_current_wal_flush_lsn()") - - current_lsn = lsn_from_hex(cur.fetchone()[0]) + # we use two branches to check that they are both relocated + # first branch is used for load, compute for second one is used to + # check that data is not lost pageserver_http = env.pageserver.http_client() + tenant_id, initial_timeline_id = 
env.neon_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) + + env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) + pg_main = env.postgres.create_start(branch_name='test_tenant_relocation_main', + tenant_id=tenant_id) + + timeline_id_main, current_lsn_main = populate_branch( + pg_main, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=True, + expected_sum=500500, + ) + + env.neon_cli.create_branch( + new_branch_name="test_tenant_relocation_second", + ancestor_branch_name="test_tenant_relocation_main", + tenant_id=tenant_id, + ) + pg_second = env.postgres.create_start(branch_name='test_tenant_relocation_second', + tenant_id=tenant_id) + + timeline_id_second, current_lsn_second = populate_branch( + pg_second, + tenant_id=tenant_id, + ps_http=pageserver_http, + create_table=False, + expected_sum=1001000, + ) + # wait until pageserver receives that data - wait_for_last_record_lsn(pageserver_http, tenant, timeline, current_lsn) - timeline_detail = assert_local(pageserver_http, tenant, timeline) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_main, current_lsn_main) + timeline_detail_main = assert_timeline_local(pageserver_http, tenant_id, timeline_id_main) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id_second, current_lsn_second) + timeline_detail_second = assert_timeline_local(pageserver_http, tenant_id, timeline_id_second) if with_load == 'with_load': # create load table - with pg_cur(tenant_pg) as cur: + with pg_cur(pg_main) as cur: cur.execute("CREATE TABLE load(value text)") load_stop_event = threading.Event() load_ok_event = threading.Event() - load_thread = threading.Thread(target=load, - args=(tenant_pg, load_stop_event, load_ok_event)) + load_thread = threading.Thread( + target=load, + args=(pg_main, load_stop_event, load_ok_event), + daemon=True, # To make sure the child dies 
when the parent errors + ) load_thread.start() - # run checkpoint manually to be sure that data landed in remote storage - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor() as pscur: - pscur.execute(f"checkpoint {tenant.hex} {timeline.hex}") + # this requirement introduces a problem + # if user creates a branch during migration + # it wont appear on the new pageserver + with pg_cur(env.pageserver) as cur: + ensure_checkpoint( + cur, + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_main, + current_lsn=current_lsn_main, + ) - # wait until pageserver successfully uploaded a checkpoint to remote storage - wait_for_upload(pageserver_http, tenant, timeline, current_lsn) + ensure_checkpoint( + cur, + pageserver_http=pageserver_http, + tenant_id=tenant_id, + timeline_id=timeline_id_second, + current_lsn=current_lsn_second, + ) log.info("inititalizing new pageserver") # bootstrap second pageserver @@ -177,77 +330,101 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, new_pageserver_pg_port = port_distributor.get_port() new_pageserver_http_port = port_distributor.get_port() log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port) - pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver' + pageserver_bin = pathlib.Path(neon_binpath) / 'pageserver' - new_pageserver_http = ZenithPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) + new_pageserver_http = NeonPageserverHttpClient(port=new_pageserver_http_port, auth_token=None) with new_pageserver_helper(new_pageserver_dir, pageserver_bin, remote_storage_mock_path, new_pageserver_pg_port, new_pageserver_http_port, - zenith_env_builder.broker): + neon_env_builder.broker): - # call to attach timeline to new pageserver - new_pageserver_http.timeline_attach(tenant, timeline) - # new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes 
since checkpoint - new_timeline_detail = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_local(new_pageserver_http, tenant, timeline)) + # Migrate either by attaching from s3 or import/export basebackup + if method == "major": + cmd = [ + "python", + os.path.join(base_dir, "scripts/export_import_between_pageservers.py"), + "--tenant-id", + tenant_id.hex, + "--from-host", + "localhost", + "--from-http-port", + str(pageserver_http.port), + "--from-pg-port", + str(env.pageserver.service_port.pg), + "--to-host", + "localhost", + "--to-http-port", + str(new_pageserver_http_port), + "--to-pg-port", + str(new_pageserver_pg_port), + "--psql-path", + os.path.join(pg_distrib_dir, "bin", "psql"), + "--work-dir", + os.path.join(test_output_dir), + ] + subprocess_capture(str(env.repo_dir), cmd, check=True) + elif method == "minor": + # call to attach timeline to new pageserver + new_pageserver_http.tenant_attach(tenant_id) - # when load is active these checks can break because lsns are not static - # so lets check with some margin - assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']), - lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']), - 0.03) + # check that it shows that download is in progress + tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) + assert tenant_status.get('has_in_progress_downloads'), tenant_status - # callmemaybe to start replication from safekeeper to the new pageserver - # when there is no load there is a clean checkpoint and no wal delta - # needs to be streamed to the new pageserver - # TODO (rodionov) use attach to start replication - with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur: - # "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'" - safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} 
pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'" - cur.execute("callmemaybe {} {} {}".format(tenant.hex, - timeline.hex, - safekeeper_connstring)) + # wait until tenant is downloaded + wait_until(number_of_iterations=10, + interval=1, + func=lambda: assert_no_in_progress_downloads_for_tenant( + new_pageserver_http, tenant_id)) - tenant_pg.stop() + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_main, + timeline_detail_main, + current_lsn_main, + ) - # rewrite zenith cli config to use new pageserver for basebackup to start new compute + check_timeline_attached( + new_pageserver_http, + tenant_id, + timeline_id_second, + timeline_detail_second, + current_lsn_second, + ) + + # rewrite neon cli config to use new pageserver for basebackup to start new compute cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{new_pageserver_http_port}'" cli_config_lines[-1] = f"listen_pg_addr = 'localhost:{new_pageserver_pg_port}'" (env.repo_dir / 'config').write_text('\n'.join(cli_config_lines)) - tenant_pg_config_file_path = pathlib.Path(tenant_pg.config_file_path()) - tenant_pg_config_file_path.open('a').write( - f"\nzenith.page_server_connstring = 'postgresql://no_user:@localhost:{new_pageserver_pg_port}'" + old_local_path_main = switch_pg_to_new_pageserver( + env, + pg_main, + new_pageserver_pg_port, + tenant_id, + timeline_id_main, ) - tenant_pg.start() - - timeline_to_detach_local_path = env.repo_dir / 'tenants' / tenant.hex / 'timelines' / timeline.hex - files_before_detach = os.listdir(timeline_to_detach_local_path) - assert 'metadata' in files_before_detach, f'Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}' - assert len(files_before_detach) > 2, f'Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}' + 
old_local_path_second = switch_pg_to_new_pageserver( + env, + pg_second, + new_pageserver_pg_port, + tenant_id, + timeline_id_second, + ) # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver # is no longer involved, and if it is, we will see the errors - pageserver_http.timeline_detach(tenant, timeline) + pageserver_http.tenant_detach(tenant_id) - with pg_cur(tenant_pg) as cur: - # check that data is still there - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (500500, ) - # check that we can write new data - cur.execute("INSERT INTO t SELECT generate_series(1001,2000), 'some payload'") - cur.execute("SELECT sum(key) FROM t") - assert cur.fetchone() == (2001000, ) + post_migration_check(pg_main, 500500, old_local_path_main) + post_migration_check(pg_second, 1001000, old_local_path_second) if with_load == 'with_load': assert load_ok_event.wait(3) @@ -256,9 +433,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, load_thread.join(timeout=10) log.info('load thread stopped') - assert not os.path.exists(timeline_to_detach_local_path), f'After detach, local timeline dir {timeline_to_detach_local_path} should be removed' - - # bring old pageserver back for clean shutdown via zenith cli + # bring old pageserver back for clean shutdown via neon cli # new pageserver will be shut down by the context manager cli_config_lines = (env.repo_dir / 'config').read_text().splitlines() cli_config_lines[-2] = f"listen_http_addr = 'localhost:{env.pageserver.service_port.http}'" diff --git a/test_runner/batch_others/test_tenant_tasks.py b/test_runner/batch_others/test_tenant_tasks.py new file mode 100644 index 0000000000..fae2a2199d --- /dev/null +++ b/test_runner/batch_others/test_tenant_tasks.py @@ -0,0 +1,70 @@ +from fixtures.neon_fixtures import NeonEnvBuilder, wait_until +from uuid import UUID +import time + + +def get_only_element(l): + assert len(l) == 1 + return l[0] + + +# Test that gc 
and compaction tenant tasks start and stop correctly +def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): + # The gc and compaction loops don't bother to watch for tenant state + # changes while sleeping, so we use small periods to make this test + # run faster. With default settings we'd have to wait longer for tasks + # to notice state changes and shut down. + # TODO fix this behavior in the pageserver + tenant_config = "{gc_period = '1 s', compaction_period = '1 s'}" + neon_env_builder.pageserver_config_override = f"tenant_config={tenant_config}" + name = "test_tenant_tasks" + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + def get_state(tenant): + all_states = client.tenant_list() + matching = [t for t in all_states if t["id"] == tenant.hex] + return get_only_element(matching)["state"] + + def get_metric_value(name): + metrics = client.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + return 0 + line = get_only_element(relevant) + value = line.lstrip(name).strip() + return int(value) + + def delete_all_timelines(tenant): + timelines = [UUID(t["timeline_id"]) for t in client.timeline_list(tenant)] + for t in timelines: + client.timeline_delete(tenant, t) + + def assert_idle(tenant): + assert get_state(tenant) == "Idle" + + # Create tenant, start compute + tenant, _ = env.neon_cli.create_tenant() + timeline = env.neon_cli.create_timeline(name, tenant_id=tenant) + pg = env.postgres.create_start(name, tenant_id=tenant) + assert (get_state(tenant) == "Active") + + # Stop compute + pg.stop() + + # Detach all tenants and wait for them to go idle + # TODO they should be already idle since there are no active computes + for tenant_info in client.tenant_list(): + tenant_id = UUID(tenant_info["id"]) + delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_idle(tenant_id)) + + # Assert that all tasks finish quickly after tenants go idle + def 
assert_tasks_finish(): + tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + assert tasks_started == tasks_ended + assert tasks_panicked == 0 + + wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 9ccb8cf196..8d73d8185c 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -3,26 +3,26 @@ from datetime import datetime import os import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.utils import lsn_to_hex @pytest.mark.parametrize('with_safekeepers', [False, True]) -def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeepers: bool): +def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder, with_safekeepers: bool): if with_safekeepers: - zenith_env_builder.num_safekeepers = 3 + neon_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1, _ = env.zenith_cli.create_tenant() - tenant_2, _ = env.zenith_cli.create_tenant() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() - env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_1) - env.zenith_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', - tenant_id=tenant_2) + env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_1) + 
env.neon_cli.create_timeline(f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', + tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( f'test_tenants_normal_work_with_safekeepers{with_safekeepers}', @@ -44,15 +44,15 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_safekeep assert cur.fetchone() == (5000050000, ) -def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 +def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - tenant_1, _ = env.zenith_cli.create_tenant() - tenant_2, _ = env.zenith_cli.create_tenant() + env = neon_env_builder.init_start() + tenant_1, _ = env.neon_cli.create_tenant() + tenant_2, _ = env.neon_cli.create_tenant() - timeline_1 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) - timeline_2 = env.zenith_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) + timeline_1 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_1) + timeline_2 = env.neon_cli.create_timeline('test_metrics_normal_work', tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_1) pg_tenant2 = env.postgres.create_start('test_metrics_normal_work', tenant_id=tenant_2) @@ -72,7 +72,7 @@ def test_metrics_normal_work(zenith_env_builder: ZenithEnvBuilder): collected_metrics[f'safekeeper{sk.id}'] = sk.http_client().get_metrics_str() for name in collected_metrics: - basepath = os.path.join(zenith_env_builder.repo_dir, f'{name}.metrics') + basepath = os.path.join(neon_env_builder.repo_dir, f'{name}.metrics') with open(basepath, 'w') as stdout_f: print(collected_metrics[name], file=stdout_f, flush=True) diff --git a/test_runner/batch_others/test_tenants_with_remote_storage.py b/test_runner/batch_others/test_tenants_with_remote_storage.py index c00f077fcd..41506ad920 100644 --- 
a/test_runner/batch_others/test_tenants_with_remote_storage.py +++ b/test_runner/batch_others/test_tenants_with_remote_storage.py @@ -12,17 +12,17 @@ from uuid import UUID import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithEnv, Postgres, wait_for_last_record_lsn, wait_for_upload +from fixtures.neon_fixtures import NeonEnvBuilder, NeonEnv, Postgres, wait_for_last_record_lsn, wait_for_upload from fixtures.utils import lsn_from_hex -async def tenant_workload(env: ZenithEnv, pg: Postgres): +async def tenant_workload(env: NeonEnv, pg: Postgres): pageserver_conn = await env.pageserver.connect_async() pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + timeline_id = await pg_conn.fetchval("show neon.timeline_id") await pg_conn.execute("CREATE TABLE t(key int primary key, value text)") for i in range(1, 100): @@ -35,7 +35,7 @@ async def tenant_workload(env: ZenithEnv, pg: Postgres): assert res == i * 1000 -async def all_tenants_workload(env: ZenithEnv, tenants_pgs): +async def all_tenants_workload(env: NeonEnv, tenants_pgs): workers = [] for tenant, pg in tenants_pgs: worker = tenant_workload(env, pg) @@ -46,28 +46,28 @@ async def all_tenants_workload(env: ZenithEnv, tenants_pgs): @pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3']) -def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): +def test_tenants_many(neon_env_builder: NeonEnvBuilder, storage_type: str): if storage_type == 'local_fs': - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() elif storage_type == 'mock_s3': - zenith_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') + neon_env_builder.enable_s3_mock_remote_storage('test_remote_storage_backup_and_restore') else: raise 
RuntimeError(f'Unknown storage type: {storage_type}') - zenith_env_builder.enable_local_fs_remote_storage() + neon_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() + env = neon_env_builder.init_start() tenants_pgs = [] for i in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly - tenant, _ = env.zenith_cli.create_tenant( + tenant, _ = env.neon_cli.create_tenant( conf={ 'checkpoint_distance': '5000000', }) - env.zenith_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) + env.neon_cli.create_timeline(f'test_tenants_many', tenant_id=tenant) pg = env.postgres.create_start( f'test_tenants_many', @@ -82,9 +82,9 @@ def test_tenants_many(zenith_env_builder: ZenithEnvBuilder, storage_type: str): for tenant, pg in tenants_pgs: with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("show zenith.zenith_tenant") + cur.execute("show neon.tenant_id") tenant_id = cur.fetchone()[0] - cur.execute("show zenith.zenith_timeline") + cur.execute("show neon.timeline_id") timeline_id = cur.fetchone()[0] cur.execute("SELECT pg_current_wal_flush_lsn()") current_lsn = lsn_from_hex(cur.fetchone()[0]) diff --git a/test_runner/batch_others/test_timeline_delete.py b/test_runner/batch_others/test_timeline_delete.py new file mode 100644 index 0000000000..594475faf4 --- /dev/null +++ b/test_runner/batch_others/test_timeline_delete.py @@ -0,0 +1,60 @@ +from uuid import uuid4 +import pytest + +from fixtures.neon_fixtures import NeonEnv, NeonPageserverApiException, wait_until + + +def test_timeline_delete(neon_simple_env: NeonEnv): + env = neon_simple_env + + ps_http = env.pageserver.http_client() + + # first try to delete non existing timeline + # for existing tenant: + invalid_timeline_id = uuid4() + with pytest.raises(NeonPageserverApiException, match="timeline not found"): + ps_http.timeline_delete(tenant_id=env.initial_tenant, timeline_id=invalid_timeline_id) + + # for non existing tenant: + 
invalid_tenant_id = uuid4() + with pytest.raises(NeonPageserverApiException, + match=f"Tenant {invalid_tenant_id.hex} not found in local tenant state"): + ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id) + + # construct pair of branches to validate that pageserver prohibits + # deletion of ancestor timelines when they have child branches + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_parent", "empty") + + leaf_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_delete_branch1", + "test_ancestor_branch_delete_parent") + + ps_http = env.pageserver.http_client() + with pytest.raises(NeonPageserverApiException, + match="Cannot detach timeline which has child timelines"): + + timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / parent_timeline_id.hex + assert timeline_path.exists() + + ps_http.timeline_delete(env.initial_tenant, parent_timeline_id) + + assert not timeline_path.exists() + + timeline_path = env.repo_dir / "tenants" / env.initial_tenant.hex / "timelines" / leaf_timeline_id.hex + assert timeline_path.exists() + + # retry deletes when compaction or gc is running in pageserver + wait_until(number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)) + + assert not timeline_path.exists() + + # check 404 + with pytest.raises(NeonPageserverApiException, + match="is not found neither locally nor remotely"): + ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) + + # FIXME leaves tenant without timelines, should we prevent deletion of root timeline? 
+ wait_until(number_of_iterations=3, + interval=0.2, + func=lambda: ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)) diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index 0b33b56df3..7b7b16bcbf 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -1,18 +1,17 @@ from contextlib import closing import psycopg2.extras import psycopg2.errors -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, assert_local +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local from fixtures.log_helper import log import time -def test_timeline_size(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - # Branch at the point where only 100 rows were inserted - new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') +def test_timeline_size(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - timeline_details = assert_local(client, env.initial_tenant, new_timeline_id) + timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ 'current_logical_size_non_incremental'] @@ -21,9 +20,8 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timeline_id") - # Create table, and insert the first 100 rows cur.execute("CREATE TABLE foo (t text)") cur.execute(""" INSERT INTO foo @@ -31,13 +29,58 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): FROM generate_series(1, 10) g """) - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, 
new_timeline_id) local_details = res['local'] assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + +def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): + env = neon_simple_env + new_timeline_id = env.neon_cli.create_branch('test_timeline_size', 'empty') + + client = env.pageserver.http_client() + timeline_details = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + assert timeline_details['local']['current_logical_size'] == timeline_details['local'][ + 'current_logical_size_non_incremental'] + + pgmain = env.postgres.create_start("test_timeline_size") + log.info("postgres is running on 'test_timeline_size' branch") + + with closing(pgmain.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SHOW neon.timeline_id") + + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + cur.execute('CREATE DATABASE foodb') + with closing(pgmain.connect(dbname='foodb')) as conn: + with conn.cursor() as cur2: + + cur2.execute("CREATE TABLE foo (t text)") + cur2.execute(""" + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 10) g + """) + + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) + local_details = res['local'] + assert local_details["current_logical_size"] == local_details[ + "current_logical_size_non_incremental"] + + cur.execute('DROP DATABASE foodb') + + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) local_details = res['local'] 
assert local_details["current_logical_size"] == local_details[ "current_logical_size_non_incremental"] @@ -69,24 +112,24 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 time.sleep(polling_interval) -def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() - new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') +def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + new_timeline_id = env.neon_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() - res = assert_local(client, env.initial_tenant, new_timeline_id) + res = assert_timeline_local(client, env.initial_tenant, new_timeline_id) assert res['local']["current_logical_size"] == res['local'][ "current_logical_size_non_incremental"] pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB']) + config_lines=['neon.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: with conn.cursor() as cur: - cur.execute("CREATE EXTENSION zenith") # TODO move it to zenith_fixtures? + cur.execute("CREATE EXTENSION neon") # TODO move it to neon_fixtures? 
cur.execute("CREATE TABLE foo (t text)") diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index 4afdc7e0be..04e3d0b7bc 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -1,15 +1,15 @@ import os -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log # # Test branching, when a transaction is in prepared state # -def test_twophase(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_twophase", "empty") +def test_twophase(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_twophase", "empty") pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) log.info("postgres is running on 'test_twophase' branch") @@ -55,7 +55,7 @@ def test_twophase(zenith_simple_env: ZenithEnv): assert len(twophase_files) == 2 # Create a branch with the transaction in prepared state - env.zenith_cli.create_branch("test_twophase_prepared", "test_twophase") + env.neon_cli.create_branch("test_twophase_prepared", "test_twophase") # Start compute on the new branch pg2 = env.postgres.create_start( diff --git a/test_runner/batch_others/test_vm_bits.py b/test_runner/batch_others/test_vm_bits.py index 49e48dd450..29b55f5b8c 100644 --- a/test_runner/batch_others/test_vm_bits.py +++ b/test_runner/batch_others/test_vm_bits.py @@ -1,4 +1,4 @@ -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log @@ -6,10 +6,10 @@ from fixtures.log_helper import log # Test that the VM bit is cleared correctly at a HEAP_DELETE and # HEAP_UPDATE record. 
# -def test_vm_bit_clear(zenith_simple_env: ZenithEnv): - env = zenith_simple_env +def test_vm_bit_clear(neon_simple_env: NeonEnv): + env = neon_simple_env - env.zenith_cli.create_branch("test_vm_bit_clear", "empty") + env.neon_cli.create_branch("test_vm_bit_clear", "empty") pg = env.postgres.create_start('test_vm_bit_clear') log.info("postgres is running on 'test_vm_bit_clear' branch") @@ -17,7 +17,7 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur = pg_conn.cursor() # Install extension containing function needed for test - cur.execute('CREATE EXTENSION zenith_test_utils') + cur.execute('CREATE EXTENSION neon_test_utils') # Create a test table and freeze it to set the VM bit. cur.execute('CREATE TABLE vmtest_delete (id integer PRIMARY KEY)') @@ -28,12 +28,12 @@ def test_vm_bit_clear(zenith_simple_env: ZenithEnv): cur.execute('INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g') cur.execute('VACUUM FREEZE vmtest_update') - # DELETE and UDPATE the rows. + # DELETE and UPDATE the rows. 
cur.execute('DELETE FROM vmtest_delete WHERE id = 1') cur.execute('UPDATE vmtest_update SET id = 5000 WHERE id = 1') # Branch at this point, to test that later - env.zenith_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") + env.neon_cli.create_branch("test_vm_bit_clear_new", "test_vm_bit_clear") # Clear the buffer cache, to force the VM page to be re-fetched from # the page server diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index e1b7bd91ee..9b876f780d 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -1,7 +1,9 @@ +import pathlib import pytest import random import time import os +import shutil import signal import subprocess import sys @@ -12,29 +14,11 @@ from contextlib import closing from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path -from fixtures.zenith_fixtures import PgBin, Etcd, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import get_dir_size, lsn_to_hex, mkdir_if_needed, lsn_from_hex +from fixtures.neon_fixtures import PgBin, Etcd, Postgres, RemoteStorageUsers, Safekeeper, NeonEnv, NeonEnvBuilder, PortDistributor, SafekeeperPort, neon_binpath, PgProtocol +from fixtures.utils import get_dir_size, lsn_to_hex, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any - - -# basic test, write something in setup with wal acceptors, ensure that commits -# succeed and data is written -def test_normal_work(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() - - env.zenith_cli.create_branch('test_safekeepers_normal_work') - pg = env.postgres.create_start('test_safekeepers_normal_work') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - # we rely upon autocommit after 
each statement - # as waiting for acceptors happens there - cur.execute('CREATE TABLE t(key int primary key, value text)') - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - cur.execute('SELECT sum(key) FROM t') - assert cur.fetchone() == (5000050000, ) +from uuid import uuid4 @dataclass @@ -48,9 +32,9 @@ class TimelineMetrics: # Run page server and multiple acceptors, and multiple compute nodes running # against different timelines. -def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_many_timelines(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() n_timelines = 3 @@ -58,15 +42,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) ] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') - # that's not really human readable, so the branch names are introduced in Zenith CLI. - # Zenith CLI stores its branch <-> timeline mapping in its internals, + # that's not really human readable, so the branch names are introduced in Neon CLI. + # Neon CLI stores its branch <-> timeline mapping in its internals, # but we need this to collect metrics from other servers, related to the timeline. branch_names_to_timeline_ids = {} # start postgres on each timeline pgs = [] for branch_name in branch_names: - new_timeline_id = env.zenith_cli.create_branch(branch_name) + new_timeline_id = env.neon_cli.create_branch(branch_name) pgs.append(env.postgres.create_start(branch_name)) branch_names_to_timeline_ids[branch_name] = new_timeline_id @@ -112,14 +96,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. 
We assume majority consensus here. assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + for lsn in m.flush_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" assert (2 * sum(m.last_record_lsn <= lsn - for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" + for lsn in m.commit_lsns) > neon_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}" timeline_metrics.append(m) log.info(f"{message}: {timeline_metrics}") return timeline_metrics - # TODO: https://github.com/zenithdb/zenith/issues/809 + # TODO: https://github.com/neondatabase/neon/issues/809 # collect_metrics("before CREATE TABLE") # Do everything in different loops to have actions on different timelines @@ -187,15 +171,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # Check that dead minority doesn't prevent the commits: execute insert n_inserts # times, with fault_probability chance of getting a wal acceptor down or up # along the way. 2 of 3 are always alive, so the work keeps going. 
-def test_restarts(zenith_env_builder: ZenithEnvBuilder): +def test_restarts(neon_env_builder: NeonEnvBuilder): fault_probability = 0.01 n_inserts = 1000 n_acceptors = 3 - zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = n_acceptors + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_restarts') + env.neon_cli.create_branch('test_safekeepers_restarts') pg = env.postgres.create_start('test_safekeepers_restarts') # we rely upon autocommit after each statement @@ -228,11 +212,11 @@ def delayed_safekeeper_start(wa): # When majority of acceptors is offline, commits are expected to be frozen -def test_unavailability(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init_start() +def test_unavailability(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 2 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_unavailability') + env.neon_cli.create_branch('test_safekeepers_unavailability') pg = env.postgres.create_start('test_safekeepers_unavailability') # we rely upon autocommit after each statement @@ -298,12 +282,12 @@ def stop_value(): # do inserts while concurrently getting up/down subsets of acceptors -def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): +def test_race_conditions(neon_env_builder: NeonEnvBuilder, stop_value): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_race_conditions') + env.neon_cli.create_branch('test_safekeepers_race_conditions') pg = env.postgres.create_start('test_safekeepers_race_conditions') # we rely upon autocommit after each statement @@ -327,18 +311,18 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): # Test 
that safekeepers push their info to the broker and learn peer status from it -def test_broker(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() +def test_broker(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + env = neon_env_builder.init_start() - env.zenith_cli.create_branch("test_broker", "main") + env.neon_cli.create_branch("test_broker", "main") pg = env.postgres.create_start('test_broker') pg.safe_psql("CREATE TABLE t(key int primary key, value text)") - # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # wait until remote_consistent_lsn gets advanced on all safekeepers clients = [sk.http_client() for sk in env.safekeepers] @@ -368,13 +352,15 @@ def test_broker(zenith_env_builder: ZenithEnvBuilder): # Test that old WAL consumed by peers and pageserver is removed from safekeepers. 
-def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 2 - # to advance remote_consistent_llsn - zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init_start() +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.num_safekeepers = 2 + # to advance remote_consistent_lsn + neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_wal_removal') + env.neon_cli.create_branch('test_safekeepers_wal_removal') pg = env.postgres.create_start('test_safekeepers_wal_removal') with closing(pg.connect()) as conn: @@ -384,11 +370,14 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key, value text)') cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] # force checkpoint to advance remote_consistent_lsn - with closing(env.pageserver.connect()) as psconn: + pageserver_conn_options = {} + if auth_enabled: + pageserver_conn_options['password'] = env.auth_keys.generate_tenant_token(tenant_id) + with closing(env.pageserver.connect(**pageserver_conn_options)) as psconn: with psconn.cursor() as pscur: pscur.execute(f"checkpoint {tenant_id} {timeline_id}") @@ -399,9 +388,29 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): ] assert all(os.path.exists(p) for p in first_segments) - http_cli = env.safekeepers[0].http_client() + if not auth_enabled: + http_cli = env.safekeepers[0].http_client() + else: + http_cli = env.safekeepers[0].http_client( + 
auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + http_cli_other = env.safekeepers[0].http_client( + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + http_cli_noauth = env.safekeepers[0].http_client() + # Pretend WAL is offloaded to s3. - http_cli.record_safekeeper_info(tenant_id, timeline_id, {'s3_wal_lsn': 'FFFFFFFF/FEFFFFFF'}) + if auth_enabled: + old_backup_lsn = http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn + assert 'FFFFFFFF/FEFFFFFF' != old_backup_lsn + for cli in [http_cli_other, http_cli_noauth]: + with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + cli.record_safekeeper_info(tenant_id, + timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) + assert old_backup_lsn == http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn + http_cli.record_safekeeper_info(tenant_id, timeline_id, {'backup_lsn': 'FFFFFFFF/FEFFFFFF'}) + assert 'FFFFFFFF/FEFFFFFF' == http_cli.timeline_status(tenant_id=tenant_id, + timeline_id=timeline_id).backup_lsn # wait till first segment is removed on all safekeepers started_at = time.time() @@ -414,8 +423,202 @@ def test_wal_removal(zenith_env_builder: ZenithEnvBuilder): time.sleep(0.5) +def wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end): + started_at = time.time() + http_cli = live_sk.http_client() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"live sk status is {tli_status}") + + if lsn_from_hex(tli_status.backup_lsn) >= lsn_from_hex(seg_end): + break + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for segment ending at {seg_end} get offloaded") + time.sleep(0.5) + + +def wait_wal_trim(tenant_id, timeline_id, sk, target_size): + started_at = time.time() + http_cli = sk.http_client() + while True: + tli_status = http_cli.timeline_status(tenant_id, timeline_id) + sk_wal_size = 
get_dir_size(os.path.join(sk.data_dir(), tenant_id, + timeline_id)) / 1024 / 1024 + log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size:.2f}MB status={tli_status}") + + if sk_wal_size <= target_size: + break + + elapsed = time.time() - started_at + if elapsed > 20: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for sk_id={sk.id} to trim WAL to {target_size:.2f}MB, current size is {sk_wal_size:.2f}MB" + ) + time.sleep(0.5) + + +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_wal_backup(neon_env_builder: NeonEnvBuilder, storage_type: str): + neon_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + neon_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + neon_env_builder.enable_s3_mock_remote_storage('test_safekeepers_wal_backup') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_safekeepers_wal_backup') + pg = env.postgres.create_start('test_safekeepers_wal_backup') + + # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + pg_conn = pg.connect() + cur = pg_conn.cursor() + cur.execute('create table t(key int, value text)') + + # Shut down subsequently each of safekeepers and fill a segment while sk is + # down; ensure segment gets offloaded by others. 
+ offloaded_seg_end = ['0/2000000', '0/3000000', '0/4000000'] + for victim, seg_end in zip(env.safekeepers, offloaded_seg_end): + victim.stop() + # roughly fills one segment + cur.execute("insert into t select generate_series(1,250000), 'payload'") + live_sk = [sk for sk in env.safekeepers if sk != victim][0] + + wait_segment_offload(tenant_id, timeline_id, live_sk, seg_end) + + victim.start() + + # put one of safekeepers down again + env.safekeepers[0].stop() + # restart postgres + pg.stop_and_destroy().create_start('test_safekeepers_wal_backup') + # and ensure offloading still works + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("insert into t select generate_series(1,250000), 'payload'") + wait_segment_offload(tenant_id, timeline_id, env.safekeepers[1], '0/5000000') + + +@pytest.mark.parametrize('storage_type', ['mock_s3', 'local_fs']) +def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, storage_type: str): + neon_env_builder.num_safekeepers = 3 + if storage_type == 'local_fs': + neon_env_builder.enable_local_fs_remote_storage() + elif storage_type == 'mock_s3': + neon_env_builder.enable_s3_mock_remote_storage('test_s3_wal_replay') + else: + raise RuntimeError(f'Unknown storage type: {storage_type}') + neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER + + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_s3_wal_replay') + + env.pageserver.stop() + pageserver_tenants_dir = os.path.join(env.repo_dir, 'tenants') + pageserver_fresh_copy = os.path.join(env.repo_dir, 'tenants_fresh') + log.info(f"Creating a copy of pageserver in a fresh state at {pageserver_fresh_copy}") + shutil.copytree(pageserver_tenants_dir, pageserver_fresh_copy) + env.pageserver.start() + + pg = env.postgres.create_start('test_s3_wal_replay') + + # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + expected_sum = 0 + 
+ with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("create table t(key int, value text)") + cur.execute("insert into t values (1, 'payload')") + expected_sum += 1 + + offloaded_seg_end = ['0/3000000'] + for seg_end in offloaded_seg_end: + # roughly fills two segments + cur.execute("insert into t select generate_series(1,500000), 'payload'") + expected_sum += 500000 * 500001 // 2 + + cur.execute("select sum(key) from t") + assert cur.fetchone()[0] == expected_sum + + for sk in env.safekeepers: + wait_segment_offload(tenant_id, timeline_id, sk, seg_end) + + # advance remote_consistent_lsn to trigger WAL trimming + # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates + env.safekeepers[0].http_client().record_safekeeper_info( + tenant_id, timeline_id, {'remote_consistent_lsn': offloaded_seg_end[-1]}) + + for sk in env.safekeepers: + # require WAL to be trimmed, so no more than one segment is left on disk + wait_wal_trim(tenant_id, timeline_id, sk, 16 * 1.5) + + cur.execute('SELECT pg_current_wal_flush_lsn()') + last_lsn = cur.fetchone()[0] + + pageserver_lsn = env.pageserver.http_client().timeline_detail( + uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + log.info( + f'Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb' + ) + + # replace pageserver with a fresh copy + pg.stop_and_destroy() + env.pageserver.stop() + + log.info(f'Removing current pageserver state at {pageserver_tenants_dir}') + shutil.rmtree(pageserver_tenants_dir) + log.info(f'Copying fresh pageserver state from {pageserver_fresh_copy}') + shutil.move(pageserver_fresh_copy, pageserver_tenants_dir) + + # start pageserver and wait for replay + env.pageserver.start() + wait_lsn_timeout = 60 * 3 + started_at = time.time() + last_debug_print = 0.0 + + while True: + elapsed = 
time.time() - started_at + if elapsed > wait_lsn_timeout: + raise RuntimeError(f'Timed out waiting for WAL redo') + + pageserver_lsn = env.pageserver.http_client().timeline_detail( + uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"] + lag = lsn_from_hex(last_lsn) - lsn_from_hex(pageserver_lsn) + + if time.time() > last_debug_print + 10 or lag <= 0: + last_debug_print = time.time() + log.info(f'Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb') + + if lag <= 0: + break + + time.sleep(1) + + log.info(f'WAL redo took {elapsed} s') + + # verify data + pg.create_start('test_s3_wal_replay') + + with closing(pg.connect()) as conn: + with conn.cursor() as cur: + cur.execute("select sum(key) from t") + assert cur.fetchone()[0] == expected_sum + + class ProposerPostgres(PgProtocol): - """Object for running postgres without ZenithEnv""" + """Object for running postgres without NeonEnv""" def __init__(self, pgdata_dir: str, pg_bin, @@ -423,7 +626,7 @@ class ProposerPostgres(PgProtocol): tenant_id: uuid.UUID, listen_addr: str, port: int): - super().__init__(host=listen_addr, port=port, user='zenith_admin', dbname='postgres') + super().__init__(host=listen_addr, port=port, user='cloud_admin', dbname='postgres') self.pgdata_dir: str = pgdata_dir self.pg_bin: PgBin = pg_bin @@ -443,15 +646,15 @@ class ProposerPostgres(PgProtocol): def create_dir_config(self, safekeepers: str): """ Create dir and config for running --sync-safekeepers """ - mkdir_if_needed(self.pg_data_dir_path()) + pathlib.Path(self.pg_data_dir_path()).mkdir(exist_ok=True) with open(self.config_file_path(), "w") as f: cfg = [ "synchronous_standby_names = 'walproposer'\n", - "shared_preload_libraries = 'zenith'\n", - f"zenith.zenith_timeline = '{self.timeline_id.hex}'\n", - f"zenith.zenith_tenant = '{self.tenant_id.hex}'\n", - f"zenith.page_server_connstring = ''\n", - f"wal_acceptors = '{safekeepers}'\n", + "shared_preload_libraries = 'neon'\n", + f"neon.timeline_id = 
'{self.timeline_id.hex}'\n", + f"neon.tenant_id = '{self.tenant_id.hex}'\n", + f"neon.pageserver_connstring = ''\n", + f"safekeepers = '{safekeepers}'\n", f"listen_addresses = '{self.listen_addr}'\n", f"port = '{self.port}'\n", ] @@ -479,7 +682,7 @@ class ProposerPostgres(PgProtocol): def initdb(self): """ Run initdb """ - args = ["initdb", "-U", "zenith_admin", "-D", self.pg_data_dir_path()] + args = ["initdb", "-U", "cloud_admin", "-D", self.pg_data_dir_path()] self.pg_bin.run(args) def start(self): @@ -497,14 +700,14 @@ class ProposerPostgres(PgProtocol): # insert wal in all safekeepers and run sync on proposer -def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, +def test_sync_safekeepers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor): # We don't really need the full environment for this test, just the # safekeepers would be enough. - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() timeline_id = uuid.uuid4() tenant_id = uuid.uuid4() @@ -551,25 +754,42 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, assert all(lsn_after_sync == lsn for lsn in lsn_after_append) -def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): - env = zenith_env_builder.init_start() +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_timeline_status') + env.neon_cli.create_branch('test_timeline_status') pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] - wa_http_cli = wa.http_client() - wa_http_cli.check_status() - # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] 
+ # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + if not auth_enabled: + wa_http_cli = wa.http_client() + wa_http_cli.check_status() + else: + wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + wa_http_cli.check_status() + wa_http_cli_bad = wa.http_client( + auth_token=env.auth_keys.generate_tenant_token(uuid4().hex)) + wa_http_cli_bad.check_status() + wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth.check_status() # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) epoch = tli_status.acceptor_epoch timeline_start_lsn = tli_status.timeline_start_lsn + if auth_enabled: + for cli in [wa_http_cli_bad, wa_http_cli_noauth]: + with pytest.raises(cli.HTTPError, match='Forbidden|Unauthorized'): + cli.timeline_status(tenant_id, timeline_id) + pg.safe_psql("create table t(i int)") # ensure epoch goes up after reboot @@ -597,7 +817,7 @@ class SafekeeperEnv: peer_port=self.port_distributor.get_port()) self.pg_bin = pg_bin self.num_safekeepers = num_safekeepers - self.bin_safekeeper = os.path.join(str(zenith_binpath), 'safekeeper') + self.bin_safekeeper = os.path.join(str(neon_binpath), 'safekeeper') self.safekeepers: Optional[List[subprocess.CompletedProcess[Any]]] = None self.postgres: Optional[ProposerPostgres] = None self.tenant_id: Optional[uuid.UUID] = None @@ -609,7 +829,7 @@ class SafekeeperEnv: self.timeline_id = uuid.uuid4() self.tenant_id = uuid.uuid4() - mkdir_if_needed(str(self.repo_dir)) + self.repo_dir.mkdir(exist_ok=True) # Create config and a Safekeeper object for each safekeeper self.safekeepers = [] @@ -628,8 +848,8 @@ class SafekeeperEnv: http=self.port_distributor.get_port(), ) - safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}") - mkdir_if_needed(safekeeper_dir) + safekeeper_dir = self.repo_dir / f"sk{i}" + safekeeper_dir.mkdir(exist_ok=True) args 
= [ self.bin_safekeeper, @@ -638,7 +858,7 @@ class SafekeeperEnv: "--listen-http", f"127.0.0.1:{port.http}", "-D", - safekeeper_dir, + str(safekeeper_dir), "--id", str(i), "--broker-endpoints", @@ -708,8 +928,8 @@ def test_safekeeper_without_pageserver(test_output_dir: str, assert res == 5050 -def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[int]) -> str: +def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): + def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): @@ -736,9 +956,9 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): except Exception as e: log.info(f"Safekeeper {sk.id} status error: {e}") - zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch('test_replace_safekeeper') + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + env.neon_cli.create_branch('test_replace_safekeeper') log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() @@ -747,9 +967,9 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): pg.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) pg.start() - # learn zenith timeline from compute - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] + # learn neon timeline from compute + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] execute_payload(pg) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -799,7 +1019,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): # We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted # to all safekeepers. 
This test checks that compute WAL can fit into small number # of WAL segments. -def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): +def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): # used to calculate delta in collect_stats last_lsn = .0 @@ -821,10 +1041,10 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): def generate_wal(cur): cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_wal_deleted_after_broadcast') + env.neon_cli.create_branch('test_wal_deleted_after_broadcast') # Adjust checkpoint config to prevent keeping old WAL segments pg = env.postgres.create_start( 'test_wal_deleted_after_broadcast', @@ -849,18 +1069,20 @@ def test_wal_deleted_after_broadcast(zenith_env_builder: ZenithEnvBuilder): assert wal_size_after_checkpoint < 16 * 2.5 -def test_delete_force(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init_start() +@pytest.mark.parametrize('auth_enabled', [False, True]) +def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.auth_enabled = auth_enabled + env = neon_env_builder.init_start() # Create two tenants: one will be deleted, other should be preserved. 
tenant_id = env.initial_tenant.hex - timeline_id_1 = env.zenith_cli.create_branch('br1').hex # Acive, delete explicitly - timeline_id_2 = env.zenith_cli.create_branch('br2').hex # Inactive, delete explictly - timeline_id_3 = env.zenith_cli.create_branch('br3').hex # Active, delete with the tenant - timeline_id_4 = env.zenith_cli.create_branch('br4').hex # Inactive, delete with the tenant + timeline_id_1 = env.neon_cli.create_branch('br1').hex # Active, delete explicitly + timeline_id_2 = env.neon_cli.create_branch('br2').hex # Inactive, delete explicitly + timeline_id_3 = env.neon_cli.create_branch('br3').hex # Active, delete with the tenant + timeline_id_4 = env.neon_cli.create_branch('br4').hex # Inactive, delete with the tenant - tenant_id_other_uuid, timeline_id_other_uuid = env.zenith_cli.create_tenant() + tenant_id_other_uuid, timeline_id_other_uuid = env.neon_cli.create_tenant() tenant_id_other = tenant_id_other_uuid.hex timeline_id_other = timeline_id_other_uuid.hex @@ -876,7 +1098,14 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): cur.execute('CREATE TABLE t(key int primary key)') sk = env.safekeepers[0] sk_data_dir = Path(sk.data_dir()) - sk_http = sk.http_client() + if not auth_enabled: + sk_http = sk.http_client() + sk_http_other = sk_http + else: + sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) + sk_http_other = sk.http_client( + auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)) + sk_http_noauth = sk.http_client() assert (sk_data_dir / tenant_id / timeline_id_1).is_dir() assert (sk_data_dir / tenant_id / timeline_id_2).is_dir() assert (sk_data_dir / tenant_id / timeline_id_3).is_dir() @@ -916,6 +1145,15 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): assert (sk_data_dir / tenant_id / timeline_id_4).is_dir() assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + if auth_enabled: + # Ensure we cannot delete the other tenant + for sk_h in [sk_http, 
sk_http_noauth]: + with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) + with pytest.raises(sk_h.HTTPError, match='Forbidden|Unauthorized'): + assert sk_h.tenant_delete_force(tenant_id_other) + assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() + # Remove initial tenant's br2 (inactive) assert sk_http.timeline_delete_force(tenant_id, timeline_id_2) == { "dir_existed": True, @@ -956,7 +1194,7 @@ def test_delete_force(zenith_env_builder: ZenithEnvBuilder): assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir() # Ensure the other tenant still works - sk_http.timeline_status(tenant_id_other, timeline_id_other) + sk_http_other.timeline_status(tenant_id_other, timeline_id_other) with closing(pg_other.connect()) as conn: with conn.cursor() as cur: cur.execute('INSERT INTO t (key) VALUES (123)') diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index c484b6401c..d74ef8840a 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -1,13 +1,14 @@ import asyncio import uuid + import asyncpg import random import time -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, Safekeeper +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, Safekeeper from fixtures.log_helper import getLogger from fixtures.utils import lsn_from_hex, lsn_to_hex -from typing import List +from typing import List, Optional log = getLogger('root.safekeeper_async') @@ -136,7 +137,7 @@ async def wait_for_lsn(safekeeper: Safekeeper, # On each iteration 1 acceptor is stopped, and 2 others should allow # background workers execute transactions. In the end, state should remain # consistent. 
-async def run_restarts_under_load(env: ZenithEnv, +async def run_restarts_under_load(env: NeonEnv, pg: Postgres, acceptors: List[Safekeeper], n_workers=10, @@ -151,8 +152,8 @@ async def run_restarts_under_load(env: ZenithEnv, test_timeout_at = time.monotonic() + 5 * 60 pg_conn = await pg.connect_async() - tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant") - timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline") + tenant_id = await pg_conn.fetchval("show neon.tenant_id") + timeline_id = await pg_conn.fetchval("show neon.timeline_id") bank = BankClient(pg_conn, n_accounts=n_accounts, init_amount=init_amount) # create tables and initial balances @@ -202,11 +203,11 @@ async def run_restarts_under_load(env: ZenithEnv, # Restart acceptors one by one, while executing and validating bank transactions -def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_safekeepers_restarts_under_load') + env.neon_cli.create_branch('test_safekeepers_restarts_under_load') # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long pg = env.postgres.create_start('test_safekeepers_restarts_under_load', config_lines=['max_replication_write_lag=1MB']) @@ -217,11 +218,11 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): # Restart acceptors one by one and test that everything is working as expected # when checkpoins are triggered frequently by max_wal_size=32MB. Because we have # wal_keep_size=0, there will be aggressive WAL segments recycling. 
-def test_restarts_frequent_checkpoints(zenith_env_builder: ZenithEnvBuilder): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() - env.zenith_cli.create_branch('test_restarts_frequent_checkpoints') + env.neon_cli.create_branch('test_restarts_frequent_checkpoints') # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long pg = env.postgres.create_start('test_restarts_frequent_checkpoints', config_lines=[ @@ -234,3 +235,172 @@ def test_restarts_frequent_checkpoints(zenith_env_builder: ZenithEnvBuilder): # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot asyncio.run(run_restarts_under_load(env, pg, env.safekeepers, period_time=15, iterations=5)) + + +def postgres_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): + pg = Postgres( + env, + tenant_id=env.initial_tenant, + port=env.port_distributor.get_port(), + # In these tests compute has high probability of terminating on its own + # before our stop() due to lost consensus leadership. 
+ check_stop_result=False) + + # embed current time in node name + node_name = pgdir_name or f'pg_node_{time.time()}' + return pg.create_start(branch_name=branch, + node_name=node_name, + config_lines=['log_statement=all']) + + +async def exec_compute_query(env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None): + with postgres_create_start(env, branch=branch, pgdir_name=pgdir_name) as pg: + before_conn = time.time() + conn = await pg.connect_async() + res = await conn.fetch(query) + await conn.close() + after_conn = time.time() + log.info(f'{query} took {after_conn - before_conn}s') + return res + + +async def run_compute_restarts(env: NeonEnv, + queries=16, + batch_insert=10000, + branch='test_compute_restarts'): + cnt = 0 + sum = 0 + + await exec_compute_query(env, branch, 'CREATE TABLE t (i int)') + + for i in range(queries): + if i % 4 == 0: + await exec_compute_query( + env, branch, f'INSERT INTO t SELECT 1 FROM generate_series(1, {batch_insert})') + sum += batch_insert + cnt += batch_insert + elif (i % 4 == 1) or (i % 4 == 3): + # Note that select causes lots of FPI's and increases probability of safekeepers + # standing at different LSNs after compute termination. + actual_sum = (await exec_compute_query(env, branch, 'SELECT SUM(i) FROM t'))[0][0] + assert actual_sum == sum, f'Expected sum={sum}, actual={actual_sum}' + elif i % 4 == 2: + await exec_compute_query(env, branch, 'UPDATE t SET i = i + 1') + sum += cnt + + +# Add a test which creates compute for every query, and then destroys it right after. 
+def test_compute_restarts(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_compute_restarts') + asyncio.run(run_compute_restarts(env)) + + +class BackgroundCompute(object): + MAX_QUERY_GAP_SECONDS = 2 + + def __init__(self, index: int, env: NeonEnv, branch: str): + self.index = index + self.env = env + self.branch = branch + self.running = False + self.stopped = False + self.total_tries = 0 + self.successful_queries: List[int] = [] + + async def run(self): + if self.running: + raise Exception('BackgroundCompute is already running') + + self.running = True + i = 0 + while not self.stopped: + try: + verify_key = (self.index << 16) + i + i += 1 + self.total_tries += 1 + res = await exec_compute_query( + self.env, + self.branch, + f'INSERT INTO query_log(index, verify_key) VALUES ({self.index}, {verify_key}) RETURNING verify_key', + pgdir_name=f'bgcompute{self.index}_key{verify_key}', + ) + log.info(f'result: {res}') + if len(res) != 1: + raise Exception('No result returned') + if res[0][0] != verify_key: + raise Exception('Wrong result returned') + self.successful_queries.append(verify_key) + except Exception as e: + log.info(f'BackgroundCompute {self.index} query failed: {e}') + + # With less sleep, there is a very big chance of not committing + # anything or only 1 xact during test run. 
+ await asyncio.sleep(random.uniform(0, self.MAX_QUERY_GAP_SECONDS)) + self.running = False + + +async def run_concurrent_computes(env: NeonEnv, + num_computes=10, + run_seconds=20, + branch='test_concurrent_computes'): + await exec_compute_query( + env, + branch, + 'CREATE TABLE query_log (t timestamp default now(), index int, verify_key int)') + + computes = [BackgroundCompute(i, env, branch) for i in range(num_computes)] + background_tasks = [asyncio.create_task(compute.run()) for compute in computes] + + await asyncio.sleep(run_seconds) + log.info("stopping all tasks but one") + for compute in computes[1:]: + compute.stopped = True + await asyncio.gather(*background_tasks[1:]) + log.info("stopped all tasks but one") + + # work for some time with only one compute -- it should be able to make some xacts + TIMEOUT_SECONDS = computes[0].MAX_QUERY_GAP_SECONDS + 3 + initial_queries_by_0 = len(computes[0].successful_queries) + log.info(f'Waiting for another query by computes[0], ' + f'it already had {initial_queries_by_0}, timeout is {TIMEOUT_SECONDS}s') + for _ in range(10 * TIMEOUT_SECONDS): + current_queries_by_0 = len(computes[0].successful_queries) - initial_queries_by_0 + if current_queries_by_0 >= 1: + log.info(f'Found {current_queries_by_0} successful queries ' + f'by computes[0], completing the test') + break + await asyncio.sleep(0.1) + else: + assert False, "Timed out while waiting for another query by computes[0]" + computes[0].stopped = True + + await asyncio.gather(background_tasks[0]) + + result = await exec_compute_query(env, branch, 'SELECT * FROM query_log') + # we should have inserted something while single compute was running + log.info(f'Executed {len(result)} queries, {current_queries_by_0} of them ' + f'by computes[0] after we started stopping the others') + for row in result: + log.info(f'{row[0]} {row[1]} {row[2]}') + + # ensure everything reported as committed wasn't lost + for compute in computes: + for verify_key in 
compute.successful_queries: + assert verify_key in [row[2] for row in result] + + +# Run multiple computes concurrently, creating-destroying them after single +# query. Ensure we don't lose any xacts reported as committed and be able to +# progress once only one compute remains. +def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch('test_concurrent_computes') + asyncio.run(run_concurrent_computes(env)) diff --git a/test_runner/batch_others/test_wal_restore.py b/test_runner/batch_others/test_wal_restore.py index f4aceac5e8..809e942415 100644 --- a/test_runner/batch_others/test_wal_restore.py +++ b/test_runner/batch_others/test_wal_restore.py @@ -1,35 +1,33 @@ import os -import subprocess +from pathlib import Path -from fixtures.zenith_fixtures import (ZenithEnvBuilder, - VanillaPostgres, - PortDistributor, - PgBin, - base_dir, - vanilla_pg, - pg_distrib_dir) -from fixtures.log_helper import log +from fixtures.neon_fixtures import (NeonEnvBuilder, + VanillaPostgres, + PortDistributor, + PgBin, + base_dir, + pg_distrib_dir) -def test_wal_restore(zenith_env_builder: ZenithEnvBuilder, +def test_wal_restore(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, - test_output_dir, + test_output_dir: Path, port_distributor: PortDistributor): - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_restore") + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_wal_restore") pg = env.postgres.create_start('test_wal_restore') pg.safe_psql("create table t as select generate_series(1,300000)") - tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] - env.zenith_cli.pageserver_stop() + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + env.neon_cli.pageserver_stop() port = port_distributor.get_port() - data_dir = os.path.join(test_output_dir, 'pgsql.restored') + data_dir = test_output_dir / 'pgsql.restored' with 
VanillaPostgres(data_dir, PgBin(test_output_dir), port) as restored: pg_bin.run_capture([ os.path.join(base_dir, 'libs/utils/scripts/restore_from_wal.sh'), os.path.join(pg_distrib_dir, 'bin'), - os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)), - data_dir, + str(test_output_dir / 'repo' / 'safekeepers' / 'sk1' / str(tenant_id) / '*'), + str(data_dir), str(port) ]) restored.start() - assert restored.safe_psql('select count(*) from t', user='zenith_admin') == [(300000, )] + assert restored.safe_psql('select count(*) from t', user='cloud_admin') == [(300000, )] diff --git a/test_runner/batch_pg_regress/test_isolation.py b/test_runner/batch_pg_regress/test_isolation.py index 7c99c04fe3..0124459440 100644 --- a/test_runner/batch_pg_regress/test_isolation.py +++ b/test_runner/batch_pg_regress/test_isolation.py @@ -1,25 +1,24 @@ import os +from pathlib import Path import pytest -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, base_dir, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, base_dir, pg_distrib_dir # The isolation tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. @pytest.mark.timeout(1800) -def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env +def test_isolation(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_isolation", "empty") + env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them pg = env.postgres.create_start('test_isolation', config_lines=['max_prepared_transactions=100']) pg.safe_psql('CREATE DATABASE isolation_regression') # Create some local directories for pg_isolation_regress to run in. 
- runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_isolation_regress will need. build_path = os.path.join(pg_distrib_dir, 'build/src/test/isolation') diff --git a/test_runner/batch_pg_regress/test_zenith_regress.py b/test_runner/batch_pg_regress/test_neon_regress.py similarity index 67% rename from test_runner/batch_pg_regress/test_zenith_regress.py rename to test_runner/batch_pg_regress/test_neon_regress.py index 2b57137d16..66ea67d9f1 100644 --- a/test_runner/batch_pg_regress/test_zenith_regress.py +++ b/test_runner/batch_pg_regress/test_neon_regress.py @@ -1,30 +1,29 @@ import os +from pathlib import Path -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import (ZenithEnv, - check_restored_datadir_content, - base_dir, - pg_distrib_dir) +from fixtures.neon_fixtures import (NeonEnv, + check_restored_datadir_content, + base_dir, + pg_distrib_dir) from fixtures.log_helper import log -def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys): - env = zenith_simple_env +def test_neon_regress(neon_simple_env: NeonEnv, test_output_dir: Path, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_zenith_regress", "empty") + env.neon_cli.create_branch("test_neon_regress", "empty") # Connect to postgres and create a database called "regression". - pg = env.postgres.create_start('test_zenith_regress') + pg = env.postgres.create_start('test_neon_regress') pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. 
- runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_regress will need. - # This test runs zenith specific tests + # This test runs neon specific tests build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') - src_path = os.path.join(base_dir, 'test_runner/zenith_regress') + src_path = os.path.join(base_dir, 'test_runner/neon_regress') bindir = os.path.join(pg_distrib_dir, 'bin') schedule = os.path.join(src_path, 'parallel_schedule') pg_regress = os.path.join(build_path, 'pg_regress') diff --git a/test_runner/batch_pg_regress/test_pg_regress.py b/test_runner/batch_pg_regress/test_pg_regress.py index be7776113a..b53bc21ca2 100644 --- a/test_runner/batch_pg_regress/test_pg_regress.py +++ b/test_runner/batch_pg_regress/test_pg_regress.py @@ -1,24 +1,23 @@ import os +import pathlib import pytest -from fixtures.utils import mkdir_if_needed -from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content, base_dir, pg_distrib_dir +from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content, base_dir, pg_distrib_dir # The pg_regress tests run for a long time, especially in debug mode, # so use a larger-than-default timeout. @pytest.mark.timeout(1800) -def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, capsys): - env = zenith_simple_env +def test_pg_regress(neon_simple_env: NeonEnv, test_output_dir: pathlib.Path, pg_bin, capsys): + env = neon_simple_env - env.zenith_cli.create_branch("test_pg_regress", "empty") + env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". pg = env.postgres.create_start('test_pg_regress') pg.safe_psql('CREATE DATABASE regression') # Create some local directories for pg_regress to run in. 
- runpath = os.path.join(test_output_dir, 'regress') - mkdir_if_needed(runpath) - mkdir_if_needed(os.path.join(runpath, 'testtablespace')) + runpath = test_output_dir / 'regress' + (runpath / 'testtablespace').mkdir(parents=True) # Compute all the file locations that pg_regress will need. build_path = os.path.join(pg_distrib_dir, 'build/src/test/regress') @@ -51,7 +50,7 @@ def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin, # checkpoint one more time to ensure that the lsn we get is the latest one pg.safe_psql('CHECKPOINT') - lsn = pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] + pg.safe_psql('select pg_current_wal_insert_lsn()')[0][0] # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, pg) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 59e415e3a8..c6e6289a5c 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -1,6 +1,5 @@ -pytest_plugins = ( - "fixtures.zenith_fixtures", - "fixtures.benchmark_fixture", - "fixtures.compare_fixtures", - "fixtures.slow", -) +pytest_plugins = ("fixtures.neon_fixtures", + "fixtures.benchmark_fixture", + "fixtures.compare_fixtures", + "fixtures.slow", + "fixtures.pg_stats") diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 5fc6076f51..3a679cc705 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -25,9 +25,9 @@ To use, declare the 'zenbenchmark' fixture in the test function. Run the bencmark, and then record the result by calling zenbenchmark.record. For example: import timeit -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv -def test_mybench(zenith_simple_env: env, zenbenchmark): +def test_mybench(neon_simple_env: env, zenbenchmark): # Initialize the test ... 
@@ -142,7 +142,7 @@ class MetricReport(str, enum.Enum): # str is a hack to make it json serializabl LOWER_IS_BETTER = 'lower_is_better' -class ZenithBenchmarker: +class NeonBenchmarker: """ An object for recording benchmark results. This is created for each test function by the zenbenchmark fixture @@ -163,7 +163,7 @@ class ZenithBenchmarker: Record a benchmark result. """ # just to namespace the value - name = f"zenith_benchmarker_{metric_name}" + name = f"neon_benchmarker_{metric_name}" self.property_recorder( name, { @@ -206,7 +206,7 @@ class ZenithBenchmarker: f"{prefix}.number_of_transactions_actually_processed", pg_bench_result.number_of_transactions_actually_processed, '', - # thats because this is predefined by test matrix and doesnt change across runs + # that's because this is predefined by test matrix and doesn't change across runs report=MetricReport.TEST_PARAM, ) self.record(f"{prefix}.latency_average", @@ -289,12 +289,12 @@ class ZenithBenchmarker: @pytest.fixture(scope="function") -def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]: +def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. 
""" - benchmarker = ZenithBenchmarker(record_property) + benchmarker = NeonBenchmarker(record_property) yield benchmarker @@ -302,7 +302,7 @@ def pytest_addoption(parser): parser.addoption( "--out-dir", dest="out_dir", - help="Directory to ouput performance tests results to.", + help="Directory to output performance tests results to.", ) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index d572901ed1..9808d83492 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,18 +1,19 @@ import pytest from contextlib import contextmanager from abc import ABC, abstractmethod +from fixtures.pg_stats import PgStatTable -from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, ZenithEnv -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from fixtures.neon_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, NeonEnv +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker # Type-related stuff -from typing import Iterator +from typing import Dict, List class PgCompare(ABC): """Common interface of all postgres implementations, useful for benchmarks. - This class is a helper class for the zenith_with_baseline fixture. See its documentation + This class is a helper class for the neon_with_baseline fixture. See its documentation for more details. 
""" @property @@ -26,7 +27,7 @@ class PgCompare(ABC): pass @property - def zenbenchmark(self) -> ZenithBenchmarker: + def zenbenchmark(self) -> NeonBenchmarker: pass @abstractmethod @@ -51,22 +52,47 @@ class PgCompare(ABC): def record_duration(self, out_name): pass + @contextmanager + def record_pg_stats(self, pg_stats: List[PgStatTable]): + init_data = self._retrieve_pg_stats(pg_stats) -class ZenithCompare(PgCompare): - """PgCompare interface for the zenith stack.""" + yield + + data = self._retrieve_pg_stats(pg_stats) + + for k in set(init_data) & set(data): + self.zenbenchmark.record(k, data[k] - init_data[k], '', MetricReport.HIGHER_IS_BETTER) + + def _retrieve_pg_stats(self, pg_stats: List[PgStatTable]) -> Dict[str, int]: + results: Dict[str, int] = {} + + with self.pg.connect().cursor() as cur: + for pg_stat in pg_stats: + cur.execute(pg_stat.query) + row = cur.fetchone() + assert len(row) == len(pg_stat.columns) + + for col, val in zip(pg_stat.columns, row): + results[f"{pg_stat.table}.{col}"] = int(val) + + return results + + +class NeonCompare(PgCompare): + """PgCompare interface for the neon stack.""" def __init__(self, - zenbenchmark: ZenithBenchmarker, - zenith_simple_env: ZenithEnv, + zenbenchmark: NeonBenchmarker, + neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name): - self.env = zenith_simple_env + self.env = neon_simple_env self._zenbenchmark = zenbenchmark self._pg_bin = pg_bin # We only use one branch and one timeline - self.env.zenith_cli.create_branch(branch_name, 'empty') + self.env.neon_cli.create_branch(branch_name, 'empty') self._pg = self.env.postgres.create_start(branch_name) - self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] + self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0] # Long-lived cursor, useful for flushing self.psconn = self.env.pageserver.connect() @@ -221,9 +247,9 @@ class RemoteCompare(PgCompare): @pytest.fixture(scope='function') -def zenith_compare(request, zenbenchmark, pg_bin, 
zenith_simple_env) -> ZenithCompare: +def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare: branch_name = request.node.name - return ZenithCompare(zenbenchmark, zenith_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) @pytest.fixture(scope='function') @@ -236,13 +262,13 @@ def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare: return RemoteCompare(zenbenchmark, remote_pg) -@pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"]) -def zenith_with_baseline(request) -> PgCompare: - """Parameterized fixture that helps compare zenith against vanilla postgres. +@pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"]) +def neon_with_baseline(request) -> PgCompare: + """Parameterized fixture that helps compare neon against vanilla postgres. A test that uses this fixture turns into a parameterized test that runs against: 1. A vanilla postgres instance - 2. A simple zenith env (see zenith_simple_env) + 2. A simple neon env (see neon_simple_env) 3. Possibly other postgres protocol implementations. The main goal of this fixture is to make it easier for people to read and write @@ -254,7 +280,7 @@ def zenith_with_baseline(request) -> PgCompare: of that. If a test requires some one-off special implementation-specific logic, use of - isinstance(zenith_with_baseline, ZenithCompare) is encouraged. Though if that + isinstance(neon_with_baseline, NeonCompare) is encouraged. Though if that implementation-specific logic is widely useful across multiple tests, it might make sense to add methods to the PgCompare class. 
""" diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/neon_fixtures.py similarity index 78% rename from test_runner/fixtures/zenith_fixtures.py rename to test_runner/fixtures/neon_fixtures.py index 8f9bf1c11b..3a6a233208 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1,8 +1,10 @@ from __future__ import annotations from dataclasses import field +from enum import Flag, auto import textwrap from cached_property import cached_property +import abc import asyncpg import os import boto3 @@ -28,18 +30,13 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import make_dsn, parse_dsn -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, cast, Union, Tuple from typing_extensions import Literal import requests import backoff # type: ignore -from .utils import (etcd_path, - get_self_dir, - mkdir_if_needed, - subprocess_capture, - lsn_from_hex, - lsn_to_hex) +from .utils import (etcd_path, get_self_dir, subprocess_capture, lsn_from_hex, lsn_to_hex) from fixtures.log_helper import log """ This file contains pytest fixtures. A fixture is a test resource that can be @@ -49,7 +46,7 @@ A fixture is created with the decorator @pytest.fixture decorator. See docs: https://docs.pytest.org/en/6.2.x/fixture.html There are several environment variables that can control the running of tests: -ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. +NEON_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information. There's no need to import this file to use it. It should be declared as a plugin inside conftest.py, and that makes it available to all tests. 
@@ -74,20 +71,20 @@ def pytest_addoption(parser): "--skip-interfering-proc-check", dest="skip_interfering_proc_check", action="store_true", - help="skip check for interferring processes", + help="skip check for interfering processes", ) # These are set in pytest_configure() base_dir = "" -zenith_binpath = "" +neon_binpath = "" pg_distrib_dir = "" top_output_dir = "" def check_interferring_processes(config): if config.getoption("skip_interfering_proc_check"): - warnings.warn("interferring process check is skipped") + warnings.warn("interfering process check is skipped") return # does not use -c as it is not supported on macOS @@ -99,14 +96,14 @@ def check_interferring_processes(config): # result of the test. # NOTE this shows as an internal pytest error, there might be a better way raise Exception( - 'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.' + 'Found interfering processes running. Stop all Neon pageservers, nodes, safekeepers, as well as stand-alone Postgres.' ) def pytest_configure(config): """ Ensure that no unwanted daemons are running before we start testing. - Check that we do not owerflow available ports range. + Check that we do not overflow available ports range. """ check_interferring_processes(config) @@ -126,7 +123,7 @@ def pytest_configure(config): top_output_dir = env_test_output else: top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR) - mkdir_if_needed(top_output_dir) + pathlib.Path(top_output_dir).mkdir(exist_ok=True) # Find the postgres installation. global pg_distrib_dir @@ -145,25 +142,25 @@ def pytest_configure(config): raise Exception('postgres not found at "{}"'.format(pg_distrib_dir)) if os.getenv("REMOTE_ENV"): - # we are in remote env and do not have zenith binaries locally + # we are in remote env and do not have neon binaries locally # this is the case for benchmarks run on self-hosted runner return - # Find the zenith binaries. 
- global zenith_binpath - env_zenith_bin = os.environ.get('ZENITH_BIN') - if env_zenith_bin: - zenith_binpath = env_zenith_bin + # Find the neon binaries. + global neon_binpath + env_neon_bin = os.environ.get('NEON_BIN') + if env_neon_bin: + neon_binpath = env_neon_bin else: - zenith_binpath = os.path.join(base_dir, 'target/debug') - log.info(f'zenith_binpath is {zenith_binpath}') - if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')): - raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath)) + neon_binpath = os.path.join(base_dir, 'target/debug') + log.info(f'neon_binpath is {neon_binpath}') + if not os.path.exists(os.path.join(neon_binpath, 'pageserver')): + raise Exception('neon binaries not found at "{}"'.format(neon_binpath)) def profiling_supported(): """Return True if the pageserver was compiled with the 'profiling' feature """ - bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') res = subprocess.run([bin_pageserver, '--version'], check=True, universal_newlines=True, @@ -222,7 +219,7 @@ def can_bind(host: str, port: int) -> bool: # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the # moment. If that changes, we should use start using SO_REUSEADDR here # too, to allow reusing ports more quickly. 
- # See https://github.com/zenithdb/zenith/issues/801 + # See https://github.com/neondatabase/neon/issues/801 #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) try: @@ -328,7 +325,7 @@ class PgProtocol: # Convert options='-c=' to server_settings if 'options' in conn_options: options = conn_options.pop('options') - for match in re.finditer('-c(\w*)=(\w*)', options): + for match in re.finditer(r'-c(\w*)=(\w*)', options): key = match.group(1) val = match.group(2) if 'server_options' in conn_options: @@ -337,18 +334,30 @@ class PgProtocol: conn_options['server_settings'] = {key: val} return await asyncpg.connect(**conn_options) - def safe_psql(self, query: str, **kwargs: Any) -> List[Any]: + def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: """ Execute query against the node and return all rows. This method passes all extra params to connstr. """ + return self.safe_psql_many([query], **kwargs)[0] + def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + """ + Execute queries against the node and return all rows. + This method passes all extra params to connstr. 
+ """ + result: List[List[Any]] = [] with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: - cur.execute(query) - if cur.description is None: - return [] # query didn't return data - return cast(List[Any], cur.fetchall()) + for query in queries: + log.info(f"Executing query: {query}") + cur.execute(query) + + if cur.description is None: + result.append([]) # query didn't return data + else: + result.append(cast(List[Any], cur.fetchall())) + return result @dataclass @@ -421,52 +430,102 @@ class MockS3Server: def secret_key(self) -> str: return 'test' + def access_env_vars(self) -> Dict[Any, Any]: + return { + 'AWS_ACCESS_KEY_ID': self.access_key(), + 'AWS_SECRET_ACCESS_KEY': self.secret_key(), + } + def kill(self): self.subprocess.kill() -class ZenithEnvBuilder: - """ - Builder object to create a Zenith runtime environment +@dataclass +class LocalFsStorage: + local_path: Path - You should use the `zenith_env_builder` or `zenith_simple_env` pytest - fixture to create the ZenithEnv object. 
That way, the repository is + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + endpoint: Optional[str] + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage): + if isinstance(remote_storage, LocalFsStorage): + res = f"local_path='{remote_storage.local_path}'" + elif isinstance(remote_storage, S3Storage): + res = f"bucket_name='{remote_storage.bucket_name}', bucket_region='{remote_storage.bucket_region}'" + if remote_storage.endpoint is not None: + res += f", endpoint='{remote_storage.endpoint}'" + else: + raise Exception(f'Unknown storage configuration {remote_storage}') + else: + raise Exception("invalid remote storage type") + return f"{{{res}}}" + + +class RemoteStorageUsers(Flag): + PAGESERVER = auto() + SAFEKEEPER = auto() + + +class NeonEnvBuilder: + """ + Builder object to create a Neon runtime environment + + You should use the `neon_env_builder` or `neon_simple_env` pytest + fixture to create the NeonEnv object. That way, the repository is created in the right directory, based on the test name, and it's properly cleaned up after the test has finished. 
""" - def __init__(self, - repo_dir: Path, - port_distributor: PortDistributor, - broker: Etcd, - mock_s3_server: MockS3Server, - remote_storage: Optional[RemoteStorage] = None, - pageserver_config_override: Optional[str] = None, - num_safekeepers: int = 1, - pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None, - default_branch_name=DEFAULT_BRANCH_NAME): + def __init__( + self, + repo_dir: Path, + port_distributor: PortDistributor, + broker: Etcd, + mock_s3_server: MockS3Server, + remote_storage: Optional[RemoteStorage] = None, + remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER, + pageserver_config_override: Optional[str] = None, + num_safekeepers: int = 1, + # Use non-standard SK ids to check for various parsing bugs + safekeepers_id_start: int = 0, + # fsync is disabled by default to make the tests go faster + safekeepers_enable_fsync: bool = False, + auth_enabled: bool = False, + rust_log_override: Optional[str] = None, + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor self.remote_storage = remote_storage + self.remote_storage_users = remote_storage_users self.broker = broker self.mock_s3_server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers - self.pageserver_auth_enabled = pageserver_auth_enabled + self.safekeepers_id_start = safekeepers_id_start + self.safekeepers_enable_fsync = safekeepers_enable_fsync + self.auth_enabled = auth_enabled self.default_branch_name = default_branch_name - self.env: Optional[ZenithEnv] = None + self.env: Optional[NeonEnv] = None - def init(self) -> ZenithEnv: + def init(self) -> NeonEnv: # Cannot create more than one environment from one builder assert self.env is None, "environment already initialized" - self.env = ZenithEnv(self) + self.env = NeonEnv(self) return self.env def start(self): self.env.start() - 
def init_start(self) -> ZenithEnv: + def init_start(self) -> NeonEnv: env = self.init() self.start() return env @@ -497,9 +556,9 @@ class ZenithEnvBuilder: aws_access_key_id=self.mock_s3_server.access_key(), aws_secret_access_key=self.mock_s3_server.secret_key(), ).create_bucket(Bucket=bucket_name) - self.remote_storage = S3Storage(bucket=bucket_name, + self.remote_storage = S3Storage(bucket_name=bucket_name, endpoint=mock_endpoint, - region=mock_region) + bucket_region=mock_region) def __enter__(self): return self @@ -515,12 +574,12 @@ class ZenithEnvBuilder: self.env.pageserver.stop(immediate=True) -class ZenithEnv: +class NeonEnv: """ - An object representing the Zenith runtime environment. It consists of + An object representing the Neon runtime environment. It consists of the page server, 0-N safekeepers, and the compute nodes. - ZenithEnv contains functions for stopping/starting nodes in the + NeonEnv contains functions for stopping/starting nodes in the environment, checking their status, creating tenants, connecting to the nodes, creating and destroying compute nodes, etc. The page server and the safekeepers are considered fixed in the environment, you cannot @@ -528,7 +587,7 @@ class ZenithEnv: likely change in the future, as we start supporting multiple page servers and adding/removing safekeepers on the fly). - Some notable functions and fields in ZenithEnv: + Some notable functions and fields in NeonEnv: postgres - A factory object for creating postgres compute nodes. 
@@ -542,23 +601,24 @@ class ZenithEnv: initial_tenant - tenant ID of the initial tenant created in the repository - zenith_cli - can be used to run the 'zenith' CLI tool + neon_cli - can be used to run the 'neon' CLI tool create_tenant() - initializes a new tenant in the page server, returns the tenant id """ - def __init__(self, config: ZenithEnvBuilder): + def __init__(self, config: NeonEnvBuilder): self.repo_dir = config.repo_dir self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor self.s3_mock_server = config.mock_s3_server - self.zenith_cli = ZenithCli(env=self) + self.neon_cli = NeonCli(env=self) self.postgres = PostgresFactory(self) self.safekeepers: List[Safekeeper] = [] self.broker = config.broker self.remote_storage = config.remote_storage + self.remote_storage_users = config.remote_storage_users - # generate initial tenant ID here instead of letting 'zenith init' generate it, + # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
self.initial_tenant = uuid.uuid4() @@ -578,7 +638,7 @@ class ZenithEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - pageserver_auth_type = "ZenithJWT" if config.pageserver_auth_enabled else "Trust" + pageserver_auth_type = "ZenithJWT" if config.auth_enabled else "Trust" toml += textwrap.dedent(f""" [pageserver] @@ -588,10 +648,10 @@ class ZenithEnv: auth_type = '{pageserver_auth_type}' """) - # Create a corresponding ZenithPageserver object - self.pageserver = ZenithPageserver(self, - port=pageserver_port, - config_override=config.pageserver_config_override) + # Create a corresponding NeonPageserver object + self.pageserver = NeonPageserver(self, + port=pageserver_port, + config_override=config.pageserver_config_override) # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): @@ -599,19 +659,27 @@ class ZenithEnv: pg=self.port_distributor.get_port(), http=self.port_distributor.get_port(), ) - id = i # assign ids sequentially + id = config.safekeepers_id_start + i # assign ids sequentially toml += textwrap.dedent(f""" [[safekeepers]] id = {id} pg_port = {port.pg} http_port = {port.http} - sync = false # Disable fsyncs to make the tests go faster - """) + sync = {'true' if config.safekeepers_enable_fsync else 'false'}""") + if config.auth_enabled: + toml += textwrap.dedent(f""" + auth_enabled = true + """) + if bool(self.remote_storage_users + & RemoteStorageUsers.SAFEKEEPER) and self.remote_storage is not None: + toml += textwrap.dedent(f""" + remote_storage = "{remote_storage_to_toml_inline_table(self.remote_storage)}" + """) safekeeper = Safekeeper(env=self, id=id, port=port) self.safekeepers.append(safekeeper) log.info(f"Config: {toml}") - self.zenith_cli.init(toml) + self.neon_cli.init(toml) def start(self): # Start up broker, pageserver and all safekeepers @@ -636,10 +704,10 @@ class ZenithEnv: def _shared_simple_env(request: Any, port_distributor: PortDistributor, 
mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[ZenithEnv]: + default_broker: Etcd) -> Iterator[NeonEnv]: """ - Internal fixture backing the `zenith_simple_env` fixture. If TEST_SHARED_FIXTURES - is set, this is shared by all tests using `zenith_simple_env`. + # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES + is set, this is shared by all tests using `neon_simple_env`. """ if os.environ.get('TEST_SHARED_FIXTURES') is None: @@ -650,23 +718,23 @@ def _shared_simple_env(request: Any, repo_dir = os.path.join(str(top_output_dir), "shared_repo") shutil.rmtree(repo_dir, ignore_errors=True) - with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: env = builder.init_start() # For convenience in tests, create a branch from the freshly-initialized cluster. - env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) + env.neon_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME) yield env @pytest.fixture(scope='function') -def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: +def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: """ - Simple Zenith environment, with no authentication and no safekeepers. + Simple Neon environment, with no authentication and no safekeepers. If TEST_SHARED_FIXTURES environment variable is set, we reuse the same - environment for all tests that use 'zenith_simple_env', keeping the + environment for all tests that use 'neon_simple_env', keeping the page server and safekeepers running. Any compute nodes are stopped after each the test, however. 
""" @@ -676,17 +744,17 @@ def zenith_simple_env(_shared_simple_env: ZenithEnv) -> Iterator[ZenithEnv]: @pytest.fixture(scope='function') -def zenith_env_builder(test_output_dir, - port_distributor: PortDistributor, - mock_s3_server: MockS3Server, - default_broker: Etcd) -> Iterator[ZenithEnvBuilder]: +def neon_env_builder(test_output_dir, + port_distributor: PortDistributor, + mock_s3_server: MockS3Server, + default_broker: Etcd) -> Iterator[NeonEnvBuilder]: """ - Fixture to create a Zenith environment for test. + Fixture to create a Neon environment for test. - To use, define 'zenith_env_builder' fixture in your test to get access to the + To use, define 'neon_env_builder' fixture in your test to get access to the builder object. Set properties on it to describe the environment. Finally, initialize and start up the environment by calling - zenith_env_builder.init_start(). + neon_env_builder.init_start(). After the initialization, you can launch compute nodes by calling the functions in the 'env.postgres' factory object, stop/start the @@ -697,16 +765,16 @@ def zenith_env_builder(test_output_dir, repo_dir = os.path.join(test_output_dir, "repo") # Return the builder to the caller - with ZenithEnvBuilder(Path(repo_dir), port_distributor, default_broker, - mock_s3_server) as builder: + with NeonEnvBuilder(Path(repo_dir), port_distributor, default_broker, + mock_s3_server) as builder: yield builder -class ZenithPageserverApiException(Exception): +class NeonPageserverApiException(Exception): pass -class ZenithPageserverHttpClient(requests.Session): +class NeonPageserverHttpClient(requests.Session): def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port @@ -723,23 +791,54 @@ class ZenithPageserverHttpClient(requests.Session): msg = res.json()['msg'] except: msg = '' - raise ZenithPageserverApiException(msg) from e + raise NeonPageserverApiException(msg) from e def check_status(self): 
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + def tenant_list(self) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach", + f"http://localhost:{self.port}/v1/tenant", + json={ + 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, + }, ) self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return uuid.UUID(new_tenant_id) + + def tenant_attach(self, tenant_id: uuid.UUID): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/attach") + self.verbose_error(res) - def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach", - ) + def tenant_detach(self, tenant_id: uuid.UUID): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/detach") self.verbose_error(res) + def tenant_status(self, tenant_id: uuid.UUID) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[str, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + def timeline_create( self, tenant_id: uuid.UUID, @@ 
-764,34 +863,6 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def tenant_list(self) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json={ - 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return uuid.UUID(new_tenant_id) - - def timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" @@ -801,6 +872,14 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def timeline_delete(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + def wal_receiver_get(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/wal_receiver" @@ -822,20 +901,6 @@ class PageserverPort: http: int -@dataclass -class 
LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket: str - region: str - endpoint: Optional[str] - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", re.MULTILINE) CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P[^']+)'", @@ -844,14 +909,89 @@ TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P[^\s]+)\s\[(?P 'subprocess.CompletedProcess[str]': + """ + Run the command with the specified arguments. + + Arguments must be in list form, e.g. ['pg', 'create'] + + Return both stdout and stderr, which can be accessed as + + >>> result = env.neon_cli.raw_cli(...) + >>> assert result.stderr == "" + >>> log.info(result.stdout) + + If `check_return_code`, on non-zero exit code logs failure and raises. + """ + + assert type(arguments) == list + assert type(self.COMMAND) == str + + bin_neon = os.path.join(str(neon_binpath), self.COMMAND) + + args = [bin_neon] + arguments + log.info('Running command "{}"'.format(' '.join(args))) + log.info(f'Running in "{self.env.repo_dir}"') + + env_vars = os.environ.copy() + env_vars['NEON_REPO_DIR'] = str(self.env.repo_dir) + env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) + if self.env.rust_log_override is not None: + env_vars['RUST_LOG'] = self.env.rust_log_override + for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value + + # Pass coverage settings + var = 'LLVM_PROFILE_FILE' + val = os.environ.get(var) + if val: + env_vars[var] = val + + # Intercept CalledProcessError and print more info + res = subprocess.run(args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if not res.returncode: + log.info(f"Run success: {res.stdout}") + elif check_return_code: + # this way command output will be in recorded and shown in CI in failure message + msg = f"""\ + Run {res.args} failed: + stdout: {res.stdout} + 
stderr: {res.stderr} + """ + log.info(msg) + raise Exception(msg) from subprocess.CalledProcessError(res.returncode, + res.args, + res.stdout, + res.stderr) + return res + + +class NeonCli(AbstractNeonCli): + """ + A typed wrapper around the `neon` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - def __init__(self, env: ZenithEnv): - self.env = env - pass + + COMMAND = 'neon_local' def create_tenant(self, tenant_id: Optional[uuid.UUID] = None, @@ -935,7 +1075,7 @@ class ZenithCli: created_timeline_id = matches.group('timeline_id') if created_timeline_id is None: - raise Exception('could not find timeline id after `zenith timeline create` invocation') + raise Exception('could not find timeline id after `neon timeline create` invocation') else: return uuid.UUID(created_timeline_id) @@ -967,13 +1107,13 @@ class ZenithCli: created_timeline_id = matches.group('timeline_id') if created_timeline_id is None: - raise Exception('could not find timeline id after `zenith timeline create` invocation') + raise Exception('could not find timeline id after `neon timeline create` invocation') else: return uuid.UUID(created_timeline_id) def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: """ - Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output. + Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
""" # (L) main [b49f7954224a0ad25cc0013ea107b54b] @@ -998,6 +1138,7 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=cmd, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) res = self.raw_cli(cmd) @@ -1005,7 +1146,7 @@ class ZenithCli: return res def pageserver_enabled_features(self) -> Any: - bin_pageserver = os.path.join(str(zenith_binpath), 'pageserver') + bin_pageserver = os.path.join(str(neon_binpath), 'pageserver') args = [bin_pageserver, '--enabled-features'] log.info('Running command "{}"'.format(' '.join(args))) @@ -1022,14 +1163,10 @@ class ZenithCli: append_pageserver_param_overrides( params_to_update=start_args, remote_storage=self.env.remote_storage, + remote_storage_users=self.env.remote_storage_users, pageserver_config_override=self.env.pageserver.config_override) - s3_env_vars = None - if self.env.s3_mock_server: - s3_env_vars = { - 'AWS_ACCESS_KEY_ID': self.env.s3_mock_server.access_key(), - 'AWS_SECRET_ACCESS_KEY': self.env.s3_mock_server.secret_key(), - } + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None return self.raw_cli(start_args, extra_env_vars=s3_env_vars) def pageserver_stop(self, immediate=False) -> 'subprocess.CompletedProcess[str]': @@ -1041,14 +1178,15 @@ class ZenithCli: return self.raw_cli(cmd) def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', str(id)]) + s3_env_vars = self.env.s3_mock_server.access_env_vars() if self.env.s3_mock_server else None + return self.raw_cli(['safekeeper', 'start', str(id)], extra_env_vars=s3_env_vars) def safekeeper_stop(self, id: Optional[int] = None, immediate=False) -> 'subprocess.CompletedProcess[str]': args = ['safekeeper', 'stop'] if id is not None: - args.extend(str(id)) + args.append(str(id)) if immediate: args.extend(['-m', 'immediate']) return 
self.raw_cli(args) @@ -1109,6 +1247,7 @@ class ZenithCli: node_name: str, tenant_id: Optional[uuid.UUID] = None, destroy=False, + check_return_code=True, ) -> 'subprocess.CompletedProcess[str]': args = [ 'pg', @@ -1121,85 +1260,41 @@ class ZenithCli: if node_name is not None: args.append(node_name) - return self.raw_cli(args) - - def raw_cli(self, - arguments: List[str], - extra_env_vars: Optional[Dict[str, str]] = None, - check_return_code=True) -> 'subprocess.CompletedProcess[str]': - """ - Run "zenith" with the specified arguments. - - Arguments must be in list form, e.g. ['pg', 'create'] - - Return both stdout and stderr, which can be accessed as - - >>> result = env.zenith_cli.raw_cli(...) - >>> assert result.stderr == "" - >>> log.info(result.stdout) - """ - - assert type(arguments) == list - - bin_zenith = os.path.join(str(zenith_binpath), 'neon_local') - - args = [bin_zenith] + arguments - log.info('Running command "{}"'.format(' '.join(args))) - log.info(f'Running in "{self.env.repo_dir}"') - - env_vars = os.environ.copy() - env_vars['ZENITH_REPO_DIR'] = str(self.env.repo_dir) - env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir) - if self.env.rust_log_override is not None: - env_vars['RUST_LOG'] = self.env.rust_log_override - for (extra_env_key, extra_env_value) in (extra_env_vars or {}).items(): - env_vars[extra_env_key] = extra_env_value - - # Pass coverage settings - var = 'LLVM_PROFILE_FILE' - val = os.environ.get(var) - if val: - env_vars[var] = val - - # Intercept CalledProcessError and print more info - try: - res = subprocess.run(args, - env=env_vars, - check=True, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - log.info(f"Run success: {res.stdout}") - except subprocess.CalledProcessError as exc: - # this way command output will be in recorded and shown in CI in failure message - msg = f"""\ - Run failed: {exc} - stdout: {exc.stdout} - stderr: {exc.stderr} - """ - log.info(msg) - - raise Exception(msg) from exc 
- - if check_return_code: - res.check_returncode() - return res + return self.raw_cli(args, check_return_code=check_return_code) -class ZenithPageserver(PgProtocol): +class WalCraft(AbstractNeonCli): + """ + A typed wrapper around the `wal_craft` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + """ + + COMMAND = 'wal_craft' + + def postgres_config(self) -> List[str]: + res = self.raw_cli(["print-postgres-config"]) + res.check_returncode() + return res.stdout.split('\n') + + def in_existing(self, type: str, connection: str) -> None: + res = self.raw_cli(["in-existing", type, connection]) + res.check_returncode() + + +class NeonPageserver(PgProtocol): """ An object representing a running pageserver. - Initializes the repository via `zenith init`. + Initializes the repository via `neon init`. """ - def __init__(self, env: ZenithEnv, port: PageserverPort, config_override: Optional[str] = None): - super().__init__(host='localhost', port=port.pg, user='zenith_admin') + def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None): + super().__init__(host='localhost', port=port.pg, user='cloud_admin') self.env = env self.running = False self.service_port = port self.config_override = config_override - def start(self, overrides=()) -> 'ZenithPageserver': + def start(self, overrides=()) -> 'NeonPageserver': """ Start the page server. `overrides` allows to add some config to this pageserver start. @@ -1207,17 +1302,17 @@ class ZenithPageserver(PgProtocol): """ assert self.running == False - self.env.zenith_cli.pageserver_start(overrides=overrides) + self.env.neon_cli.pageserver_start(overrides=overrides) self.running = True return self - def stop(self, immediate=False) -> 'ZenithPageserver': + def stop(self, immediate=False) -> 'NeonPageserver': """ Stop the page server. Returns self. 
""" if self.running: - self.env.zenith_cli.pageserver_stop(immediate) + self.env.neon_cli.pageserver_stop(immediate) self.running = False return self @@ -1227,8 +1322,8 @@ class ZenithPageserver(PgProtocol): def __exit__(self, exc_type, exc, tb): self.stop(True) - def http_client(self, auth_token: Optional[str] = None) -> ZenithPageserverHttpClient: - return ZenithPageserverHttpClient( + def http_client(self, auth_token: Optional[str] = None) -> NeonPageserverHttpClient: + return NeonPageserverHttpClient( port=self.service_port.http, auth_token=auth_token, ) @@ -1237,22 +1332,13 @@ class ZenithPageserver(PgProtocol): def append_pageserver_param_overrides( params_to_update: List[str], remote_storage: Optional[RemoteStorage], + remote_storage_users: RemoteStorageUsers, pageserver_config_override: Optional[str] = None, ): - if remote_storage is not None: - if isinstance(remote_storage, LocalFsStorage): - pageserver_storage_override = f"local_path='{remote_storage.root}'" - elif isinstance(remote_storage, S3Storage): - pageserver_storage_override = f"bucket_name='{remote_storage.bucket}',\ - bucket_region='{remote_storage.region}'" - - if remote_storage.endpoint is not None: - pageserver_storage_override += f",endpoint='{remote_storage.endpoint}'" - - else: - raise Exception(f'Unknown storage configuration {remote_storage}') + if bool(remote_storage_users & RemoteStorageUsers.PAGESERVER) and remote_storage is not None: + remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) params_to_update.append( - f'--pageserver-config-override=remote_storage={{{pageserver_storage_override}}}') + f'--pageserver-config-override=remote_storage={remote_storage_toml_table}') env_overrides = os.getenv('ZENITH_PAGESERVER_OVERRIDES') if env_overrides is not None: @@ -1269,7 +1355,7 @@ def append_pageserver_param_overrides( class PgBin: """ A helper class for executing postgres binaries """ - def __init__(self, log_dir: str): + def __init__(self, log_dir: Path): 
self.log_dir = log_dir self.pg_bin_path = os.path.join(str(pg_distrib_dir), 'bin') self.env = os.environ.copy() @@ -1320,21 +1406,28 @@ class PgBin: self._fixpath(command) log.info('Running command "{}"'.format(' '.join(command))) env = self._build_env(env) - return subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True, **kwargs) + return subprocess_capture(str(self.log_dir), + command, + env=env, + cwd=cwd, + check=True, + **kwargs) @pytest.fixture(scope='function') -def pg_bin(test_output_dir: str) -> PgBin: +def pg_bin(test_output_dir: Path) -> PgBin: return PgBin(test_output_dir) class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int): + def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): super().__init__(host='localhost', port=port, dbname='postgres') self.pgdatadir = pgdatadir self.pg_bin = pg_bin self.running = False - self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) + if init: + self.pg_bin.run_capture(['initdb', '-D', str(pgdatadir)]) + self.configure([f"port = {port}\n"]) def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" @@ -1349,12 +1442,13 @@ class VanillaPostgres(PgProtocol): if log_path is None: log_path = os.path.join(self.pgdatadir, "pg.log") - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, '-l', log_path, 'start']) + self.pg_bin.run_capture( + ['pg_ctl', '-w', '-D', str(self.pgdatadir), '-l', log_path, 'start']) def stop(self): assert self.running self.running = False - self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) + self.pg_bin.run_capture(['pg_ctl', '-w', '-D', str(self.pgdatadir), 'stop']) def get_subdir_size(self, subdir) -> int: """Return size of pgdatadir subdirectory in bytes.""" @@ -1369,10 +1463,12 @@ class VanillaPostgres(PgProtocol): @pytest.fixture(scope='function') -def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]: - pgdatadir = os.path.join(test_output_dir, 
"pgdata-vanilla") +def vanilla_pg(test_output_dir: Path, + port_distributor: PortDistributor) -> Iterator[VanillaPostgres]: + pgdatadir = test_output_dir / "pgdata-vanilla" pg_bin = PgBin(test_output_dir) - with VanillaPostgres(pgdatadir, pg_bin, 5432) as vanilla_pg: + port = port_distributor.get_port() + with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: yield vanilla_pg @@ -1393,7 +1489,7 @@ class RemotePostgres(PgProtocol): raise Exception('cannot stop a remote Postgres instance') def get_subdir_size(self, subdir) -> int: - # TODO: Could use the server's Generic File Acccess functions if superuser. + # TODO: Could use the server's Generic File Access functions if superuser. # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE raise Exception('cannot get size of a Postgres instance') @@ -1406,7 +1502,7 @@ class RemotePostgres(PgProtocol): @pytest.fixture(scope='function') -def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: +def remote_pg(test_output_dir: Path) -> Iterator[RemotePostgres]: pg_bin = PgBin(test_output_dir) connstr = os.getenv("BENCHMARK_CONNSTR") @@ -1417,8 +1513,8 @@ def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]: yield remote_pg -class ZenithProxy(PgProtocol): - def __init__(self, port: int): +class NeonProxy(PgProtocol): + def __init__(self, port: int, pg_port: int): super().__init__(host="127.0.0.1", user="proxy_user", password="pytest2", @@ -1427,18 +1523,20 @@ class ZenithProxy(PgProtocol): self.http_port = 7001 self.host = "127.0.0.1" self.port = port + self.pg_port = pg_port self._popen: Optional[subprocess.Popen[bytes]] = None - def start_static(self, addr="127.0.0.1:5432") -> None: + def start(self) -> None: assert self._popen is None # Start proxy - bin_proxy = os.path.join(str(zenith_binpath), 'proxy') + bin_proxy = os.path.join(str(neon_binpath), 'proxy') args = [bin_proxy] args.extend(["--http", f"{self.host}:{self.http_port}"]) args.extend(["--proxy", 
f"{self.host}:{self.port}"]) args.extend(["--auth-backend", "postgres"]) - args.extend(["--auth-endpoint", "postgres://proxy_auth:pytest1@localhost:5432/postgres"]) + args.extend( + ["--auth-endpoint", f"postgres://proxy_auth:pytest1@localhost:{self.pg_port}/postgres"]) self._popen = subprocess.Popen(args) self._wait_until_ready() @@ -1457,27 +1555,34 @@ class ZenithProxy(PgProtocol): @pytest.fixture(scope='function') -def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]: - """Zenith proxy that routes directly to vanilla postgres.""" +def static_proxy(vanilla_pg, port_distributor) -> Iterator[NeonProxy]: + """Neon proxy that routes directly to vanilla postgres.""" vanilla_pg.start() vanilla_pg.safe_psql("create user proxy_auth with password 'pytest1' superuser") vanilla_pg.safe_psql("create user proxy_user with password 'pytest2'") - with ZenithProxy(4432) as proxy: - proxy.start_static() + port = port_distributor.get_port() + pg_port = vanilla_pg.default_options['port'] + with NeonProxy(port, pg_port) as proxy: + proxy.start() yield proxy class Postgres(PgProtocol): """ An object representing a running postgres daemon. 
""" - def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int): - super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres') + def __init__(self, + env: NeonEnv, + tenant_id: uuid.UUID, + port: int, + check_stop_result: bool = True): + super().__init__(host='localhost', port=port, user='cloud_admin', dbname='postgres') self.env = env self.running = False self.node_name: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id self.port = port + self.check_stop_result = check_stop_result # path to conf is /pgdatadirs/tenants///postgresql.conf def create( @@ -1496,11 +1601,11 @@ class Postgres(PgProtocol): config_lines = [] self.node_name = node_name or f'{branch_name}_pg_node' - self.env.zenith_cli.pg_create(branch_name, - node_name=self.node_name, - tenant_id=self.tenant_id, - lsn=lsn, - port=self.port) + self.env.neon_cli.pg_create(branch_name, + node_name=self.node_name, + tenant_id=self.tenant_id, + lsn=lsn, + port=self.port) path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -1524,13 +1629,11 @@ class Postgres(PgProtocol): log.info(f"Starting postgres node {self.node_name}") - run_result = self.env.zenith_cli.pg_start(self.node_name, - tenant_id=self.tenant_id, - port=self.port) + run_result = self.env.neon_cli.pg_start(self.node_name, + tenant_id=self.tenant_id, + port=self.port) self.running = True - log.info(f"stdout: {run_result.stdout}") - return self def pg_data_dir_path(self) -> str: @@ -1564,14 +1667,12 @@ class Postgres(PgProtocol): for cfg_line in cfg_lines: # walproposer uses different application_name if ("synchronous_standby_names" in cfg_line or - # don't ask pageserver to fetch WAL from compute - "callmemaybe_connstring" in cfg_line or # don't repeat safekeepers/wal_acceptors multiple times - "wal_acceptors" in cfg_line): + 
"safekeepers" in cfg_line): continue f.write(cfg_line) f.write("synchronous_standby_names = 'walproposer'\n") - f.write("wal_acceptors = '{}'\n".format(safekeepers)) + f.write("safekeepers = '{}'\n".format(safekeepers)) return self def config(self, lines: List[str]) -> 'Postgres': @@ -1596,7 +1697,9 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) + self.env.neon_cli.pg_stop(self.node_name, + self.tenant_id, + check_return_code=self.check_stop_result) self.running = False return self @@ -1608,7 +1711,10 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) + self.env.neon_cli.pg_stop(self.node_name, + self.tenant_id, + True, + check_return_code=self.check_stop_result) self.node_name = None self.running = False @@ -1627,6 +1733,8 @@ class Postgres(PgProtocol): Returns self. """ + started_at = time.time() + self.create( branch_name=branch_name, node_name=node_name, @@ -1634,6 +1742,8 @@ class Postgres(PgProtocol): lsn=lsn, ).start() + log.info(f"Postgres startup took {time.time() - started_at} seconds") + return self def __enter__(self): @@ -1645,7 +1755,7 @@ class Postgres(PgProtocol): class PostgresFactory: """ An object representing multiple running postgres daemons. """ - def __init__(self, env: ZenithEnv): + def __init__(self, env: NeonEnv): self.env = env self.num_instances = 0 self.instances: List[Postgres] = [] @@ -1716,15 +1826,14 @@ class SafekeeperPort: @dataclass class Safekeeper: """ An object representing a running safekeeper daemon. 
""" - env: ZenithEnv + env: NeonEnv port: SafekeeperPort id: int - auth_token: Optional[str] = None running: bool = False def start(self) -> 'Safekeeper': assert self.running == False - self.env.zenith_cli.safekeeper_start(self.id) + self.env.neon_cli.safekeeper_start(self.id) self.running = True # wait for wal acceptor start by checking its status started_at = time.time() @@ -1744,7 +1853,7 @@ class Safekeeper: def stop(self, immediate=False) -> 'Safekeeper': log.info('Stopping safekeeper {}'.format(self.id)) - self.env.zenith_cli.safekeeper_stop(self.id, immediate) + self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -1775,8 +1884,8 @@ class Safekeeper: assert isinstance(res, dict) return res - def http_client(self) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http) + def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: + return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -1786,8 +1895,9 @@ class Safekeeper: class SafekeeperTimelineStatus: acceptor_epoch: int flush_lsn: str - remote_consistent_lsn: str timeline_start_lsn: str + backup_lsn: str + remote_consistent_lsn: str @dataclass @@ -1799,9 +1909,15 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port + self.auth_token = auth_token + + if auth_token is not None: + self.headers['Authorization'] = f'Bearer {auth_token}' def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() @@ -1812,8 +1928,9 @@ class SafekeeperHttpClient(requests.Session): resj = res.json() return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'], flush_lsn=resj['flush_lsn'], - 
remote_consistent_lsn=resj['remote_consistent_lsn'], - timeline_start_lsn=resj['timeline_start_lsn']) + timeline_start_lsn=resj['timeline_start_lsn'], + backup_lsn=resj['backup_lsn'], + remote_consistent_lsn=resj['remote_consistent_lsn']) def record_safekeeper_info(self, tenant_id: str, timeline_id: str, body): res = self.post( @@ -1864,9 +1981,12 @@ class Etcd: datadir: str port: int peer_port: int - binary_path: Path = etcd_path() + binary_path: Path = field(init=False) handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + def __post_init__(self): + self.binary_path = etcd_path() + def client_url(self): return f'http://127.0.0.1:{self.port}' @@ -1893,7 +2013,11 @@ class Etcd: f"--data-dir={self.datadir}", f"--listen-client-urls={client_url}", f"--advertise-client-urls={client_url}", - f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}" + f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", + # Set --quota-backend-bytes to keep the etcd virtual memory + # size smaller. Our test etcd clusters are very small. + # See https://github.com/etcd-io/etcd/issues/7910 + f"--quota-backend-bytes=100000000" ] self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) @@ -1916,17 +2040,19 @@ class Etcd: self.handle.wait() -def get_test_output_dir(request: Any) -> str: +def get_test_output_dir(request: Any) -> pathlib.Path: """ Compute the working directory for an individual test. """ test_name = request.node.name - test_dir = os.path.join(str(top_output_dir), test_name) + test_dir = pathlib.Path(top_output_dir) / test_name.replace("/", "-") log.info(f'get_test_output_dir is {test_dir}') + # make mypy happy + assert isinstance(test_dir, pathlib.Path) return test_dir # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. 
It also solves a problem with the -# zenith_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it +# neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it # creates the repo in the test output directory. But it cannot depend on # 'test_output_dir' fixture, because when TEST_SHARED_FIXTURES is not set, # it has 'session' scope and cannot access fixtures with 'function' @@ -1934,14 +2060,14 @@ def get_test_output_dir(request: Any) -> str: # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. @pytest.fixture(scope='function', autouse=True) -def test_output_dir(request: Any) -> str: +def test_output_dir(request: Any) -> pathlib.Path: """ Create the working directory for an individual test. """ # one directory per test test_dir = get_test_output_dir(request) log.info(f'test_output_dir is {test_dir}') shutil.rmtree(test_dir, ignore_errors=True) - mkdir_if_needed(test_dir) + test_dir.mkdir() return test_dir @@ -1987,7 +2113,7 @@ def should_skip_file(filename: str) -> bool: # # Test helpers # -def list_files_to_compare(pgdata_dir: str): +def list_files_to_compare(pgdata_dir: pathlib.Path): pgdata_files = [] for root, _file, filenames in os.walk(pgdata_dir): for filename in filenames: @@ -2004,20 +2130,20 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup -def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): +def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, pg: Postgres): # Get the timeline ID. 
We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: - cur.execute("SHOW zenith.zenith_timeline") + cur.execute("SHOW neon.timeline_id") timeline = cur.fetchone()[0] # stop postgres to ensure that files won't change pg.stop() # Take a basebackup from pageserver - restored_dir_path = os.path.join(env.repo_dir, f"{pg.node_name}_restored_datadir") - mkdir_if_needed(restored_dir_path) + restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir" + restored_dir_path.mkdir(exist_ok=True) pg_bin = PgBin(test_output_dir) psql_path = os.path.join(pg_bin.pg_bin_path, 'psql') @@ -2044,7 +2170,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos # list files we're going to compare assert pg.pgdata_dir - pgdata_files = list_files_to_compare(pg.pgdata_dir) + pgdata_files = list_files_to_compare(pathlib.Path(pg.pgdata_dir)) restored_files = list_files_to_compare(restored_dir_path) # check that file sets are equal @@ -2076,7 +2202,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos assert (mismatch, error) == ([], []) -def wait_until(number_of_iterations: int, interval: int, func): +def wait_until(number_of_iterations: int, interval: float, func): """ Wait until 'func' returns successfully, without exception. Returns the last return value from the the function. 
@@ -2094,22 +2220,30 @@ def wait_until(number_of_iterations: int, interval: int, func): raise Exception("timed out while waiting for %s" % func) from last_exception -def assert_local(pageserver_http_client: ZenithPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID): +def assert_timeline_local(pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, + timeline: uuid.UUID): timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline) assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail return timeline_detail -def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def assert_no_in_progress_downloads_for_tenant( + pageserver_http_client: NeonPageserverHttpClient, + tenant: uuid.UUID, +): + tenant_status = pageserver_http_client.tenant_status(tenant) + assert tenant_status['has_in_progress_downloads'] is False, tenant_status + + +def remote_consistent_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) if detail['remote'] is None: # No remote information at all. This happens right after creating - # a timeline, before any part of it it has been uploaded to remote + # a timeline, before any part of it has been uploaded to remote # storage yet. 
return 0 else: @@ -2118,7 +2252,7 @@ def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient, return lsn_from_hex(lsn_str) -def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, +def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID, lsn: int): @@ -2134,7 +2268,7 @@ def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient, lsn_to_hex(lsn), lsn_to_hex(current_lsn))) -def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID) -> int: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -2144,7 +2278,7 @@ def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, return lsn_from_hex(lsn_str) -def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient, +def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID, lsn: int): diff --git a/test_runner/fixtures/pg_stats.py b/test_runner/fixtures/pg_stats.py new file mode 100644 index 0000000000..e113d37248 --- /dev/null +++ b/test_runner/fixtures/pg_stats.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + + +class PgStatTable: + table: str + columns: List[str] + additional_query: str + + def __init__(self, table: str, columns: List[str], filter_query: str = ""): + self.table = table + self.columns = columns + self.additional_query = filter_query + + @property + def query(self) -> str: + return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}" + + +@pytest.fixture(scope='function') +def pg_stats_rw() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_database", + ["tup_returned", "tup_fetched", "tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_ro() -> 
List[PgStatTable]: + return [ + PgStatTable("pg_stat_database", ["tup_returned", "tup_fetched"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_wo() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_database", ["tup_inserted", "tup_updated", "tup_deleted"], + "WHERE datname='postgres'"), + ] + + +@pytest.fixture(scope='function') +def pg_stats_wal() -> List[PgStatTable]: + return [ + PgStatTable("pg_stat_wal", + ["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"], + "") + ] diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ba9bc6e113..c49fa08d77 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -3,7 +3,7 @@ import shutil import subprocess from pathlib import Path -from typing import Any, List, Optional +from typing import Any, List from fixtures.log_helper import log @@ -12,18 +12,6 @@ def get_self_dir() -> str: return os.path.dirname(os.path.abspath(__file__)) -def mkdir_if_needed(path: str) -> None: - """ Create a directory if it doesn't already exist - - Note this won't try to create intermediate directories. 
- """ - try: - os.mkdir(path) - except FileExistsError: - pass - assert os.path.isdir(path) - - def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: """ Run a process and capture its output @@ -95,6 +83,9 @@ def get_dir_size(path: str) -> int: totalbytes = 0 for root, dirs, files in os.walk(path): for name in files: - totalbytes += os.path.getsize(os.path.join(root, name)) + try: + totalbytes += os.path.getsize(os.path.join(root, name)) + except FileNotFoundError as e: + pass # file could be concurrently removed return totalbytes diff --git a/test_runner/zenith_regress/.gitignore b/test_runner/neon_regress/.gitignore similarity index 100% rename from test_runner/zenith_regress/.gitignore rename to test_runner/neon_regress/.gitignore diff --git a/test_runner/zenith_regress/README.md b/test_runner/neon_regress/README.md similarity index 56% rename from test_runner/zenith_regress/README.md rename to test_runner/neon_regress/README.md index 61e3aad04e..b23a55462e 100644 --- a/test_runner/zenith_regress/README.md +++ b/test_runner/neon_regress/README.md @@ -1,7 +1,7 @@ To add a new SQL test -- add sql script to run to zenith_regress/sql/testname.sql -- add expected output to zenith_regress/expected/testname.out +- add sql script to run to neon_regress/sql/testname.sql +- add expected output to neon_regress/expected/testname.out - add testname to parallel_schedule That's it. 
diff --git a/test_runner/zenith_regress/expected/.gitignore b/test_runner/neon_regress/expected/.gitignore similarity index 100% rename from test_runner/zenith_regress/expected/.gitignore rename to test_runner/neon_regress/expected/.gitignore diff --git a/test_runner/zenith_regress/expected/zenith-cid.out b/test_runner/neon_regress/expected/neon-cid.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-cid.out rename to test_runner/neon_regress/expected/neon-cid.out diff --git a/test_runner/zenith_regress/expected/zenith-clog.out b/test_runner/neon_regress/expected/neon-clog.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-clog.out rename to test_runner/neon_regress/expected/neon-clog.out diff --git a/test_runner/zenith_regress/expected/zenith-rel-truncate.out b/test_runner/neon_regress/expected/neon-rel-truncate.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-rel-truncate.out rename to test_runner/neon_regress/expected/neon-rel-truncate.out diff --git a/test_runner/zenith_regress/expected/zenith-vacuum-full.out b/test_runner/neon_regress/expected/neon-vacuum-full.out similarity index 100% rename from test_runner/zenith_regress/expected/zenith-vacuum-full.out rename to test_runner/neon_regress/expected/neon-vacuum-full.out diff --git a/test_runner/zenith_regress/parallel_schedule b/test_runner/neon_regress/parallel_schedule similarity index 71% rename from test_runner/zenith_regress/parallel_schedule rename to test_runner/neon_regress/parallel_schedule index f64bf8a034..569c7b5066 100644 --- a/test_runner/zenith_regress/parallel_schedule +++ b/test_runner/neon_regress/parallel_schedule @@ -4,7 +4,7 @@ # number of connections needed to run the tests. 
# ---------- -test: zenith-cid -test: zenith-rel-truncate -test: zenith-clog -test: zenith-vacuum-full +test: neon-cid +test: neon-rel-truncate +test: neon-clog +test: neon-vacuum-full diff --git a/test_runner/zenith_regress/sql/.gitignore b/test_runner/neon_regress/sql/.gitignore similarity index 100% rename from test_runner/zenith_regress/sql/.gitignore rename to test_runner/neon_regress/sql/.gitignore diff --git a/test_runner/zenith_regress/sql/zenith-cid.sql b/test_runner/neon_regress/sql/neon-cid.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-cid.sql rename to test_runner/neon_regress/sql/neon-cid.sql diff --git a/test_runner/zenith_regress/sql/zenith-clog.sql b/test_runner/neon_regress/sql/neon-clog.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-clog.sql rename to test_runner/neon_regress/sql/neon-clog.sql diff --git a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql b/test_runner/neon_regress/sql/neon-rel-truncate.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-rel-truncate.sql rename to test_runner/neon_regress/sql/neon-rel-truncate.sql diff --git a/test_runner/zenith_regress/sql/zenith-vacuum-full.sql b/test_runner/neon_regress/sql/neon-vacuum-full.sql similarity index 100% rename from test_runner/zenith_regress/sql/zenith-vacuum-full.sql rename to test_runner/neon_regress/sql/neon-vacuum-full.sql diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 3b57ac73cc..6a5bad8757 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from 
fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare # @@ -15,8 +15,8 @@ from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare # 3. Disk space used # 4. Peak memory usage # -def test_bulk_insert(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_bulk_insert(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 0e16d3e749..fe3c3afe37 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -2,7 +2,7 @@ import timeit from fixtures.benchmark_fixture import MetricReport import pytest -from fixtures.zenith_fixtures import ZenithEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder # Run bulk tenant creation test. 
# @@ -13,33 +13,29 @@ from fixtures.zenith_fixtures import ZenithEnvBuilder @pytest.mark.parametrize('tenants_count', [1, 5, 10]) -@pytest.mark.parametrize('use_safekeepers', ['with_wa', 'without_wa']) def test_bulk_tenant_create( - zenith_env_builder: ZenithEnvBuilder, - use_safekeepers: str, + neon_env_builder: NeonEnvBuilder, tenants_count: int, zenbenchmark, ): - """Measure tenant creation time (with and without wal acceptors)""" - if use_safekeepers == 'with_wa': - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() time_slices = [] for i in range(tenants_count): start = timeit.default_timer() - tenant, _ = env.zenith_cli.create_tenant() - env.zenith_cli.create_timeline( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + tenant, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
#if use_safekeepers == 'with_sa': # wa_factory.start_n_new(3) - pg_tenant = env.postgres.create_start( - f'test_bulk_tenant_create_{tenants_count}_{i}_{use_safekeepers}', tenant_id=tenant) + pg_tenant = env.postgres.create_start(f'test_bulk_tenant_create_{tenants_count}_{i}', + tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/test_runner/performance/test_compare_pg_stats.py b/test_runner/performance/test_compare_pg_stats.py new file mode 100644 index 0000000000..a8a9e3cd4d --- /dev/null +++ b/test_runner/performance/test_compare_pg_stats.py @@ -0,0 +1,89 @@ +import os +from typing import List + +import pytest +from fixtures.compare_fixtures import PgCompare +from fixtures.pg_stats import PgStatTable + +from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix + + +def get_seeds_matrix(default: int = 100): + seeds = os.getenv("TEST_PG_BENCH_SEEDS_MATRIX", default=str(default)) + return list(map(int, seeds.split(","))) + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_rw: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_rw): + env.pg_bin.run_capture( + ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wo: List[PgStatTable]): + env = 
neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wo): + env.pg_bin.run_capture( + ['pgbench', '-N', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_ro: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_ro): + env.pg_bin.run_capture( + ['pgbench', '-S', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + env.flush() + + +@pytest.mark.parametrize("seed", get_seeds_matrix()) +@pytest.mark.parametrize("scale", get_scales_matrix()) +@pytest.mark.parametrize("duration", get_durations_matrix(5)) +def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare, + seed: int, + scale: int, + duration: int, + pg_stats_wal: List[PgStatTable]): + env = neon_with_baseline + # initialize pgbench + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + with env.record_pg_stats(pg_stats_wal): + env.pg_bin.run_capture( + ['pgbench', f'-T{duration}', f'--random-seed={seed}', env.pg.connstr()]) + env.flush() diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index e04a0361cb..ad088684d5 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture 
import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from io import BufferedReader, RawIOBase from itertools import repeat @@ -41,8 +41,8 @@ def copy_test_data(rows: int): # # COPY performance tests. # -def test_copy(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_copy(neon_with_baseline: PgCompare): + env = neon_with_baseline # Get the timeline ID of our branch. We need it for the pageserver 'checkpoint' command with closing(env.pg.connect()) as conn: diff --git a/test_runner/performance/test_gist_build.py b/test_runner/performance/test_gist_build.py index 92396f6cb7..839eb3f57d 100644 --- a/test_runner/performance/test_gist_build.py +++ b/test_runner/performance/test_gist_build.py @@ -1,8 +1,8 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log @@ -11,8 +11,8 @@ from fixtures.log_helper import log # As of this writing, we're duplicate those giant WAL records for each page, # which makes the delta layer about 32x larger than it needs to be. 
# -def test_gist_buffering_build(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_gist_buffering_build(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_hot_page.py b/test_runner/performance/test_hot_page.py index 2042b0d548..d3da0310ce 100644 --- a/test_runner/performance/test_hot_page.py +++ b/test_runner/performance/test_hot_page.py @@ -8,7 +8,7 @@ from pytest_lazyfixture import lazy_fixture # type: ignore "env", [ # The test is too slow to run in CI, but fast enough to run with remote tests - pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), ]) @@ -18,6 +18,7 @@ def test_hot_page(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t, f;') # Write many updates to the same row with env.record_duration('write'): diff --git a/test_runner/performance/test_hot_table.py b/test_runner/performance/test_hot_table.py index 11e047b8c3..997c772f88 100644 --- a/test_runner/performance/test_hot_table.py +++ b/test_runner/performance/test_hot_table.py @@ -8,7 +8,7 @@ from pytest_lazyfixture import lazy_fixture # type: ignore "env", [ # The test is too slow to run in CI, but fast enough to run with remote tests - pytest.param(lazy_fixture("zenith_compare"), id="zenith", marks=pytest.mark.slow), + pytest.param(lazy_fixture("neon_compare"), id="neon", marks=pytest.mark.slow), pytest.param(lazy_fixture("vanilla_compare"), id="vanilla", marks=pytest.mark.slow), pytest.param(lazy_fixture("remote_compare"), id="remote", marks=pytest.mark.remote_cluster), ]) @@ -20,6 +20,7 @@ def 
test_hot_table(env: PgCompare): with closing(env.pg.connect()) as conn: with conn.cursor() as cur: + cur.execute('drop table if exists t;') # Write many updates to a small table with env.record_duration('write'): diff --git a/test_runner/performance/test_parallel_copy_to.py b/test_runner/performance/test_parallel_copy_to.py index e4388ce8e2..d4e74ce195 100644 --- a/test_runner/performance/test_parallel_copy_to.py +++ b/test_runner/performance/test_parallel_copy_to.py @@ -1,10 +1,10 @@ from io import BytesIO import asyncio import asyncpg -from fixtures.zenith_fixtures import ZenithEnv, Postgres, PgProtocol +from fixtures.neon_fixtures import NeonEnv, Postgres, PgProtocol from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare async def repeat_bytes(buf, repetitions: int): @@ -36,9 +36,9 @@ async def parallel_load_different_tables(pg: PgProtocol, n_parallel: int): # Load 5 different tables in parallel with COPY TO -def test_parallel_copy_different_tables(zenith_with_baseline: PgCompare, n_parallel=5): +def test_parallel_copy_different_tables(neon_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() @@ -65,8 +65,8 @@ async def parallel_load_same_table(pg: PgProtocol, n_parallel: int): # Load data into one table with COPY TO from 5 parallel connections -def test_parallel_copy_same_table(zenith_with_baseline: PgCompare, n_parallel=5): - env = zenith_with_baseline +def test_parallel_copy_same_table(neon_with_baseline: PgCompare, n_parallel=5): + env = neon_with_baseline conn = env.pg.connect() cur = conn.cursor() diff --git a/test_runner/performance/test_perf_pgbench.py 
b/test_runner/performance/test_perf_pgbench.py index fc10ca4d6c..8644ced6d9 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -1,8 +1,8 @@ from contextlib import closing -from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv, profiling_supported -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import PgBin, VanillaPostgres, NeonEnv, profiling_supported +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare -from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, ZenithBenchmarker +from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, NeonBenchmarker from fixtures.log_helper import log from pathlib import Path @@ -78,32 +78,30 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int): # Run simple-update workload run_pgbench(env, - "simple-update", - ['pgbench', '-n', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + "simple-update", ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) # Run SELECT workload run_pgbench(env, - "select-only", - ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()]) + "select-only", ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', env.pg.connstr()]) env.report_size() -def get_durations_matrix(): - durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default="45") +def get_durations_matrix(default: int = 45): + durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default)) return list(map(int, durations.split(","))) -def get_scales_matrix(): - scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default="10") +def get_scales_matrix(default: int = 10): + scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default)) return list(map(int, scales.split(","))) -# Run the pgbench tests against vanilla Postgres and zenith +# Run the pgbench tests against vanilla 
Postgres and neon @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): - run_test_pgbench(zenith_with_baseline, scale, duration) +def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int): + run_test_pgbench(neon_with_baseline, scale, duration) # Run the pgbench tests, and generate a flamegraph from it @@ -114,18 +112,18 @@ def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int): # can see how much overhead the profiling adds. @pytest.mark.parametrize("scale", get_scales_matrix()) @pytest.mark.parametrize("duration", get_durations_matrix()) -def test_pgbench_flamegraph(zenbenchmark, pg_bin, zenith_env_builder, scale: int, duration: int): - zenith_env_builder.num_safekeepers = 1 - zenith_env_builder.pageserver_config_override = ''' +def test_pgbench_flamegraph(zenbenchmark, pg_bin, neon_env_builder, scale: int, duration: int): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.pageserver_config_override = ''' profiling="page_requests" ''' if not profiling_supported(): pytest.skip("pageserver was built without 'profiling' feature") - env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("empty", "main") + env = neon_env_builder.init_start() + env.neon_cli.create_branch("empty", "main") - run_test_pgbench(ZenithCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) + run_test_pgbench(NeonCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration) # Run the pgbench tests against an existing Postgres cluster diff --git a/test_runner/performance/test_random_writes.py b/test_runner/performance/test_random_writes.py index 205388bd90..4350386dd0 100644 --- a/test_runner/performance/test_random_writes.py +++ b/test_runner/performance/test_random_writes.py @@ -1,8 +1,8 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport 
-from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log import psycopg2.extras @@ -16,14 +16,14 @@ import time # A naive pageserver implementation would create a full image layer for each # dirty segment, leading to write_amplification = segment_size / page_size, # when compared to vanilla postgres. With segment_size = 10MB, that's 1250. -def test_random_writes(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_random_writes(neon_with_baseline: PgCompare): + env = neon_with_baseline # Number of rows in the test database. 1M rows runs quickly, but implies # a small effective_checkpoint_distance, which makes the test less realistic. # Using a 300 TB database would imply a 250 MB effective_checkpoint_distance, # but it will take a very long time to run. From what I've seen so far, - # increasing n_rows doesn't have impact on the (zenith_runtime / vanilla_runtime) + # increasing n_rows doesn't have impact on the (neon_runtime / vanilla_runtime) # performance ratio. n_rows = 1 * 1000 * 1000 # around 36 MB table @@ -65,7 +65,7 @@ def test_random_writes(zenith_with_baseline: PgCompare): env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) # Decide how much to write, based on knowledge of pageserver implementation. - # Avoiding segment collisions maximizes (zenith_runtime / vanilla_runtime). + # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime). 
segment_size = 10 * 1024 * 1024 n_segments = table_size // segment_size n_writes = load_factor * n_segments // 3 diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 85d0a24510..8ed31cb480 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -2,9 +2,9 @@ # from contextlib import closing from dataclasses import dataclass -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log -from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import PgCompare import pytest @@ -20,8 +20,8 @@ import pytest pytest.param(10000000, 1, 0), pytest.param(10000000, 1, 4) ]) -def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int): - env = zenith_with_baseline +def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 53b6a3a4fc..1cfd128e9b 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -1,17 +1,17 @@ import pytest from contextlib import closing -from fixtures.zenith_fixtures import ZenithEnvBuilder -from fixtures.benchmark_fixture import ZenithBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.benchmark_fixture import NeonBenchmarker # This test sometimes runs for longer than the global 5 minute timeout. 
@pytest.mark.timeout(600) -def test_startup(zenith_env_builder: ZenithEnvBuilder, zenbenchmark: ZenithBenchmarker): - zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init_start() +def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() # Start - env.zenith_cli.create_branch('test_startup') + env.neon_cli.create_branch('test_startup') with zenbenchmark.record_duration("startup_time"): pg = env.postgres.create_start('test_startup') pg.safe_psql("select 1;") diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py new file mode 100644 index 0000000000..2a79a778fc --- /dev/null +++ b/test_runner/performance/test_wal_backpressure.py @@ -0,0 +1,266 @@ +import statistics +import threading +import time +import timeit +from typing import Callable + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare +from fixtures.log_helper import log +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.utils import lsn_from_hex + +from performance.test_perf_pgbench import (get_durations_matrix, get_scales_matrix) + + +@pytest.fixture(params=["vanilla", "neon_off", "neon_on"]) +# This fixture constructs multiple `PgCompare` interfaces using a builder pattern. +# The builder parameters are encoded in the fixture's param. +# For example, to build a `NeonCompare` interface, the corresponding fixture's param should have +# a format of `neon_{safekeepers_enable_fsync}`. +# Note that, here "_" is used to separate builder parameters. 
+def pg_compare(request) -> PgCompare: + x = request.param.split("_") + + if x[0] == "vanilla": + # `VanillaCompare` interface + fixture = request.getfixturevalue("vanilla_compare") + assert isinstance(fixture, VanillaCompare) + + return fixture + else: + assert len(x) == 2, f"request param ({request.param}) should have a format of \ + `neon_{{safekeepers_enable_fsync}}`" + + # `NeonCompare` interface + neon_env_builder = request.getfixturevalue("neon_env_builder") + assert isinstance(neon_env_builder, NeonEnvBuilder) + + zenbenchmark = request.getfixturevalue("zenbenchmark") + assert isinstance(zenbenchmark, NeonBenchmarker) + + pg_bin = request.getfixturevalue("pg_bin") + assert isinstance(pg_bin, PgBin) + + neon_env_builder.safekeepers_enable_fsync = x[1] == "on" + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) + + branch_name = request.node.name + return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + + +def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): + """Start an intensive write workload across multiple tables. + + ## Single table workload: + At each step, insert new `new_rows_each_update` rows. + The variable `new_rows_each_update` is equal to `scale * 100_000`. 
+ The number of steps is determined by `num_iters` variable.""" + new_rows_each_update = scale * 100_000 + + def start_single_table_workload(table_id: int): + for _ in range(num_iters): + with env.pg.connect().cursor() as cur: + cur.execute( + f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})" + ) + + with env.record_duration("run_duration"): + threads = [ + threading.Thread(target=start_single_table_workload, args=(i, )) + for i in range(n_tables) + ] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("n_tables", [5]) +@pytest.mark.parametrize("scale", get_scales_matrix(5)) +@pytest.mark.parametrize("num_iters", [10]) +def test_heavy_write_workload(pg_compare: PgCompare, n_tables: int, scale: int, num_iters: int): + env = pg_compare + + # Initializes test tables + with env.pg.connect().cursor() as cur: + for i in range(n_tables): + cur.execute( + f"CREATE TABLE t{i}(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')" + ) + cur.execute(f"INSERT INTO t{i} (key) VALUES (0)") + + workload_thread = threading.Thread(target=start_heavy_write_workload, + args=(env, n_tables, scale, num_iters)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT * from t0 where key = 0") + workload_thread.join() + record_thread.join() + + +def start_pgbench_simple_update_workload(env: PgCompare, duration: int): + with env.record_duration("run_duration"): + env.pg_bin.run_capture([ + 'pgbench', + '-j10', + '-c10', + '-N', + f'-T{duration}', + env.pg.connstr(options="-csynchronous_commit=off") + ]) + env.flush() + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("scale", get_scales_matrix(100)) +@pytest.mark.parametrize("duration", 
get_durations_matrix()) +def test_pgbench_simple_update_workload(pg_compare: PgCompare, scale: int, duration: int): + env = pg_compare + + # initialize pgbench tables + env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()]) + env.flush() + + workload_thread = threading.Thread(target=start_pgbench_simple_update_workload, + args=(env, duration)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, + lambda: workload_thread.is_alive(), + "SELECT * from pgbench_accounts where aid = 1") + workload_thread.join() + record_thread.join() + + +def start_pgbench_intensive_initialization(env: PgCompare, scale: int): + with env.record_duration("run_duration"): + # Needs to increase the statement timeout (default: 120s) because the + # initialization step can be slow with a large scale. + env.pg_bin.run_capture([ + 'pgbench', + f'-s{scale}', + '-i', + '-Idtg', + env.pg.connstr(options='-cstatement_timeout=300s') + ]) + + +@pytest.mark.timeout(1000) +@pytest.mark.parametrize("scale", get_scales_matrix(1000)) +def test_pgbench_intensive_init_workload(pg_compare: PgCompare, scale: int): + env = pg_compare + with env.pg.connect().cursor() as cur: + cur.execute("CREATE TABLE foo as select generate_series(1,100000)") + + workload_thread = threading.Thread(target=start_pgbench_intensive_initialization, + args=(env, scale)) + workload_thread.start() + + record_thread = threading.Thread(target=record_lsn_write_lag, + args=(env, lambda: workload_thread.is_alive())) + record_thread.start() + + record_read_latency(env, lambda: workload_thread.is_alive(), "SELECT count(*) from foo") + workload_thread.join() + record_thread.join() + + +def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_interval: float = 1.0): + if not isinstance(env, NeonCompare): + return + + lsn_write_lags = [] + last_received_lsn = 0 + 
last_pg_flush_lsn = 0 + + with env.pg.connect().cursor() as cur: + cur.execute("CREATE EXTENSION neon") + + while run_cond(): + cur.execute(''' + select pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn), + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_flush_lsn(),received_lsn)), + pg_current_wal_flush_lsn(), + received_lsn + from backpressure_lsns(); + ''') + + res = cur.fetchone() + lsn_write_lags.append(res[0]) + + curr_received_lsn = lsn_from_hex(res[3]) + lsn_process_speed = (curr_received_lsn - last_received_lsn) / (1024**2) + last_received_lsn = curr_received_lsn + + curr_pg_flush_lsn = lsn_from_hex(res[2]) + lsn_produce_speed = (curr_pg_flush_lsn - last_pg_flush_lsn) / (1024**2) + last_pg_flush_lsn = curr_pg_flush_lsn + + log.info( + f"received_lsn_lag={res[1]}, pg_flush_lsn={res[2]}, received_lsn={res[3]}, lsn_process_speed={lsn_process_speed:.2f}MB/s, lsn_produce_speed={lsn_produce_speed:.2f}MB/s" + ) + + time.sleep(pool_interval) + + env.zenbenchmark.record("lsn_write_lag_max", + float(max(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("lsn_write_lag_avg", + float(statistics.mean(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("lsn_write_lag_stdev", + float(statistics.stdev(lsn_write_lags) / (1024**2)), + "MB", + MetricReport.LOWER_IS_BETTER) + + +def record_read_latency(env: PgCompare, + run_cond: Callable[[], bool], + read_query: str, + read_interval: float = 1.0): + read_latencies = [] + + with env.pg.connect().cursor() as cur: + while run_cond(): + try: + t1 = timeit.default_timer() + cur.execute(read_query) + t2 = timeit.default_timer() + + log.info( + f"Executed read query {read_query}, got {cur.fetchall()}, read time {t2-t1:.2f}s" + ) + read_latencies.append(t2 - t1) + except Exception as err: + log.error(f"Got error when executing the read query: {err}") + + time.sleep(read_interval) + + env.zenbenchmark.record("read_latency_max", + 
max(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("read_latency_avg", + statistics.mean(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) + env.zenbenchmark.record("read_latency_stdev", + statistics.stdev(read_latencies), + 's', + MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py index 49232bf6d3..1d729fd78f 100644 --- a/test_runner/performance/test_write_amplification.py +++ b/test_runner/performance/test_write_amplification.py @@ -13,13 +13,13 @@ import os from contextlib import closing from fixtures.benchmark_fixture import MetricReport -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare +from fixtures.neon_fixtures import NeonEnv +from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare from fixtures.log_helper import log -def test_write_amplification(zenith_with_baseline: PgCompare): - env = zenith_with_baseline +def test_write_amplification(neon_with_baseline: PgCompare): + env = neon_with_baseline with closing(env.pg.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/pg_clients/csharp/npgsql/.dockerignore b/test_runner/pg_clients/csharp/npgsql/.dockerignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.dockerignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/.gitignore b/test_runner/pg_clients/csharp/npgsql/.gitignore new file mode 100644 index 0000000000..cd42ee34e8 --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/.gitignore @@ -0,0 +1,2 @@ +bin/ +obj/ diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile new file mode 100644 index 0000000000..a78bc2f3bc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -0,0 +1,14 @@ +FROM 
mcr.microsoft.com/dotnet/sdk:6.0 AS build +WORKDIR /source + +COPY *.csproj . +RUN dotnet restore + +COPY . . +RUN dotnet publish -c release -o /app --no-restore + +FROM mcr.microsoft.com/dotnet/runtime:6.0 +WORKDIR /app +COPY --from=build /app . + +ENTRYPOINT ["dotnet", "csharp-npgsql.dll"] diff --git a/test_runner/pg_clients/csharp/npgsql/Program.cs b/test_runner/pg_clients/csharp/npgsql/Program.cs new file mode 100644 index 0000000000..17c2d5b81d --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/Program.cs @@ -0,0 +1,19 @@ +using Npgsql; + +var host = Environment.GetEnvironmentVariable("NEON_HOST"); +var database = Environment.GetEnvironmentVariable("NEON_DATABASE"); +var user = Environment.GetEnvironmentVariable("NEON_USER"); +var password = Environment.GetEnvironmentVariable("NEON_PASSWORD"); + +var connString = $"Host={host};Username={user};Password={password};Database={database}"; + +await using var conn = new NpgsqlConnection(connString); +await conn.OpenAsync(); + +await using (var cmd = new NpgsqlCommand("SELECT 1", conn)) +await using (var reader = await cmd.ExecuteReaderAsync()) +{ + while (await reader.ReadAsync()) + Console.WriteLine(reader.GetInt32(0)); +} +await conn.CloseAsync(); diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj new file mode 100644 index 0000000000..7c1f90c1fc --- /dev/null +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -0,0 +1,14 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + diff --git a/test_runner/pg_clients/java/jdbc/.gitignore b/test_runner/pg_clients/java/jdbc/.gitignore new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/.gitignore @@ -0,0 +1 @@ + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile new file mode 100644 index 0000000000..daad99c3a1 --- /dev/null +++ 
b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -0,0 +1,10 @@ +FROM openjdk:17 +WORKDIR /source + +COPY . . + +WORKDIR /app +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.4.0.jar && \ + javac -d /app /source/Example.java + +CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/java/jdbc/Example.java b/test_runner/pg_clients/java/jdbc/Example.java new file mode 100644 index 0000000000..410a971649 --- /dev/null +++ b/test_runner/pg_clients/java/jdbc/Example.java @@ -0,0 +1,31 @@ +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.Properties; + +public class Example +{ + public static void main( String[] args ) throws Exception + { + String host = System.getenv("NEON_HOST"); + String database = System.getenv("NEON_DATABASE"); + String user = System.getenv("NEON_USER"); + String password = System.getenv("NEON_PASSWORD"); + + String url = "jdbc:postgresql://%s/%s".formatted(host, database); + Properties props = new Properties(); + props.setProperty("user", user); + props.setProperty("password", password); + + Connection conn = DriverManager.getConnection(url, props); + Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery("SELECT 1"); + while (rs.next()) + { + System.out.println(rs.getString(1)); + } + rs.close(); + st.close(); + } +} diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile new file mode 100644 index 0000000000..10662f92d5 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . 
+ +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "asyncpg_example.py"] diff --git a/test_runner/pg_clients/python/asyncpg/asyncpg_example.py b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py new file mode 100755 index 0000000000..7f579ce672 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/asyncpg_example.py @@ -0,0 +1,30 @@ +#! /usr/bin/env python3 + +import asyncio +import os + +import asyncpg + + +async def run(**kwargs) -> asyncpg.Record: + conn = await asyncpg.connect( + **kwargs, + statement_cache_size=0, # Prepared statements doesn't work pgbouncer + ) + rv = await conn.fetchrow("SELECT 1") + await conn.close() + + return rv + + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + + loop = asyncio.new_event_loop() + row = loop.run_until_complete(run(**kwargs)) + + print(row[0]) diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt new file mode 100644 index 0000000000..edc57ecc81 --- /dev/null +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -0,0 +1 @@ +asyncpg==0.25.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile new file mode 100644 index 0000000000..eddf64df5b --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 +WORKDIR /source + +COPY . . 
+ +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +CMD ["python3", "pg8000_example.py"] diff --git a/test_runner/pg_clients/python/pg8000/README.md b/test_runner/pg_clients/python/pg8000/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/pg_clients/python/pg8000/pg8000_example.py b/test_runner/pg_clients/python/pg8000/pg8000_example.py new file mode 100755 index 0000000000..f463867f88 --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/pg8000_example.py @@ -0,0 +1,23 @@ +#! /usr/bin/env python3 + +import os +import ssl + +import pg8000.dbapi + +if __name__ == "__main__": + kwargs = { + k.lstrip("NEON_").lower(): v + for k in ("NEON_HOST", "NEON_DATABASE", "NEON_USER", "NEON_PASSWORD") + if (v := os.environ.get(k, None)) is not None + } + conn = pg8000.dbapi.connect( + **kwargs, + ssl_context=True, + ) + + cursor = conn.cursor() + cursor.execute("SELECT 1") + row = cursor.fetchone() + print(row[0]) + conn.close() diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt new file mode 100644 index 0000000000..1577712150 --- /dev/null +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -0,0 +1 @@ +pg8000==1.29.1 diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.dockerignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore new file mode 100644 index 0000000000..30bcfa4ed5 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/.gitignore @@ -0,0 +1 @@ +.build/ diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile 
b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile new file mode 100644 index 0000000000..8f9477bd6a --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -0,0 +1,11 @@ +FROM swift:5.6 AS build +RUN apt-get -q update && apt-get -q install -y libssl-dev +WORKDIR /source + +COPY . . +RUN swift build --configuration release + +FROM swift:5.6 +WORKDIR /app +COPY --from=build /source/.build/release/release . +CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved new file mode 100644 index 0000000000..478e31000e --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -0,0 +1,41 @@ +{ + "pins" : [ + { + "identity" : "bluesocket", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSocket.git", + "state" : { + "revision" : "dd924c3bc2c1c144c42b8dda3896f1a03115ded4", + "version" : "2.0.2" + } + }, + { + "identity" : "bluesslservice", + "kind" : "remoteSourceControl", + "location" : "https://github.com/IBM-Swift/BlueSSLService", + "state" : { + "revision" : "c249988fb748749739144e7f554710552acdc0bd", + "version" : "2.0.1" + } + }, + { + "identity" : "postgresclientkit", + "kind" : "remoteSourceControl", + "location" : "https://github.com/codewinsdotcom/PostgresClientKit.git", + "state" : { + "branch" : "v1.4.3", + "revision" : "beafedaea6dc9f04712e9a8547b77f47c406a47e" + } + }, + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser", + "state" : { + "revision" : "6b2aa2748a7881eebb9f84fb10c01293e15b52ca", + "version" : "0.5.0" + } + } + ], + "version" : 2 +} diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift new file mode 100644 index 
0000000000..0d40b28572 --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -0,0 +1,17 @@ +// swift-tools-version:5.6 +import PackageDescription + +let package = Package( + name: "PostgresClientKitExample", + dependencies: [ + .package( + url: "https://github.com/codewinsdotcom/PostgresClientKit.git", + revision: "v1.4.3" + ) + ], + targets: [ + .target( + name: "PostgresClientKitExample", + dependencies: [ "PostgresClientKit" ]) + ] +) diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift new file mode 100644 index 0000000000..c7518dd88c --- /dev/null +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Sources/PostgresClientKitExample/main.swift @@ -0,0 +1,38 @@ +import Foundation + +import PostgresClientKit + +do { + var configuration = PostgresClientKit.ConnectionConfiguration() + + let env = ProcessInfo.processInfo.environment + if let host = env["NEON_HOST"] { + configuration.host = host + } + if let database = env["NEON_DATABASE"] { + configuration.database = database + } + if let user = env["NEON_USER"] { + configuration.user = user + } + if let password = env["NEON_PASSWORD"] { + configuration.credential = .scramSHA256(password: password) + } + + let connection = try PostgresClientKit.Connection(configuration: configuration) + defer { connection.close() } + + let text = "SELECT 1;" + let statement = try connection.prepareStatement(text: text) + defer { statement.close() } + + let cursor = try statement.execute(parameterValues: [ ]) + defer { cursor.close() } + + for row in cursor { + let columns = try row.get().columns + print(columns[0]) + } +} catch { + print(error) +} diff --git a/test_runner/pg_clients/test_pg_clients.py b/test_runner/pg_clients/test_pg_clients.py new file mode 100644 index 0000000000..7dc7299791 --- /dev/null +++ 
b/test_runner/pg_clients/test_pg_clients.py @@ -0,0 +1,54 @@ +import os +import shutil +import subprocess +from pathlib import Path +from tempfile import NamedTemporaryFile +from urllib.parse import urlparse + +import pytest +from fixtures.neon_fixtures import RemotePostgres + + +@pytest.mark.remote_cluster +@pytest.mark.parametrize( + "client", + [ + "csharp/npgsql", + "java/jdbc", + "python/asyncpg", + pytest.param( + "python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281 + marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way")), + pytest.param( + "swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592 + marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported")), + "typescript/postgresql-client", + ], +) +def test_pg_clients(remote_pg: RemotePostgres, client: str): + conn_options = remote_pg.conn_options() + + env_file = None + with NamedTemporaryFile(mode="w", delete=False) as f: + env_file = f.name + f.write(f""" + NEON_HOST={conn_options["host"]} + NEON_DATABASE={conn_options["dbname"]} + NEON_USER={conn_options["user"]} + NEON_PASSWORD={conn_options["password"]} + """) + + image_tag = client.lower() + docker_bin = shutil.which("docker") + if docker_bin is None: + raise RuntimeError("docker is required for running this test") + + build_cmd = [ + docker_bin, "build", "--quiet", "--tag", image_tag, f"{Path(__file__).parent / client}" + ] + run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag] + + subprocess.run(build_cmd, check=True) + result = subprocess.run(run_cmd, check=True, capture_output=True, text=True) + + assert result.stdout.strip() == "1" diff --git a/test_runner/pg_clients/typescript/postgresql-client/.dockerignore b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ 
b/test_runner/pg_clients/typescript/postgresql-client/.dockerignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/.gitignore b/test_runner/pg_clients/typescript/postgresql-client/.gitignore new file mode 100644 index 0000000000..c2658d7d1b --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile new file mode 100644 index 0000000000..b57147503f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -0,0 +1,7 @@ +FROM node:16 +WORKDIR /source + +COPY . . +RUN npm clean-install + +CMD ["/source/index.js"] \ No newline at end of file diff --git a/test_runner/pg_clients/typescript/postgresql-client/index.js b/test_runner/pg_clients/typescript/postgresql-client/index.js new file mode 100755 index 0000000000..af4899baab --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/index.js @@ -0,0 +1,25 @@ +#! 
/usr/bin/env node + +import {Connection} from 'postgresql-client'; + +const params = { + "host": process.env.NEON_HOST, + "database": process.env.NEON_DATABASE, + "user": process.env.NEON_USER, + "password": process.env.NEON_PASSWORD, + "ssl": true, +} +for (const key in params) { + if (params[key] === undefined) { + delete params[key]; + } +} + +const connection = new Connection(params); +await connection.connect(); +const result = await connection.query( + 'select 1' +); +const rows = result.rows; +await connection.close(); +console.log(rows[0][0]); diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json new file mode 100644 index 0000000000..bb5b4a1378 --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -0,0 +1,262 @@ +{ + "name": "typescript", + "lockfileVersion": 2, + "requires": true, + "packages": { + "": { + "dependencies": { + "postgresql-client": "^2.1.3" + } + }, + "node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "dependencies": { + 
"doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + }, + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "node_modules/postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "dependencies": { + "obuf": "~1.1.2" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/power-tasks": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "dependencies": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + }, + "engines": { + "node": ">=14.0", + "npm": ">=7.0.0" + } + }, + "node_modules/putil-merge": { + "version": "3.8.0", + "resolved": 
"https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==", + "engines": { + "node": ">= 10.0" + } + }, + "node_modules/putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==", + "engines": { + "node": ">= 6.0" + } + }, + "node_modules/strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "dependencies": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + }, + "engines": { + "node": ">=14.0" + } + }, + "node_modules/ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "peerDependencies": { + "typescript": ">=4.0.0" + } + }, + "node_modules/typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + } + }, + "dependencies": { + "debug": { + "version": "4.3.4", + "resolved": 
"https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "requires": { + "ms": "2.1.2" + } + }, + "doublylinked": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.1.tgz", + "integrity": "sha512-Lpqb+qyHpR5Bew8xfKsxVYdjXEYAQ7HLp1IX47kHKmVCZeXErInytonjkL+kE+L4yaKSYEmDNR9MJYr5zwuAKA==" + }, + "lightning-pool": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-3.1.3.tgz", + "integrity": "sha512-OgWuoh0BBrikWx/mc/XwIKwC9HHTe/GU3XODLMBPibv7jv8u0o2gQFS7KVEg5U8Oufg6N7mkm8Y1RoiLER0zeQ==", + "requires": { + "doublylinked": "^2.4.3", + "putil-promisify": "^1.8.2" + } + }, + "ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==" + }, + "postgres-bytea": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "requires": { + "obuf": "~1.1.2" + } + }, + "postgresql-client": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.1.3.tgz", + "integrity": "sha512-36Ga6JzhydsRzcCRcA/Y2hrX9C9sI0wS6sgRNBlOGkOwACXQVybmhDM7mAUbi9cT00N39Ee7btR0eMCyD//5Xg==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "lightning-pool": "^3.1.3", + "postgres-bytea": "^3.0.0", + "power-tasks": "^0.8.0", + "putil-merge": "^3.8.0", + "putil-promisify": "^1.8.5", + "putil-varhelpers": "^1.6.4" + } + }, 
+ "power-tasks": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-0.8.0.tgz", + "integrity": "sha512-HhMcx+y5UkzlEmKslruz8uAU2Yq8CODJsFEMFsYMrGp5EzKpkNHGu0RNvBqyewKJDZHPNKtBSULsEAxMqQIBVQ==", + "requires": { + "debug": "^4.3.4", + "doublylinked": "^2.5.1", + "strict-typed-events": "^2.2.0" + } + }, + "putil-merge": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.8.0.tgz", + "integrity": "sha512-5tXPafJawWFoYZWLhkYXZ7IC/qkI45HgJsgv36lJBeq3qjFZfUITZE01CmWUBIlIn9f1yDiikqgYERARhVmgrg==" + }, + "putil-promisify": { + "version": "1.8.5", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.8.5.tgz", + "integrity": "sha512-DItclasWWZokvpq3Aiaq0iV7WC8isP/0o/8mhC0yV6CQ781N/7NQHA1VyOm6hfpeFEwIQoo1C4Yjc5eH0q6Jbw==" + }, + "putil-varhelpers": { + "version": "1.6.4", + "resolved": "https://registry.npmjs.org/putil-varhelpers/-/putil-varhelpers-1.6.4.tgz", + "integrity": "sha512-nM2nO1HS2yJUyPgz0grd2XZAM0Spr6/tt6F4xXeNDjByV00BV2mq6lZ+sDff8WIfQBI9Hn1Czh93H1xBvKESxw==" + }, + "strict-typed-events": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.2.0.tgz", + "integrity": "sha512-yvHRtEfRRV7TJWi9cLhMt4Mb12JtAwXXONltUlLCA3fRB0LRy94B4E4e2gIlXzT5nZHTZVpOjJNOshri3LZ5bw==", + "requires": { + "putil-promisify": "^1.8.5", + "ts-gems": "^2.0.0" + } + }, + "ts-gems": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.1.0.tgz", + "integrity": "sha512-5IqiG4nq1tsOhYPc4CwxA6bsE+TfU6uAABzf6bH4sdElgXpt/mlStvIYedvvtU7BM1+RRJxCaTLaaVFcCqNaiA==", + "requires": {} + }, + "typescript": { + "version": "4.7.4", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", + "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", + "peer": true + } + } +} diff --git 
a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json new file mode 100644 index 0000000000..5d8ca23a7f --- /dev/null +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -0,0 +1,6 @@ +{ + "type": "module", + "dependencies": { + "postgresql-client": "^2.1.3" + } +} diff --git a/test_runner/test_broken.py b/test_runner/test_broken.py index 56c735e87c..3960546689 100644 --- a/test_runner/test_broken.py +++ b/test_runner/test_broken.py @@ -1,7 +1,7 @@ import pytest import os -from fixtures.zenith_fixtures import ZenithEnv +from fixtures.neon_fixtures import NeonEnv from fixtures.log_helper import log """ Use this test to see what happens when tests fail. @@ -18,10 +18,10 @@ run_broken = pytest.mark.skipif(os.environ.get('RUN_BROKEN') is None, @run_broken -def test_broken(zenith_simple_env: ZenithEnv, pg_bin): - env = zenith_simple_env +def test_broken(neon_simple_env: NeonEnv, pg_bin): + env = neon_simple_env - env.zenith_cli.create_branch("test_broken", "empty") + env.neon_cli.create_branch("test_broken", "empty") env.postgres.create_start("test_broken") log.info('postgres is running') diff --git a/vendor/postgres b/vendor/postgres index dba273190e..9c99008445 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit dba273190e546c2a6345c38435e91780797c734f +Subproject commit 9c99008445dbccd8204f188e0933def507058eac diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 92877faef7..4dc7e4e157 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -33,7 +33,9 @@ itoa = { version = "0.4", features = ["i128", "std"] } libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } -num-integer = { version = "0.1", default-features = false, features = ["i128"] } +nom = { 
version = "7", features = ["alloc", "std"] } +num-bigint = { version = "0.4", features = ["std"] } +num-integer = { version = "0.1", default-features = false, features = ["i128", "std"] } num-traits = { version = "0.2", features = ["i128", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } @@ -41,10 +43,11 @@ regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cac regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } -tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } +time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "quickcheck", "quickcheck-dep", "std", "time-macros"] } +tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros", "winapi"] } tokio-util = { version = "0.7", features = ["codec", "io"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } -tracing-core = { version = "0.1", features = ["lazy_static", "std"] } +tracing-core = { version = "0.1", features = ["lazy_static", "std", "valuable"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } @@ -57,6 +60,7 @@ indexmap = { version = "1", default-features = false, features = ["std"] } libc = { version = "0.2", 
features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std", "use_std"] } +nom = { version = "7", features = ["alloc", "std"] } prost = { version = "0.10", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }