diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
index 8fcc3bd4af..a848077e6a 100644
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -14,7 +14,7 @@
 - [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
 - [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
 - [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
-- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
-- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
+- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1)
+- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time)
diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index 9102bcf3da..7ee43a3587 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -5,12 +5,12 @@ inputs:
   api_key:
     desctiption: 'Neon API key'
     required: true
-  environment:
-    desctiption: 'dev (aka captest) or staging'
-    required: true
   project_id:
     desctiption: 'ID of the Project to create Branch in'
     required: true
+  api_host:
+    desctiption: 'Neon API host'
+    default: console.stage.neon.tech
 outputs:
   dsn:
     description: 'Created Branch DSN (for main database)'
@@ -22,27 +22,6 @@ outputs:
 runs:
   using: "composite"
   steps:
-    - name: Parse Input
-      id: parse-input
-      shell: bash -euxo pipefail {0}
-      run: |
-        case "${ENVIRONMENT}" in
-          dev)
-            API_HOST=console.dev.neon.tech
-            ;;
-          staging)
-            API_HOST=console.stage.neon.tech
-            ;;
-          *)
-            echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
-            exit 1
-            ;;
-        esac
-
-        echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
-      env:
-        ENVIRONMENT: ${{ inputs.environment }}
-
     - name: Create New Branch
       id: create-branch
       shell: bash -euxo pipefail {0}
@@ -56,7 +35,12 @@ runs:
           --data "{
             \"branch\": {
               \"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\"
-            }
+            },
+            \"endpoints\": [
+              {
+                \"type\": \"read_write\"
+              }
+            ]
           }")

         if [ -z "${branch}" ]; then
@@ -84,8 +68,8 @@ runs:
         host=$(echo $branch | jq --raw-output '.endpoints[0].host')
         echo "host=${host}" >> $GITHUB_OUTPUT
       env:
+        API_HOST: ${{ inputs.api_host }}
         API_KEY: ${{ inputs.api_key }}
-        API_HOST: ${{ steps.parse-input.outputs.api_host }}
         PROJECT_ID: ${{ inputs.project_id }}

     - name: Get Role name
@@ -103,8 +87,8 @@ runs:
         role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
         echo "role_name=${role_name}" >> $GITHUB_OUTPUT
       env:
+        API_HOST: ${{ inputs.api_host }}
         API_KEY: ${{ inputs.api_key }}
-        API_HOST: ${{ steps.parse-input.outputs.api_host }}
         PROJECT_ID: ${{ inputs.project_id }}
         BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}

@@ -146,8 +130,8 @@ runs:
         echo "::add-mask::${dsn}"
         echo "dsn=${dsn}" >> $GITHUB_OUTPUT
       env:
+        API_HOST: ${{ inputs.api_host }}
         API_KEY: ${{ inputs.api_key }}
-        API_HOST: ${{ steps.parse-input.outputs.api_host }}
         PROJECT_ID: ${{ inputs.project_id }}
         BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
         ROLE_NAME: ${{ steps.role-name.outputs.role_name }}
diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml
index 3ca96ced11..5689093e2e 100644
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -5,40 +5,19 @@ inputs:
   api_key:
     desctiption: 'Neon API key'
     required: true
-  environment:
-    desctiption: 'dev (aka captest) or staging'
-    required: true
   project_id:
     desctiption: 'ID of the Project which should be deleted'
     required: true
   branch_id:
     desctiption: 'ID of the branch to delete'
     required: true
+  api_host:
+    desctiption: 'Neon API host'
+    default: console.stage.neon.tech
 runs:
   using: "composite"
   steps:
-    - name: Parse Input
-      id: parse-input
-      shell: bash -euxo pipefail {0}
-      run: |
-        case "${ENVIRONMENT}" in
-          dev)
-            API_HOST=console.dev.neon.tech
-            ;;
-          staging)
-            API_HOST=console.stage.neon.tech
-            ;;
-          *)
-            echo 2>&1 "Unknown environment=${ENVIRONMENT}. 
Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - - name: Delete Branch # Do not try to delete a branch if .github/actions/neon-project-create # or .github/actions/neon-branch-create failed before @@ -73,7 +52,7 @@ runs: exit 1 fi env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} BRANCH_ID: ${{ inputs.branch_id }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 8399d6c511..0480bfbc84 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -5,12 +5,16 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true region_id: desctiption: 'Region ID, if not set the project will be created in the default region' - required: false + default: aws-us-east-2 + postgres_version: + desctiption: 'Postgres version; default is 15' + default: 15 + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech + outputs: dsn: description: 'Created Project DSN (for main database)' @@ -22,31 +26,6 @@ outputs: runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - REGION_ID=${REGION_ID:-aws-eu-west-1} - ;; - staging) - API_HOST=console.stage.neon.tech - REGION_ID=${REGION_ID:-aws-us-east-2} - ;; - *) - echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - REGION_ID: ${{ inputs.region_id }} - - name: Create Neon Project id: create-neon-project # A shell without `set -x` to not to expose password/dsn in logs @@ -61,6 +40,7 @@ runs: --data "{ \"project\": { \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", + \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", \"settings\": { } } @@ -75,7 +55,10 @@ runs: project_id=$(echo $project | jq --raw-output '.project.id') echo "project_id=${project_id}" >> $GITHUB_OUTPUT + + echo "Project ${project_id} has been created" env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} - REGION_ID: ${{ steps.parse-input.outputs.region_id }} + REGION_ID: ${{ inputs.region_id }} + POSTGRES_VERSION: ${{ inputs.postgres_version }} diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index 88b5d3fc5b..adc8510a34 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -5,37 +5,16 @@ inputs: api_key: desctiption: 'Neon API key' required: true - environment: - desctiption: 'dev (aka captest) or staging' - required: true project_id: desctiption: 'ID of the Project to delete' required: true + api_host: + desctiption: 'Neon API host' + default: console.stage.neon.tech runs: using: "composite" steps: - - name: Parse Input - id: parse-input - shell: bash -euxo pipefail {0} - run: | - case "${ENVIRONMENT}" in - dev) - API_HOST=console.dev.neon.tech - ;; - staging) - API_HOST=console.stage.neon.tech - ;; - *) - echo 
2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only" - exit 1 - ;; - esac - - echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT - env: - ENVIRONMENT: ${{ inputs.environment }} - - name: Delete Neon Project # Do not try to delete a project if .github/actions/neon-project-create failed before if: ${{ inputs.project_id != '' }} @@ -48,7 +27,9 @@ runs: --header "Accept: application/json" \ --header "Content-Type: application/json" \ --header "Authorization: Bearer ${API_KEY}" + + echo "Project ${PROJECT_ID} has been deleted" env: + API_HOST: ${{ inputs.api_host }} API_KEY: ${{ inputs.api_key }} PROJECT_ID: ${{ inputs.project_id }} - API_HOST: ${{ steps.parse-input.outputs.api_host }} diff --git a/.github/ansible/ansible.cfg b/.github/ansible/ansible.cfg index 0497ee401d..5818a64455 100644 --- a/.github/ansible/ansible.cfg +++ b/.github/ansible/ansible.cfg @@ -3,7 +3,6 @@ localhost_warning = False host_key_checking = False timeout = 30 -collections_paths = ./collections [ssh_connection] ssh_args = -F ./ansible.ssh.cfg diff --git a/.github/ansible/neon-stress.hosts.yaml b/.github/ansible/neon-stress.hosts.yaml deleted file mode 100644 index dd61ac5a5e..0000000000 --- a/.github/ansible/neon-stress.hosts.yaml +++ /dev/null @@ -1,31 +0,0 @@ -storage: - vars: - bucket_name: neon-storage-ireland - bucket_region: eu-west-1 - console_mgmt_base_url: http://neon-stress-console.local - etcd_endpoints: neon-stress-etcd.local:2379 - safekeeper_enable_s3_offload: 'false' - pageserver_config_stub: - pg_distrib_dir: /usr/local - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: neon-stress/wal - hostname_suffix: ".local" - remote_user: admin - children: - pageservers: - hosts: - neon-stress-ps-1: - console_region_id: aws-eu-west-1 - neon-stress-ps-2: - console_region_id: aws-eu-west-1 - safekeepers: - hosts: - neon-stress-sk-1: - console_region_id: aws-eu-west-1 - neon-stress-sk-2: - console_region_id: aws-eu-west-1 - neon-stress-sk-3: - console_region_id: aws-eu-west-1 diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bb4af91f71..648029c120 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379 + broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: ap-southeast-1 ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1 console_region_id: aws-ap-southeast-1 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 68b1579746..c285a9f3b6 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379 + broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local 
remote_storage: @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: eu-central-1 ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1 console_region_id: aws-eu-central-1 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 1d54e2ef0a..1753068b8c 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-release.local - etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379 + broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: @@ -16,6 +16,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2 console_region_id: aws-us-east-2 + sentry_environment: production children: pageservers: diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml new file mode 100644 index 0000000000..9eb422a3ae --- /dev/null +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -0,0 +1,39 @@ +storage: + vars: + bucket_name: neon-prod-storage-us-west-2 + bucket_region: us-west-2 + console_mgmt_base_url: http://console-release.local + broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 + pageserver_config_stub: + pg_distrib_dir: /usr/local + remote_storage: + bucket_name: "{{ bucket_name }}" + bucket_region: "{{ bucket_region }}" + prefix_in_bucket: "pageserver/v1" + safekeeper_s3_prefix: safekeeper/v1/wal + hostname_suffix: "" + remote_user: ssm-user + ansible_aws_ssm_region: us-west-2 + ansible_aws_ssm_bucket_name: neon-prod-storage-us-west-2 + console_region_id: aws-us-west-2-new + sentry_environment: production + + children: + pageservers: + hosts: + pageserver-0.us-west-2.aws.neon.tech: + ansible_host: i-0d9f6dfae0e1c780d + pageserver-1.us-west-2.aws.neon.tech: + ansible_host: i-0c834be1dddba8b3f + pageserver-2.us-west-2.aws.neon.tech: + ansible_host: i-051642d372c0a4f32 + + safekeepers: + hosts: + safekeeper-0.us-west-2.aws.neon.tech: + ansible_host: i-00719d8a74986fda6 + safekeeper-1.us-west-2.aws.neon.tech: + ansible_host: i-074682f9d3c712e7c + safekeeper-2.us-west-2.aws.neon.tech: + ansible_host: i-042b7efb1729d7966 + diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 63b293f3e3..3122a43801 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -4,7 +4,7 @@ storage: console_mgmt_base_url: http://console-release.local bucket_name: zenith-storage-oregon bucket_region: us-west-2 - etcd_endpoints: zenith-1-etcd.local:2379 + broker_endpoint: http://storage-broker.prod.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: @@ -14,6 +14,7 @@ storage: safekeeper_s3_prefix: prod-1/wal hostname_suffix: ".local" remote_user: admin + sentry_environment: production children: pageservers: @@ -33,5 +34,5 @@ storage: console_region_id: aws-us-west-2 zenith-1-sk-2: console_region_id: aws-us-west-2 - zenith-1-sk-3: + zenith-1-sk-4: console_region_id: aws-us-west-2 diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index 426925a837..e89fc5e667 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -1,7 +1,8 @@ 
#!/bin/sh -# get instance id from meta-data service +# fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) # store fqdn hostname in var HOST=$(hostname -f) @@ -16,7 +17,8 @@ cat <> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + case "${PLATFORM}" in + neon-captest-prefetch) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }} + ;; + rds-aurora) + CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }} + ;; + rds-postgres) + CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} + ;; + *) + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + exit 1 + ;; + esac + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + psql ${CONNSTR} -c "SELECT version();" + + - name: Set database options + if: matrix.platform == 'neon-captest-prefetch' + run: | + DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") + + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" + psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Run TPC-H benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_tpch + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Create Allure report + if: success() || failure() + uses: ./.github/actions/allure-report + with: + action: generate + build_type: ${{ env.BUILD_TYPE }} + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1701e02dcb..17c698482c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -7,6 +7,10 @@ on: - release pull_request: +defaults: + run: + shell: bash -euxo pipefail {0} + concurrency: # Allow only one workflow per any non-`main` branch. 
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} @@ -45,6 +49,83 @@ jobs: shell: bash id: build-tag + check-codestyle-python: + runs-on: [ self-hosted, dev, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + fetch-depth: 1 + + - name: Cache poetry deps + id: cache_poetry + uses: actions/cache@v3 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: Run isort to ensure code format + run: poetry run isort --diff --check . + + - name: Run black to ensure code format + run: poetry run black --diff --check . + + - name: Run flake8 to ensure code format + run: poetry run flake8 . + + - name: Run mypy to check types + run: poetry run mypy . + + check-codestyle-rust: + runs-on: [ self-hosted, dev, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Restore cargo deps cache + id: cache_cargo + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry/ + !~/.cargo/registry/src + ~/.cargo/git/ + target/ + key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres headers + run: make postgres-headers -j$(nproc) + - name: Run cargo clippy + run: ./run_clippy.sh + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + build-neon: runs-on: [ self-hosted, dev, x64 ] container: @@ -79,12 +160,10 @@ jobs: - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - name: Set pg 15 revision for caching id: pg_v15_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} # Set some environment variables used by all the steps. # @@ -101,16 +180,15 @@ jobs: if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" CARGO_FEATURES="--features testing" - CARGO_FLAGS="--locked --timings $CARGO_FEATURES" + CARGO_FLAGS="--locked $CARGO_FEATURES" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" CARGO_FEATURES="--features testing,profiling" - CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release $CARGO_FEATURES" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV - shell: bash -euxo pipefail {0} # Don't include the ~/.cargo/registry/src directory. 
It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache @@ -127,8 +205,8 @@ jobs: target/ # Fall back to older versions of the key, if no cache for current Cargo.lock was found key: | - v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} - v10-${{ runner.os }}-${{ matrix.build_type }}-cargo- + v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -147,26 +225,21 @@ jobs: - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' run: mold -run make postgres-v14 -j$(nproc) - shell: bash -euxo pipefail {0} - name: Build postgres v15 if: steps.cache_pg_15.outputs.cache-hit != 'true' run: mold -run make postgres-v15 -j$(nproc) - shell: bash -euxo pipefail {0} - name: Build neon extensions run: mold -run make neon-pg-ext -j$(nproc) - shell: bash -euxo pipefail {0} - name: Run cargo build run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests - shell: bash -euxo pipefail {0} - name: Run cargo test run: | ${cov_prefix} cargo test $CARGO_FLAGS - shell: bash -euxo pipefail {0} - name: Install rust binaries run: | @@ -207,11 +280,9 @@ jobs: echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list done fi - shell: bash -euxo pipefail {0} - name: Install postgres binaries run: cp -a pg_install /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - name: Upload Neon artifact uses: ./.github/actions/upload @@ -219,17 +290,6 @@ jobs: name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact path: /tmp/neon - - name: Prepare cargo build timing stats for storing - run: | - mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/" - cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/" - shell: bash -euxo pipefail {0} - - name: Upload cargo build stats - uses: ./.github/actions/upload - with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats - path: /tmp/neon/cargo-timings/ - # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data if: matrix.build_type == 'debug' @@ -250,7 +310,7 @@ jobs: uses: actions/checkout@v3 with: submodules: true - fetch-depth: 2 + fetch-depth: 1 - name: Pytest regression tests uses: ./.github/actions/run-python-test-set @@ -284,7 +344,7 @@ jobs: uses: actions/checkout@v3 with: submodules: true - fetch-depth: 2 + fetch-depth: 1 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -305,7 +365,7 @@ jobs: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] - if: success() || failure() + if: ${{ !cancelled() }} strategy: fail-fast: false matrix: @@ -330,7 +390,6 @@ jobs: SHA: ${{ github.event.pull_request.head.sha || github.sha }} REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} - shell: bash -euxo pipefail {0} run: | curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync @@ -363,7 +422,7 @@ jobs: !~/.cargo/registry/src ~/.cargo/git/ target/ - key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact 
uses: ./.github/actions/download @@ -379,7 +438,6 @@ jobs: - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge - shell: bash -euxo pipefail {0} - name: Build and upload coverage report run: | @@ -412,7 +470,6 @@ jobs: \"description\": \"Coverage report is ready\", \"target_url\": \"$REPORT_URL\" }" - shell: bash -euxo pipefail {0} trigger-e2e-tests: runs-on: [ self-hosted, dev, x64 ] @@ -463,6 +520,9 @@ jobs: runs-on: [ self-hosted, dev, x64 ] needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug + defaults: + run: + shell: sh -eu {0} steps: - name: Checkout @@ -481,6 +541,9 @@ jobs: runs-on: [ self-hosted, dev, x64 ] needs: [ tag ] container: gcr.io/kaniko-project/executor:v1.9.0-debug + defaults: + run: + shell: sh -eu {0} steps: - name: Checkout @@ -492,10 +555,18 @@ jobs: - name: Kaniko build compute tools run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - compute-node-image-v14: + compute-node-image: runs-on: [ self-hosted, dev, x64 ] container: gcr.io/kaniko-project/executor:v1.9.0-debug needs: [ tag ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] + defaults: + run: + shell: sh -eu {0} + steps: - name: Checkout uses: actions/checkout@v1 # v3 won't work with kaniko @@ -506,28 +577,40 @@ jobs: - name: Configure ECR login run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - name: Kaniko build compute node with extensions v14 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Kaniko build compute node with extensions + run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - compute-node-image-v15: + vm-compute-node-image: runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug - needs: [ tag ] + needs: [ tag, compute-node-image ] + strategy: + fail-fast: false + matrix: + version: [ v14, v15 ] + defaults: + run: + shell: sh -eu {0} + steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko - with: - submodules: true - fetch-depth: 0 + - name: Downloading latest vm-builder + run: | + curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + chmod +x vm-builder - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + - name: Pulling compute-node image + run: | + docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Kaniko build compute node with extensions v15 - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Build vm image + run: | + ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Pushing vm-compute-node image + run: | + docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} test-images: - needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ] + needs: [ tag, neon-image, compute-node-image, compute-tools-image ] runs-on: [ self-hosted, dev, x64 ] steps: @@ -571,13 +654,13 @@ jobs: promote-images: runs-on: [ self-hosted, dev, x64 ] - needs: [ tag, test-images ] + needs: [ tag, test-images, vm-compute-node-image ] if: github.event_name != 'workflow_dispatch' container: amazon/aws-cli strategy: fail-fast: false matrix: - name: [ neon, compute-node-v14, compute-node-v15, compute-tools ] + name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - name: Promote image to latest @@ -610,9 +693,15 @@ jobs: - name: Pull compute node v14 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14 + - name: Pull vm compute node v14 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 + - name: Pull compute node v15 image from ECR run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} compute-node-v15 + - name: Pull vm compute node v15 image from ECR + run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 + - name: Pull rust image from ECR run: crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust @@ -624,7 +713,9 @@ jobs: crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest + crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - name: Configure Docker Hub login run: | @@ -641,9 +732,15 @@ jobs: - name: Push compute node v14 image to Docker Hub run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push vm compute node v14 image to Docker Hub + run: crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} + - name: Push compute node v15 image to Docker Hub run: crane push compute-node-v15 neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push vm compute node v15 image to Docker Hub + run: crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} + - name: Push rust image to Docker Hub run: crane push rust neondatabase/rust:pinned @@ -655,38 +752,36 @@ jobs: crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest calculate-deploy-targets: - runs-on: [ self-hosted, Linux, k8s-runner ] + runs-on: [ self-hosted, dev, x64 ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' outputs: matrix-include: ${{ steps.set-matrix.outputs.include }} steps: - id: set-matrix run: | - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}' - NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}' - echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": 
"NEON_PRODUCTION_API_KEY"}' + if [[ "$GITHUB_REF_NAME" == "release" ]]; then + PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" exit 1 fi deploy: - runs-on: [ self-hosted, Linux, k8s-runner ] - #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -701,16 +796,6 @@ jobs: submodules: true fetch-depth: 0 - - name: Setup python - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Setup ansible - run: | - export PATH="/root/.local/bin:$PATH" - pip install --progress-bar off --user ansible boto3 toml - - name: Redeploy run: | export DOCKER_TAG=${{needs.tag.outputs.build-tag}} @@ -731,8 +816,8 @@ jobs: chmod 0600 ssh-key ssh-add ssh-key rm -f ssh-key ssh-key-cert.pub - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-new: @@ -740,7 +825,7 @@ jobs: container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + needs: [ push-docker-hub, tag, regress-tests ] if: | (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' @@ -770,7 +855,7 @@ jobs: exit 1 fi ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-pr-test-new: @@ -780,7 +865,7 @@ jobs: # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly needs: [ push-docker-hub, tag, regress-tests ] if: | - contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && + contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') && github.event_name != 'workflow_dispatch' defaults: run: @@ -803,7 +888,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}} + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-prod-new: @@ -820,7 +905,7 @@ jobs: shell: bash strategy: matrix: - target_region: [ us-east-2, eu-central-1, ap-southeast-1 ] + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] steps: - name: Checkout uses: actions/checkout@v3 @@ -843,16 +928,16 @@ jobs: fi ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}} + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy: runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + github.ref_name == 'release' && github.event_name != 'workflow_dispatch' defaults: run: @@ -885,8 +970,49 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] + if: | + github.ref_name == 'release' && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Add curl + run: apt update && apt install curl -y + + - name: Store kubeconfig file + run: | + echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Setup helm v3 + run: | + curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash + helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-new: runs-on: [ self-hosted, dev, x64 ] @@ -925,19 +1051,53 @@ jobs: - name: Re-deploy scram proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy link proxy if: 
matrix.deploy_link_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - name: Re-deploy legacy scram proxy if: matrix.deploy_legacy_scram_proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-dev-new: + runs-on: [ self-hosted, dev, x64 ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, tag, regress-tests ] + if: | + (github.ref_name == 'main') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -955,6 +1115,8 @@ jobs: include: - target_region: us-east-2 target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta - target_region: eu-central-1 target_cluster: prod-eu-central-1-gamma - target_region: ap-southeast-1 @@ -974,7 +1136,45 @@ jobs: - name: Re-deploy proxy run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + 
deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. + needs: [ push-docker-hub, tag, regress-tests ] + if: | + (github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] @@ -985,7 +1185,6 @@ jobs: if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release - shell: bash -euxo pipefail {0} env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest diff --git a/.github/workflows/codestyle.yml b/.github/workflows/codestyle.yml deleted file mode 100644 index 01fef71c9a..0000000000 --- a/.github/workflows/codestyle.yml +++ /dev/null @@ -1,166 +0,0 @@ -name: Check code style and build - -on: - push: - branches: - - main - pull_request: - -defaults: - run: - shell: bash -euxo pipefail {0} - -concurrency: - # Allow only one workflow per any non-`main` branch. - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} - cancel-in-progress: true - -env: - RUST_BACKTRACE: 1 - COPT: '-Werror' - -jobs: - check-codestyle-rust: - strategy: - fail-fast: false - matrix: - # XXX: both OSes have rustup - # * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools - # * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools - # this is all we need to install our toolchain later via rust-toolchain.toml - # so don't install any toolchain explicitly. 
- os: [ubuntu-latest, macos-latest] - timeout-minutes: 90 - name: check codestyle rust and postgres - runs-on: ${{ matrix.os }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 2 - - - name: Check formatting - run: cargo fmt --all -- --check - - - name: Install Ubuntu postgres dependencies - if: matrix.os == 'ubuntu-latest' - run: | - sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler - - - name: Install macOS postgres dependencies - if: matrix.os == 'macos-latest' - run: brew install flex bison openssl protobuf - - - name: Set pg 14 revision for caching - id: pg_v14_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - - - name: Set pg 15 revision for caching - id: pg_v15_rev - run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT - shell: bash -euxo pipefail {0} - - - name: Cache postgres v14 build - id: cache_pg_14 - uses: actions/cache@v3 - with: - path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Cache postgres v15 build - id: cache_pg_15 - uses: actions/cache@v3 - with: - path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - - - name: Set extra env for macOS - if: matrix.os == 'macos-latest' - run: | - echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV - echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - - - name: Build postgres v14 - if: steps.cache_pg_14.outputs.cache-hit != 'true' - run: make postgres-v14 - shell: bash -euxo pipefail {0} - - - name: Build postgres v15 - if: steps.cache_pg_15.outputs.cache-hit != 'true' - run: make postgres-v15 - shell: bash -euxo pipefail {0} - - - name: Build neon extensions - run: make neon-pg-ext - - - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust - - - name: Run cargo clippy - run: ./run_clippy.sh - - - name: Ensure all project builds - run: cargo build --locked --all --all-targets - - check-rust-dependencies: - runs-on: [ self-hosted, dev, x64 ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - fetch-depth: 1 - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check every project module is covered by Hakari - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - shell: bash -euxo pipefail {0} - - check-codestyle-python: - runs-on: [ self-hosted, Linux, k8s-runner ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - fetch-depth: 1 - - - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run isort to ensure code format - run: poetry run isort 
--diff --check . - - - name: Run black to ensure code format - run: poetry run black --diff --check . - - - name: Run flake8 to ensure code format - run: poetry run flake8 . - - - name: Run mypy to check types - run: poetry run mypy . diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml new file mode 100644 index 0000000000..b8600e0665 --- /dev/null +++ b/.github/workflows/neon_extra_builds.yml @@ -0,0 +1,128 @@ +name: Check neon with extra platform builds + +on: + push: + branches: + - main + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow per any non-`main` branch. + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }} + cancel-in-progress: true + +env: + RUST_BACKTRACE: 1 + COPT: '-Werror' + +jobs: + check-macos-build: + timeout-minutes: 90 + runs-on: macos-latest + + env: + # Use release build only, to have less debug info around + # Hence keeping target/ (and general cache size) smaller + BUILD_TYPE: release + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Install macOS postgres dependencies + run: brew install flex bison openssl protobuf + + - name: Set pg 14 revision for caching + id: pg_v14_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT + + - name: Set pg 15 revision for caching + id: pg_v15_rev + run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT + + - name: Cache postgres v14 build + id: cache_pg_14 + uses: actions/cache@v3 + with: + path: pg_install/v14 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Cache postgres v15 build + id: cache_pg_15 + uses: actions/cache@v3 + with: + path: pg_install/v15 + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + + - name: Set extra env for macOS + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + + - name: Cache cargo deps + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + + - name: Build postgres v14 + if: steps.cache_pg_14.outputs.cache-hit != 'true' + run: make postgres-v14 -j$(nproc) + + - name: Build postgres v15 + if: steps.cache_pg_15.outputs.cache-hit != 'true' + run: make postgres-v15 -j$(nproc) + + - name: Build neon extensions + run: make neon-pg-ext -j$(nproc) + + - name: Run cargo build + run: cargo build --all --release + + - name: Check that no warnings are produced + run: ./run_clippy.sh + + gather-rust-build-stats: + timeout-minutes: 90 + runs-on: ubuntu-latest + + env: + BUILD_TYPE: release + # build with incremental compilation produce partial results + # so do not attempt to cache this build, also disable the incremental compilation + CARGO_INCREMENTAL: 0 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 1 + + - name: Install Ubuntu postgres dependencies + run: | + sudo apt update + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev protobuf-compiler + + # Some of our rust modules use FFI and need those to be checked + - name: Get postgres 
headers + run: make postgres-headers -j$(nproc) + + - name: Produce the build stats + run: cargo build --all --release --timings + + - name: Upload the build stats + uses: actions/upload-artifact@v3 + with: + name: neon-${{ runner.os }}-release-build-stats + path: ./target/cargo-timings/ diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 0600f9234f..9f57519589 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -23,6 +23,7 @@ jobs: runs-on: [ ubuntu-latest ] env: + DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output steps: @@ -51,8 +52,8 @@ jobs: id: create-neon-project uses: ./.github/actions/neon-project-create with: - environment: staging api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} - name: Run pytest env: @@ -63,7 +64,7 @@ jobs: run: | # Test framework expects we have psql binary; # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql"; + mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; ./scripts/pytest \ --junitxml=$TEST_OUTPUT/junit.xml \ --tb=short \ @@ -75,7 +76,6 @@ jobs: if: ${{ always() }} uses: ./.github/actions/neon-project-delete with: - environment: staging project_id: ${{ steps.create-neon-project.outputs.project_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} diff --git a/Cargo.lock b/Cargo.lock index 88b6ef93bf..f1348eeace 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.17.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9ecd88a8c8378ca913a680cd98f0f13ac67383d35993f86c90a70e3f137816b" +checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97" dependencies = [ "gimli", ] @@ -30,9 +30,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.19" +version = "0.7.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4f55bd91a0978cbfd91c457a164bab8b4001c833b7f323132c0a4e1922dd44e" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" dependencies = [ "memchr", ] @@ -59,9 +59,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anyhow" -version = "1.0.65" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602" +checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" dependencies = [ "backtrace", ] @@ -94,7 +94,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -143,9 +143,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.57" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f" +checksum = "677d1d8ab452a3936018a687b20e6f7cf5363d713b732b8884001317b0e48aa3" dependencies = [ "proc-macro2", "quote", @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "atomic-polyfill" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89" +checksum = 
"e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" dependencies = [ "critical-section", ] @@ -167,7 +167,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -199,7 +199,7 @@ dependencies = [ "http", "hyper", "ring", - "time 0.3.15", + "time", "tokio", "tower", "tracing", @@ -340,7 +340,7 @@ dependencies = [ "percent-encoding", "regex", "ring", - "time 0.3.15", + "time", "tracing", ] @@ -477,7 +477,7 @@ dependencies = [ "itoa", "num-integer", "ryu", - "time 0.3.15", + "time", ] [[package]] @@ -500,16 +500,16 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version 0.4.0", + "rustc_version", "tracing", "zeroize", ] [[package]] name = "axum" -version = "0.5.16" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e3356844c4d6a6d6467b8da2cffb4a2820be256f50a3a386c9d152bab31043" +checksum = "08b108ad2665fa3f6e6a517c3d80ec3e77d224c47d605167aefaa5d7ef97fa48" dependencies = [ "async-trait", "axum-core", @@ -525,9 +525,9 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", + "rustversion", "serde", "sync_wrapper", - "tokio", "tower", "tower-http", "tower-layer", @@ -536,9 +536,9 @@ dependencies = [ [[package]] name = "axum-core" -version = "0.2.8" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9f0c0a60006f2a293d82d571f635042a72edf927539b7685bd62d361963839b" +checksum = "79b8558f5a0581152dc94dcd289132a1d377494bdeafcd41869b3258e3e2ad92" dependencies = [ "async-trait", "bytes", @@ -546,15 +546,16 @@ dependencies = [ "http", "http-body", "mime", + "rustversion", "tower-layer", "tower-service", ] [[package]] name = "backtrace" -version = "0.3.66" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab84319d616cfb654d03394f38ab7e6f0919e181b1b57e1fd15e7fb4077d9a7" +checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca" dependencies = [ "addr2line", "cc", @@ -566,25 +567,16 @@ dependencies = [ ] [[package]] -name = "bare-metal" -version = "0.2.5" +name = "base64" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3" -dependencies = [ - "rustc_version 0.2.3", -] - -[[package]] -name = "bare-metal" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" [[package]] name = "base64" -version = "0.13.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "bincode" @@ -617,18 +609,6 @@ dependencies = [ "which", ] -[[package]] -name = "bit_field" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" - -[[package]] -name = "bitfield" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" - 
[[package]] name = "bitflags" version = "1.3.2" @@ -646,9 +626,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca0852af221f458706eb0725c03e4ed6c46af9ac98e6a689d5e634215d594dd" +checksum = "b45ea9b00a7b3f2988e9a65ad3917e62123c38dba709b666506207be96d1790b" dependencies = [ "memchr", "once_cell", @@ -658,15 +638,15 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.0" +version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" [[package]] name = "bytemuck" -version = "1.12.1" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da" +checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" [[package]] name = "byteorder" @@ -676,9 +656,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db" +checksum = "dfb24e866b15a1af2a1b663f10c6b6b8f397a84aadb828f12e5b289ec23a3a3c" dependencies = [ "serde", ] @@ -701,9 +681,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.73" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" +checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d" [[package]] name = "cexpr" @@ -722,17 +702,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.22" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ "iana-time-zone", - "js-sys", "num-integer", "num-traits", "serde", - "time 0.1.44", - "wasm-bindgen", "winapi", ] @@ -776,9 +753,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.22" +version = "3.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" +checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ "bitflags", "clap_lex 0.2.4", @@ -788,14 +765,14 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.15" +version = "4.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bf8832993da70a4c6d13c581f4463c2bdda27b9bf1c5498dc4365543abe6d6f" +checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" dependencies = [ - "atty", "bitflags", "clap_derive", "clap_lex 0.3.0", + "is-terminal", "once_cell", "strsim", "termcolor", @@ -803,9 +780,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.13" +version = "4.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42f169caba89a7d512b5418b09864543eeb4d497416c917d7137863bd2076ad" +checksum = 
"0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" dependencies = [ "heck", "proc-macro-error", @@ -842,15 +819,6 @@ dependencies = [ "libc", ] -[[package]] -name = "cmake" -version = "0.1.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a" -dependencies = [ - "cc", -] - [[package]] name = "codespan-reporting" version = "0.11.1" @@ -873,9 +841,9 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.1.0" +version = "6.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85914173c2f558d61613bfbbf1911f14e630895087a7ed2fafc0f5319e1536e7" +checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" dependencies = [ "crossterm", "strum", @@ -889,7 +857,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.15", + "clap 4.0.32", "env_logger", "futures", "hyper", @@ -931,10 +899,10 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.15", + "clap 4.0.32", "comfy-table", "git-version", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "pageserver_api", "postgres", @@ -944,6 +912,7 @@ dependencies = [ "safekeeper_api", "serde", "serde_with", + "storage_broker", "tar", "thiserror", "toml", @@ -968,18 +937,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" -[[package]] -name = "cortex-m" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0" -dependencies = [ - "bare-metal 0.2.5", - "bitfield", - "embedded-hal", - "volatile-register", -] - [[package]] name = "cpp_demangle" version = "0.3.5" @@ -1004,7 +961,7 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -1026,7 +983,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.22", + "clap 3.2.23", "criterion-plot", "itertools", "lazy_static", @@ -1054,15 +1011,9 @@ dependencies = [ [[package]] name = "critical-section" -version = "0.2.7" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd" -dependencies = [ - "bare-metal 1.0.0", - "cfg-if", - "cortex-m", - "riscv", -] +checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" [[package]] name = "crossbeam-channel" @@ -1087,14 +1038,14 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.11" +version = "0.9.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f916dfc5d356b0ed9dae65f1db9fc9770aa2851d2662b988ccf4fe3516e86348" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" dependencies = [ "autocfg", "cfg-if", "crossbeam-utils", - "memoffset 0.6.5", + "memoffset 0.7.1", "scopeguard", ] @@ -1145,9 +1096,9 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f83d0ebf42c6eafb8d7c52f7e5f2d3003b89c7aa4fd2b79229209459a849af8" +checksum = "5add3fc1717409d029b20c5b6903fc0c0b02fa6741d820054f4a2efa5e5816fd" dependencies = [ "cc", "cxxbridge-flags", @@ -1157,9 +1108,9 @@ 
dependencies = [ [[package]] name = "cxx-build" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07d050484b55975889284352b0ffc2ecbda25c0c55978017c132b29ba0818a86" +checksum = "b4c87959ba14bc6fbc61df77c3fcfe180fc32b93538c4f1031dd802ccb5f2ff0" dependencies = [ "cc", "codespan-reporting", @@ -1172,15 +1123,15 @@ dependencies = [ [[package]] name = "cxxbridge-flags" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d2199b00553eda8012dfec8d3b1c75fce747cf27c169a270b3b99e3448ab78" +checksum = "69a3e162fde4e594ed2b07d0f83c6c67b745e7f28ce58c6df5e6b6bef99dfb59" [[package]] name = "cxxbridge-macro" -version = "1.0.79" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb67a6de1f602736dd7eaead0080cf3435df806c61b24b13328db128c58868f" +checksum = "3e7e2adeb6a0d4a282e581096b06e1791532b7d576dcde5ccd9382acf55db8e6" dependencies = [ "proc-macro2", "quote", @@ -1189,9 +1140,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4529658bdda7fd6769b8614be250cdcfc3aeb0ee72fe66f9e41e5e5eb73eac02" +checksum = "b0dd3cd20dc6b5a876612a6e5accfe7f3dd883db6d07acfbf14c128f61550dfa" dependencies = [ "darling_core", "darling_macro", @@ -1199,9 +1150,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "649c91bc01e8b1eac09fb91e8dbc7d517684ca6be8ebc75bb9cafc894f9fdb6f" +checksum = "a784d2ccaf7c98501746bf0be29b2022ba41fd62a2e622af997a03e9f972859f" dependencies = [ "fnv", "ident_case", @@ -1213,9 +1164,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc69c5bfcbd2fc09a0f38451d2daf0e372e367986a83906d1b0dbc88134fb5" +checksum = "7618812407e9402654622dd402b0a89dff9ba93badd6540781526117b92aab7e" dependencies = [ "darling_core", "quote", @@ -1224,9 +1175,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ee2393c4a91429dffb4bedf19f4d6abf27d8a732c8ce4980305d782e5426d57" +checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" [[package]] name = "debugid" @@ -1237,6 +1188,16 @@ dependencies = [ "uuid 0.8.2", ] +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "serde", + "uuid 1.2.2", +] + [[package]] name = "der-parser" version = "8.1.0" @@ -1253,9 +1214,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adfbc57365a37acbd2ebf2b64d7e69bb766e2fea813521ed536f5d0520dcf86c" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" dependencies = [ "block-buffer", "crypto-common", @@ -1279,16 +1240,6 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" -[[package]] -name = "embedded-hal" -version = "0.2.7" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff" -dependencies = [ - "nb 0.1.3", - "void", -] - [[package]] name = "encoding_rs" version = "0.8.31" @@ -1300,9 +1251,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.9.1" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c90bf5f19754d10198ccb95b70664fc925bd1fc090a0fd9a6ebc54acc8cd6272" +checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" dependencies = [ "atty", "humantime", @@ -1312,36 +1263,24 @@ dependencies = [ ] [[package]] -name = "etcd-client" -version = "0.9.2" +name = "errno" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8664f6ea68aba5503d42dd1be786b0f1bd9b7972e7f40208c83ef74db91bf" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ - "http", - "prost 0.10.4", - "tokio", - "tokio-stream", - "tonic 0.7.2", - "tonic-build 0.7.2", - "tower", - "tower-service", + "errno-dragonfly", + "libc", + "winapi", ] [[package]] -name = "etcd_broker" -version = "0.1.0" +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ - "etcd-client", - "once_cell", - "regex", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tracing", - "utils", - "workspace_hack", + "cc", + "libc", ] [[package]] @@ -1372,14 +1311,14 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.17" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94a7bbaa59354bc20dd75b67f23e2797b4490e9d6928203fb105c79e448c86c" +checksum = "4e884668cd0c7480504233e951174ddc3b382f7c2666e3b7310b5c4e7b0c37f9" dependencies = [ "cfg-if", "libc", "redox_syscall", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] @@ -1406,6 +1345,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1436,9 +1390,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f21eda599937fba36daeb58a22e8f5cee2d14c4a17b5b7739c7c8e5e3b8230c" +checksum = "38390104763dc37a5145a53c29c63c1290b5d316d6086ec32c293f6736051bb0" dependencies = [ "futures-channel", "futures-core", @@ -1451,9 +1405,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bdd20c28fadd505d0fd6712cdfcb0d4b5648baf45faef7f852afb2399bb050" +checksum = "52ba265a92256105f45b719605a571ffe2d1f0fea3807304b522c1d778f79eed" dependencies = [ "futures-core", "futures-sink", @@ -1461,15 +1415,15 @@ dependencies = [ 
[[package]] name = "futures-core" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e5aa3de05362c3fb88de6531e6296e85cde7739cccad4b9dfeeb7f6ebce56bf" +checksum = "04909a7a7e4633ae6c4a9ab280aeb86da1236243a77b694a49eacd659a4bd3ac" [[package]] name = "futures-executor" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff63c23854bee61b6e9cd331d523909f238fc7636290b96826e9cfa5faa00ab" +checksum = "7acc85df6714c176ab5edf386123fafe217be88c0840ec11f199441134a074e2" dependencies = [ "futures-core", "futures-task", @@ -1478,15 +1432,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbf4d2a7a308fd4578637c0b17c7e1c7ba127b8f6ba00b29f717e9655d85eb68" +checksum = "00f5fb52a06bdcadeb54e8d3671f8888a39697dcb0b81b23b55174030427f4eb" [[package]] name = "futures-macro" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd15d1c7456c04dbdf7e88bcd69760d74f3a798d6444e16974b505b0e62f17" +checksum = "bdfb8ce053d86b91919aad980c220b1fb8401a9394410e1c289ed7e66b61835d" dependencies = [ "proc-macro2", "quote", @@ -1495,15 +1449,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b20ba5a92e727ba30e72834706623d94ac93a725410b6a6b6fbc1b07f7ba56" +checksum = "39c15cf1a4aa79df40f1bb462fb39676d0ad9e366c2a33b590d7c66f4f81fcf9" [[package]] name = "futures-task" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6508c467c73851293f390476d4491cf4d227dbabcd4170f3bb6044959b294f1" +checksum = "2ffb393ac5d9a6eaa9d3fdf37ae2776656b706e200c8e16b1bdb227f5198e6ea" [[package]] name = "futures-timer" @@ -1513,9 +1467,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.24" +version = "0.3.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44fb6cb1be61cc1d2e43b262516aafcf63b241cffdb1d3fa115f91d9c7b09c90" +checksum = "197676987abd2f9cadff84926f410af1c183608d36641465df73ae8211dc65d6" dependencies = [ "futures-channel", "futures-core", @@ -1541,20 +1495,20 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] [[package]] name = "gimli" -version = "0.26.2" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d" +checksum = "dec7af912d60cdbd3677c1af9352ebae6fb8394d165568a2234df0fa00f87793" [[package]] name = "git-version" @@ -1586,9 +1540,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca32592cf21ac7ccab1825cd87f6c9b3d9022c44d086172ed0966bec8af30be" +checksum = 
"5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -1635,7 +1589,7 @@ checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" dependencies = [ "atomic-polyfill", "hash32", - "rustc_version 0.4.0", + "rustc_version", "spin 0.9.4", "stable_deref_trait", ] @@ -1655,6 +1609,15 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + [[package]] name = "hex" version = "0.4.3" @@ -1679,6 +1642,17 @@ dependencies = [ "digest", ] +[[package]] +name = "hostname" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" +dependencies = [ + "libc", + "match_cfg", + "winapi", +] + [[package]] name = "http" version = "0.2.8" @@ -1737,9 +1711,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.20" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02c929dc5c39e335a03c405292728118860721b10190d98c2a0f0efd5baafbac" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", "futures-channel", @@ -1761,9 +1735,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.0" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87c48c02e0dc5e3b849a2041db3029fd066650f8f717c07bf8ed78ccb895cac" +checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" dependencies = [ "http", "hyper", @@ -1787,10 +1761,23 @@ dependencies = [ ] [[package]] -name = "iana-time-zone" -version = "0.1.51" +name = "hyper-tls" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1828,9 +1815,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.1" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" +checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" dependencies = [ "autocfg", "hashbrown", @@ -1885,10 +1872,32 @@ dependencies = [ ] [[package]] -name = "ipnet" -version = "2.5.0" +name = "io-lifetimes" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879d54834c8c76457ef4293a689b2a8c59b076067ad77b15efafbb05f92a592b" +checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +dependencies = [ + "libc", + "windows-sys 0.42.0", +] + +[[package]] +name = "ipnet" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11b0d96e660696543b251e58030cf9787df56da39dab19ad60eae7353040917e" + +[[package]] +name = "is-terminal" +version = "0.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys 0.42.0", +] [[package]] name = "itertools" @@ -1901,9 +1910,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" +checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" [[package]] name = "js-sys" @@ -1916,11 +1925,11 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.1.1" +version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aa4b4af834c6cfd35d8763d359661b90f2e45d8f750a0849156c7f4671af09c" +checksum = "09f4f04699947111ec1733e71778d763555737579e44b85844cae8e1940a1828" dependencies = [ - "base64", + "base64 0.13.1", "pem", "ring", "serde", @@ -1962,15 +1971,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.135" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libloading" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ "cfg-if", "winapi", @@ -1978,19 +1987,25 @@ dependencies = [ [[package]] name = "libm" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" +checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" [[package]] name = "link-cplusplus" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" dependencies = [ "cc", ] +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + [[package]] name = "lock_api" version = "0.4.9" @@ -2011,6 +2026,12 @@ dependencies = [ "serde", ] +[[package]] +name = "match_cfg" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" + [[package]] name = "matchers" version = "0.1.0" @@ -2022,9 +2043,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" +checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" [[package]] name = "md-5" @@ -2049,9 +2070,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"95af15f345b17af2efc8ead6080fb8bc376f8cec1b35277b935637595fe77498" +checksum = "4b182332558b18d807c4ce1ca8ca983b34c3ee32765e47b3f0f69b90355cc1dc" dependencies = [ "libc", ] @@ -2098,23 +2119,23 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.5.4" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96590ba8f175222643a85693f33d26e9c8a015f599c216509b1a6894af675d34" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys", + "wasi", + "windows-sys 0.42.0", ] [[package]] @@ -2124,25 +2145,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] -name = "nb" -version = "0.1.3" +name = "native-tls" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f" +checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" dependencies = [ - "nb 1.0.0", + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", ] -[[package]] -name = "nb" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae" - [[package]] name = "nix" -version = "0.23.1" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" dependencies = [ "bitflags", "cc", @@ -2153,9 +2177,9 @@ dependencies = [ [[package]] name = "nix" -version = "0.25.0" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" dependencies = [ "autocfg", "bitflags", @@ -2167,9 +2191,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.1" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8903e5a29a317527874d0402f867152a3d21c908bb0b933e416c65e301d4c36" +checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" dependencies = [ "memchr", "minimal-lexical", @@ -2216,9 +2240,9 @@ dependencies = [ [[package]] name = "num-format" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54b862ff8df690cf089058c98b183676a7ed0f974cc08b426800093227cbff3b" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" dependencies = [ "arrayvec", "itoa", @@ -2246,46 +2270,37 @@ dependencies = [ [[package]] name = "num_cpus" -version = "1.13.1" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" dependencies = [ + "hermit-abi 0.2.6", "libc", ] [[package]] name = "object" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21158b2c33aa6d4561f1c0a6ea283ca92bc54802a93b263e910746d679a7eb53" +checksum = "239da7f290cfa979f43f85a8efeee9a8a76d0827c356d37f9d3d7254d6b537fb" dependencies = [ "memchr", ] [[package]] name = "oid-registry" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d4bda43fd1b844cbc6e6e54b5444e2b1bc7838bce59ad205902cccbb26d6761" +checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff" dependencies = [ "asn1-rs", ] [[package]] name = "once_cell" -version = "1.15.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" +checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" [[package]] name = "oorandom" @@ -2293,6 +2308,32 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl" +version = "0.10.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b102428fd03bc5edf97f62620f7298614c45cedf287c271e7ed450bbaf83f2e1" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.5" @@ -2300,10 +2341,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] -name = "os_str_bytes" -version = "6.3.0" +name = "openssl-sys" +version = "0.9.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" +checksum = "23bbbf7854cd45b83958ebe919f0e8e516793727652e27fda10a8384cfc790b7" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "os_info" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4750134fb6a5d49afc80777394ad5d95b04bc12068c6abb92fae8f43817270f" +dependencies = [ + "log", + "serde", + "winapi", +] + +[[package]] +name = "os_str_bytes" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" [[package]] name = "overload" @@ -2322,13 +2387,12 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.0.15", + "clap 4.0.32", "close_fds", "const_format", "crc32c", "criterion", "crossbeam-utils", - "etcd_broker", "fail", "futures", "git-version", @@ -2339,7 +2403,7 @@ dependencies = [ 
"hyper", "itertools", "metrics", - "nix 0.25.0", + "nix 0.25.1", "num-traits", "once_cell", "pageserver_api", @@ -2354,6 +2418,7 @@ dependencies = [ "rand", "regex", "remote_storage", + "reqwest", "rpds", "rstar", "scopeguard", @@ -2361,6 +2426,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "storage_broker", "svg_fmt", "tar", "tempfile", @@ -2400,7 +2466,7 @@ checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" dependencies = [ "instant", "lock_api", - "parking_lot_core 0.8.5", + "parking_lot_core 0.8.6", ] [[package]] @@ -2410,14 +2476,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.3", + "parking_lot_core 0.9.5", ] [[package]] name = "parking_lot_core" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" dependencies = [ "cfg-if", "instant", @@ -2429,15 +2495,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929" +checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] @@ -2452,7 +2518,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03c64931a1a212348ec4f3b4362585eca7159d0d09cbdf4a7f74f02173596fd4" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -2473,18 +2539,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.10.1" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +checksum = "928c6535de93548188ef63bb7c4036bd415cd8f36ad25af44b9789b2ee72a48c" dependencies = [ "phf_shared", ] [[package]] name = "phf_shared" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +checksum = "e1fb5f6f826b772a8d4c0394209441e7d37cbbb967ae9c7e0e8134365c9ee676" dependencies = [ "siphasher", ] @@ -2521,6 +2587,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + [[package]] name = "plotters" version = "0.3.4" @@ -2551,12 +2623,12 @@ dependencies = [ [[package]] name = "postgres" -version = "0.19.2" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.19.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", - "futures", + "futures-util", "log", "tokio", "tokio-postgres", @@ -2565,9 +2637,9 @@ dependencies = [ 
[[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ - "base64", + "base64 0.20.0", "byteorder", "bytes", "fallible-iterator", @@ -2582,8 +2654,8 @@ dependencies = [ [[package]] name = "postgres-types" -version = "0.2.3" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.2.4" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "bytes", "fallible-iterator", @@ -2639,7 +2711,7 @@ dependencies = [ "lazy_static", "libc", "log", - "nix 0.23.1", + "nix 0.23.2", "parking_lot 0.11.2", "symbolic-demangle", "tempfile", @@ -2648,9 +2720,9 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "pq_proto" @@ -2669,9 +2741,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" +checksum = "2c8992a85d8e93a28bdf76137db888d3874e3b230dee5ed8bebac4c9f7617773" dependencies = [ "proc-macro2", "syn", @@ -2703,37 +2775,37 @@ dependencies = [ [[package]] name = "proc-macro-hack" -version = "0.5.19" +version = "0.5.20+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.47" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" dependencies = [ "unicode-ident", ] [[package]] name = "procfs" -version = "0.12.0" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0941606b9934e2d98a3677759a971756eb821f75764d0e0d26946d08e74d9104" +checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags", "byteorder", "hex", "lazy_static", - "libc", + "rustix", ] [[package]] name = "prometheus" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45c8babc29389186697fe5a2a4859d697825496b83db5d0b65271cdc0488e88c" +checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" dependencies = [ "cfg-if", "fnv", @@ -2747,51 +2819,19 @@ dependencies = [ [[package]] name = "prost" -version = "0.10.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71adf41db68aa0daaefc69bb30bcd68ded9b9abaad5d1fbb6304c4fb390e083e" +checksum = "c01db6702aa05baa3f57dec92b8eeeeb4cb19e894e73996b32a4093289e54592" dependencies = [ 
"bytes", - "prost-derive 0.10.1", -] - -[[package]] -name = "prost" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" -dependencies = [ - "bytes", - "prost-derive 0.11.2", + "prost-derive", ] [[package]] name = "prost-build" -version = "0.10.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae5a4388762d5815a9fc0dea33c56b021cdc8dde0c55e0c9ca57197254b0cab" -dependencies = [ - "bytes", - "cfg-if", - "cmake", - "heck", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost 0.10.4", - "prost-types 0.10.1", - "regex", - "tempfile", - "which", -] - -[[package]] -name = "prost-build" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" +checksum = "cb5320c680de74ba083512704acb90fe00f28f79207286a848e730c45dd73ed6" dependencies = [ "bytes", "heck", @@ -2801,8 +2841,8 @@ dependencies = [ "multimap", "petgraph", "prettyplease", - "prost 0.11.2", - "prost-types 0.11.2", + "prost", + "prost-types", "regex", "syn", "tempfile", @@ -2811,22 +2851,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.10.1" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "prost-derive" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" +checksum = "c8842bad1a5419bca14eac663ba798f6bc19c413c2fdceb5f3ba3b0932d96720" dependencies = [ "anyhow", "itertools", @@ -2837,22 +2864,12 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.10.1" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68" +checksum = "017f79637768cde62820bc2d4fe0e45daaa027755c323ad077767c6c5f173091" dependencies = [ "bytes", - "prost 0.10.4", -] - -[[package]] -name = "prost-types" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" -dependencies = [ - "bytes", - "prost 0.11.2", + "prost", ] [[package]] @@ -2862,10 +2879,10 @@ dependencies = [ "anyhow", "async-trait", "atty", - "base64", + "base64 0.13.1", "bstr", "bytes", - "clap 4.0.15", + "clap 4.0.32", "futures", "git-version", "hashbrown", @@ -2900,7 +2917,7 @@ dependencies = [ "tracing-subscriber", "url", "utils", - "uuid 1.2.1", + "uuid 1.2.2", "workspace_hack", "x509-parser", ] @@ -2916,9 +2933,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.21" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" dependencies = [ "proc-macro2", ] @@ -2965,21 +2982,19 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.3" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" +checksum = 
"6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" dependencies = [ - "autocfg", - "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" -version = "1.9.3" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" +checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3" dependencies = [ "crossbeam-channel", "crossbeam-deque", @@ -2995,7 +3010,7 @@ checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b" dependencies = [ "pem", "ring", - "time 0.3.15", + "time", "yasna", ] @@ -3010,9 +3025,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" +checksum = "e076559ef8e241f2ae3479e36f97bd5741c0330689e217ad51ce2c76808b868a" dependencies = [ "aho-corasick", "memchr", @@ -3030,9 +3045,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.27" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "remote_storage" @@ -3069,11 +3084,11 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "431949c384f4e2ae07605ccaa56d1d9d2ecdb5cadd4f9577ccfab29f2e5149fc" +checksum = "68cc60575865c7831548863cc02356512e3f1dc2f3f82cb837d7fc4cc8f3c97c" dependencies = [ - "base64", + "base64 0.13.1", "bytes", "encoding_rs", "futures-core", @@ -3083,10 +3098,12 @@ dependencies = [ "http-body", "hyper", "hyper-rustls", + "hyper-tls", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -3096,6 +3113,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", + "tokio-native-tls", "tokio-rustls", "tower-service", "url", @@ -3130,27 +3148,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "riscv" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba" -dependencies = [ - "bare-metal 1.0.0", - "bit_field", - "riscv-target", -] - -[[package]] -name = "riscv-target" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222" -dependencies = [ - "lazy_static", - "regex", -] - [[package]] name = "routerify" version = "3.0.0" @@ -3193,7 +3190,7 @@ dependencies = [ "futures", "futures-timer", "rstest_macros", - "rustc_version 0.4.0", + "rustc_version", ] [[package]] @@ -3205,7 +3202,7 @@ dependencies = [ "cfg-if", "proc-macro2", "quote", - "rustc_version 0.4.0", + "rustc_version", "syn", ] @@ -3221,22 +3218,13 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver 1.0.14", + "semver", ] [[package]] @@ -3249,10 +3237,24 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.20.6" +name = "rustix" +version = "0.36.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aab8ee6c7097ed6057f43c187a62418d0c05a4bd5f18b3571db50ee0f9ce033" +checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys 0.42.0", +] + +[[package]] +name = "rustls" +version = "0.20.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "539a2bfe908f471bfa933876bd1eb6a19cf2176d375f82ef7f99530a40e48c2c" dependencies = [ "log", "ring", @@ -3278,7 +3280,7 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55" dependencies = [ - "base64", + "base64 0.13.1", ] [[package]] @@ -3292,35 +3294,35 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.9" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8" +checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" [[package]] name = "ryu" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" +checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" [[package]] name = "safekeeper" version = "0.1.0" dependencies = [ "anyhow", + "async-stream", "async-trait", "byteorder", "bytes", - "clap 4.0.15", + "clap 4.0.32", "const_format", "crc32c", - "etcd_broker", "fs2", "git-version", "hex", "humantime", "hyper", "metrics", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "parking_lot 0.12.1", "postgres", @@ -3334,6 +3336,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "storage_broker", "tempfile", "thiserror", "tokio", @@ -3372,7 +3375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88d6731146462ea25d9244b2ed5fd1d716d25c52e4d54aa4fb0f3c4e9854dbe2" dependencies = [ "lazy_static", - "windows-sys", + "windows-sys 0.36.1", ] [[package]] @@ -3383,9 +3386,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "scratch" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898" +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" [[package]] name = "sct" @@ -3422,39 +3425,107 @@ dependencies = [ [[package]] name = "semver" -version = "0.9.0" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +checksum = "58bc9567378fc7690d6b2addae4e60ac2eeea07becb2c64b9f218b53865cba2a" + +[[package]] +name = "sentry" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17ad137b9df78294b98cab1a650bef237cc6c950e82e5ce164655e674d07c5cc" dependencies = [ - "semver-parser", + "httpdate", + "native-tls", + "reqwest", + "sentry-backtrace", + 
"sentry-contexts", + "sentry-core", + "sentry-panic", + "tokio", + "ureq", ] [[package]] -name = "semver" -version = "1.0.14" +name = "sentry-backtrace" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e25dfac463d778e353db5be2449d1cce89bd6fd23c9f1ea21310ce6e5a1b29c4" +checksum = "afe4800806552aab314129761d5d3b3d422284eca3de2ab59e9fd133636cbd3d" +dependencies = [ + "backtrace", + "once_cell", + "regex", + "sentry-core", +] [[package]] -name = "semver-parser" -version = "0.7.0" +name = "sentry-contexts" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +checksum = "a42938426670f6e7974989cd1417837a96dd8bbb01567094f567d6acb360bf88" +dependencies = [ + "hostname", + "libc", + "os_info", + "rustc_version", + "sentry-core", + "uname", +] + +[[package]] +name = "sentry-core" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4df9b9d8de2658a1ecd4e45f7b06c80c5dd97b891bfbc7c501186189b7e9bbdf" +dependencies = [ + "once_cell", + "rand", + "sentry-types", + "serde", + "serde_json", +] + +[[package]] +name = "sentry-panic" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0af37b8500f273e511ebd6eb0d342ff7937d64ce3f134764b2b4653112d48cb4" +dependencies = [ + "sentry-backtrace", + "sentry-core", +] + +[[package]] +name = "sentry-types" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccc95faa4078768a6bf8df45e2b894bbf372b3dbbfb364e9429c1c58ab7545c6" +dependencies = [ + "debugid 0.8.0", + "getrandom", + "hex", + "serde", + "serde_json", + "thiserror", + "time", + "url", + "uuid 1.2.2", +] [[package]] name = "serde" -version = "1.0.145" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" +checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.145" +version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" +checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" dependencies = [ "proc-macro2", "quote", @@ -3463,9 +3534,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.86" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074" +checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" dependencies = [ "itoa", "ryu", @@ -3486,25 +3557,25 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368f2d60d049ea019a84dcd6687b0d1e0030fe663ae105039bdf967ed5e6a9a7" +checksum = "25bf4a5a814902cd1014dbccfa4d4560fb8432c779471e96e035602519f82eef" dependencies = [ - "base64", + "base64 0.13.1", "chrono", "hex", "indexmap", "serde", "serde_json", "serde_with_macros", - "time 0.3.15", + "time", ] [[package]] name = "serde_with_macros" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ccadfacf6cf10faad22bbadf55986bdd0856edfb5d9210aa1dcf1f516e84e93" +checksum = 
"e3452b4c0f6c1e357f73fdb87cd1efabaa12acf328c7a528e252893baeb3f4aa" dependencies = [ "darling", "proc-macro2", @@ -3588,7 +3659,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -3653,9 +3724,11 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" name = "storage_broker" version = "0.1.0" dependencies = [ + "anyhow", "async-stream", "bytes", - "clap 4.0.15", + "clap 4.0.32", + "const_format", "futures", "futures-core", "futures-util", @@ -3665,11 +3738,11 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost 0.11.2", + "prost", "tokio", "tokio-stream", - "tonic 0.8.2", - "tonic-build 0.8.2", + "tonic", + "tonic-build", "tracing", "utils", "workspace_hack", @@ -3734,7 +3807,7 @@ version = "8.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540" dependencies = [ - "debugid", + "debugid 0.7.3", "memmap2", "stable_deref_trait", "uuid 0.8.2", @@ -3753,9 +3826,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.102" +version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" dependencies = [ "proc-macro2", "quote", @@ -3823,24 +3896,24 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.15.1" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ "proc-macro2", "quote", @@ -3858,33 +3931,30 @@ dependencies = [ [[package]] name = "time" -version = "0.1.44" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d634a985c4d4238ec39cacaed2e7ae552fbd3c476b552c1deac3021b7d7eaf0c" +checksum = "a561bf4617eebd33bca6434b988f39ed798e527f51a1e797d0ee4f61c0a38376" dependencies = [ "itoa", - "libc", - "num_threads", "serde", + "time-core", "time-macros", ] [[package]] -name = "time-macros" -version = "0.2.4" +name = "time-core" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + +[[package]] +name = "time-macros" +version = "0.2.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d967f99f534ca7e495c575c62638eebc2898a8c84c119b89e250477bc4ba16b2" +dependencies = [ + "time-core", +] [[package]] name = "tinytemplate" @@ -3943,25 +4013,36 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "1.8.0" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9724f9a975fb987ef7a3cd9be0350edcbe130698af5b8f7a631e23d42d052484" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-postgres" -version = "0.7.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=d052ee8b86fff9897c77b0fe89ea9daba0e1fa38#d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" +version = "0.7.7" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=43e6db254a97fdecbce33d8bc0890accfd74495e#43e6db254a97fdecbce33d8bc0890accfd74495e" dependencies = [ "async-trait", "byteorder", "bytes", "fallible-iterator", - "futures", + "futures-channel", + "futures-util", "log", "parking_lot 0.12.1", "percent-encoding", @@ -4026,9 +4107,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.5.9" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d82e1a7758622a465f8cee077614c73484dac5b836c02ff6a40d5d1010324d7" +checksum = "1333c76748e868a4d9d1017b5ab53171dfd095f70c712fdb4653a406547f598f" dependencies = [ "serde", ] @@ -4047,14 +4128,14 @@ dependencies = [ [[package]] name = "tonic" -version = "0.7.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb" +checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.13.1", "bytes", "futures-core", "futures-util", @@ -4065,41 +4146,12 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.10.4", - "prost-derive 0.10.1", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - -[[package]] -name = "tonic" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost 0.11.2", - "prost-derive 0.11.2", + "prost", + "prost-derive", + "rustls-native-certs", + "rustls-pemfile", "tokio", + "tokio-rustls", "tokio-stream", "tokio-util", "tower", @@ -4111,26 +4163,13 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.7.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1" +checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4" dependencies = [ "prettyplease", "proc-macro2", - "prost-build 0.10.4", - "quote", - "syn", -] - -[[package]] -name = "tonic-build" -version 
= "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" -dependencies = [ - "prettyplease", - "proc-macro2", - "prost-build 0.11.2", + "prost-build", "quote", "syn", ] @@ -4157,9 +4196,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858" dependencies = [ "bitflags", "bytes", @@ -4280,9 +4319,18 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" [[package]] name = "typenum" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + +[[package]] +name = "uname" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b72f89f0ca32e4db1c04e2a72f5345d59796d4866a1ee0609084569f73683dc8" +dependencies = [ + "libc", +] [[package]] name = "unicode-bidi" @@ -4292,9 +4340,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" [[package]] name = "unicode-ident" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" [[package]] name = "unicode-normalization" @@ -4323,6 +4371,19 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +[[package]] +name = "ureq" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +dependencies = [ + "base64 0.13.1", + "log", + "native-tls", + "once_cell", + "url", +] + [[package]] name = "url" version = "2.3.1" @@ -4332,6 +4393,7 @@ dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -4356,7 +4418,7 @@ dependencies = [ "hyper", "jsonwebtoken", "metrics", - "nix 0.25.0", + "nix 0.25.1", "once_cell", "pq_proto", "rand", @@ -4364,6 +4426,7 @@ dependencies = [ "rustls", "rustls-pemfile", "rustls-split", + "sentry", "serde", "serde_json", "serde_with", @@ -4387,9 +4450,9 @@ checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" [[package]] name = "uuid" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +checksum = "422ee0de9031b5b948b97a8fc04e3aa35230001a722ddd27943e0be31564ce4c" dependencies = [ "getrandom", "serde", @@ -4402,10 +4465,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] -name = "vcell" -version = "0.1.3" +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = 
"version_check" @@ -4413,27 +4476,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - -[[package]] -name = "volatile-register" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6" -dependencies = [ - "vcell", -] - [[package]] name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.15", + "clap 4.0.32", "env_logger", "log", "once_cell", @@ -4464,12 +4512,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -4564,9 +4606,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368bfe657969fb01238bb756d351dcade285e0f6fcbd36dcb23359a5169975be" +checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" dependencies = [ "webpki", ] @@ -4619,43 +4661,100 @@ version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" dependencies = [ - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_msvc", + "windows_aarch64_msvc 0.36.1", + "windows_i686_gnu 0.36.1", + "windows_i686_msvc 0.36.1", + "windows_x86_64_gnu 0.36.1", + "windows_x86_64_msvc 0.36.1", ] +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + [[package]] name = "windows_aarch64_msvc" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + [[package]] name = "windows_i686_gnu" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + [[package]] name = "windows_i686_msvc" version = "0.36.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + [[package]] name = "windows_x86_64_gnu" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + [[package]] name = "windows_x86_64_msvc" version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" + [[package]] name = "winreg" version = "0.10.1" @@ -4672,7 +4771,8 @@ dependencies = [ "ahash", "anyhow", "bytes", - "clap 4.0.15", + "chrono", + "clap 4.0.32", "crossbeam-utils", "either", "fail", @@ -4688,22 +4788,23 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "prost 0.10.4", - "prost 0.11.2", + "prost", "rand", "regex", "regex-syntax", "reqwest", "scopeguard", "serde", + "serde_json", + "socket2", "stable_deref_trait", "syn", - "time 0.3.15", "tokio", "tokio-util", "tower", "tracing", "tracing-core", + "url", ] [[package]] @@ -4713,7 +4814,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0ecbeb7b67ce215e40e3cc7f2ff902f94a223acf44995934763467e7b1febc8" dependencies = [ "asn1-rs", - "base64", + "base64 0.13.1", "data-encoding", "der-parser", "lazy_static", @@ -4721,7 +4822,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time 0.3.15", + "time", ] [[package]] @@ -4741,11 +4842,11 @@ checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" [[package]] name = "yasna" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c" +checksum = "aed2e7a52e3744ab4d0c05c20aa065258e84c49fd4226f5191b2ed29712710b4" dependencies = [ - "time 0.3.15", + "time", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2f73215d3f..927900d5c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -86,4 +86,4 @@ lto = true # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. 
[patch.crates-io] -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } diff --git a/Dockerfile b/Dockerfile index f0244fa8d3..0d5ba73456 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,7 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ && /usr/local/bin/pageserver -D /data/.neon/ --init \ -c "id=1234" \ - -c "broker_endpoints=['http://etcd:2379']" \ + -c "broker_endpoint='http://storage_broker:50051'" \ -c "pg_distrib_dir='/usr/local/'" \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index ad036338a0..e7fba49bb1 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include -# Remove now-useless PGXS src infrastructure -RUN rm -r /usr/local/pgsql/lib/pgxs/src - # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. @@ -207,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index 4526644421..cd03525b97 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -170,9 +170,6 @@ RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgto # Remove headers that we won't need anymore - we've completed installation of all extensions RUN rm -r /usr/local/pgsql/include -# Remove now-useless PGXS src infrastructure -RUN rm -r /usr/local/pgsql/lib/pgxs/src - # Remove static postgresql libraries - all compilation is finished, so we # can now remove these files - they must be included in other binaries by now # if they were to be used by other libraries. @@ -207,7 +204,8 @@ RUN apt update && \ libgeos-c1v5 \ libgdal28 \ libproj19 \ - libprotobuf-c1 && \ + libprotobuf-c1 \ + gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* USER postgres diff --git a/Makefile b/Makefile index 4711dc1c7d..92a4532684 100644 --- a/Makefile +++ b/Makefile @@ -61,146 +61,115 @@ all: neon postgres neon-pg-ext # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-v14-headers postgres-v15-headers +neon: postgres-headers +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) ### PostgreSQL parts -# The rules are duplicated for Postgres v14 and 15. We may want to refactor +# Some rules are duplicated for Postgres v14 and 15. We may want to refactor # to avoid the duplication in the future, but it's tolerable for now. 
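+# The rules below use GNU Make pattern rules: '%' matches the version suffix (v14 or v15)
+# and '$*' expands to that suffix inside the recipe, so a single rule now covers every
+# vendored Postgres version.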
# -$(POSTGRES_INSTALL_DIR)/build/v14/config.status: - +@echo "Configuring Postgres v14 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14 - (cd $(POSTGRES_INSTALL_DIR)/build/v14 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \ +$(POSTGRES_INSTALL_DIR)/build/%/config.status: + +@echo "Configuring Postgres $* build" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* + (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ CFLAGS='$(PG_CFLAGS)' \ $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log) - -$(POSTGRES_INSTALL_DIR)/build/v15/config.status: - +@echo "Configuring Postgres v15 build" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15 - (cd $(POSTGRES_INSTALL_DIR)/build/v15 && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \ - CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log) + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) # nicer alias to run 'configure' -.PHONY: postgres-v14-configure -postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status - -.PHONY: postgres-v15-configure -postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +# Note: I've been unable to use templates for this part of our configuration. +# I'm not sure why it wouldn't work, but this is the only place (apart from +# the "build-all-versions" entry points) where direct mention of PostgreSQL +# versions is used. +.PHONY: postgres-configure-v15 +postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status +.PHONY: postgres-configure-v14 +postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status # Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include -.PHONY: postgres-v14-headers -postgres-v14-headers: postgres-v14-configure - +@echo "Installing PostgreSQL v14 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install - -.PHONY: postgres-v15-headers -postgres-v15-headers: postgres-v15-configure - +@echo "Installing PostgreSQL v15 headers" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install +.PHONY: postgres-headers-% +postgres-headers-%: postgres-configure-% + +@echo "Installing PostgreSQL $* headers" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install # Compile and install PostgreSQL -.PHONY: postgres-v14 -postgres-v14: postgres-v14-configure \ - postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install - +@echo "Compiling libpq v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install - +@echo "Compiling pageinspect v14" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install +.PHONY: postgres-% +postgres-%: postgres-configure-% \ + postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` + +@echo "Compiling PostgreSQL $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install + +@echo "Compiling libpq $*" + $(MAKE) -C 
$(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install + +@echo "Compiling pg_prewarm $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install + +@echo "Compiling pg_buffercache $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pageinspect $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install -.PHONY: postgres-v15 -postgres-v15: postgres-v15-configure \ - postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install - +@echo "Compiling libpq v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install - +@echo "Compiling pg_prewarm v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install - +@echo "Compiling pageinspect v15" - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install +.PHONY: postgres-clean-% +postgres-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean -# shorthand to build all Postgres versions -postgres: postgres-v14 postgres-v15 +.PHONY: neon-pg-ext-% +neon-pg-ext-%: postgres-% + +@echo "Compiling neon $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install + +@echo "Compiling neon_walredo $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install + +@echo "Compiling neon_test_utils $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install -.PHONY: postgres-v14-clean -postgres-v14-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean +.PHONY: neon-pg-ext-clean-% +neon-pg-ext-clean-%: + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean -.PHONY: postgres-v15-clean -postgres-v15-clean: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean - -neon-pg-ext-v14: postgres-v14 - +@echo "Compiling neon 
v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v14" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v14 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) - -neon-pg-ext-v15: postgres-v15 - +@echo "Compiling neon v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install) - +@echo "Compiling neon_walredo v15" - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install) - +@echo "Compiling neon_test_utils" v15 - mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 - (cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \ - $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ - -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install) +.PHONY: neon-pg-ext +neon-pg-ext: \ + neon-pg-ext-v14 \ + neon-pg-ext-v15 .PHONY: neon-pg-ext-clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean - $(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean +neon-pg-ext-clean: \ + neon-pg-ext-clean-v14 \ + neon-pg-ext-clean-v15 -neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15 -postgres-headers: postgres-v14-headers postgres-v15-headers -postgres-clean: postgres-v14-clean postgres-v15-clean +# shorthand to build all Postgres versions +.PHONY: postgres +postgres: \ + postgres-v14 \ + postgres-v15 + +.PHONY: postgres-headers +postgres-headers: \ + postgres-headers-v14 \ + postgres-headers-v15 + +.PHONY: postgres-clean +postgres-clean: \ + postgres-clean-v14 \ + postgres-clean-v15 # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: - cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean - cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean +clean: postgres-clean neon-pg-ext-clean $(CARGO_CMD_PREFIX) cargo clean - cd pgxn/neon && $(MAKE) clean - cd pgxn/neon_test_utils && $(MAKE) clean # This removes everything .PHONY: distclean diff --git a/README.md b/README.md index cda36008d8..30bde949a9 100644 --- a/README.md +++ b/README.md @@ -2,29 +2,20 @@ Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. -The project used to be called "Zenith". Many of the commands and code comments -still refer to "zenith", but we are in the process of renaming things. - ## Quick start -[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. 
Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor. +Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). ## Architecture overview -A Neon installation consists of compute nodes and a Neon storage engine. - -Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. +A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: - Pageserver. Scalable storage backend for the compute nodes. -- WAL service. The service receives WAL from the compute node and ensures that it is stored durably. +- Safekeepers. The safekeepers form a redundant WAL service that receives WAL from the compute node and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. -Pageserver consists of: -- Repository - Neon storage implementation. -- WAL receiver - service that receives WAL from WAL service and stores it in the repository. -- Page service - service that communicates with compute nodes and responds with pages from the repository. -- WAL redo - service that builds pages from base images and WAL records on Page service request +See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information. ## Running local installation @@ -35,12 +26,12 @@ Pageserver consists of: * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client protobuf-compiler +libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ - libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib protobuf-compiler + libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -53,7 +44,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. Install XCode and dependencies ``` xcode-select --install -brew install protobuf etcd openssl flex bison +brew install protobuf openssl flex bison ``` 2. [Install Rust](https://www.rust-lang.org/tools/install) @@ -132,12 +123,12 @@ Stopped pageserver 1 process with pid 2545906 # start pageserver and safekeeper > ./target/debug/neon_local start -Starting etcd broker using "/usr/bin/etcd" -etcd started, pid: 2545996 +Starting neon broker at 127.0.0.1:50051 +storage_broker started, pid: 2918372 Starting pageserver at '127.0.0.1:64000' in '.neon'. -pageserver started, pid: 2546005 +pageserver started, pid: 2918386 Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'. 
-safekeeper 1 started, pid: 2546041 +safekeeper 1 started, pid: 2918437 # start postgres compute node > ./target/debug/neon_local pg start main @@ -229,12 +220,20 @@ CARGO_BUILD_FLAGS="--features=testing" make ## Documentation -Now we use README files to cover design ideas and overall architecture for each module and `rustdoc` style documentation comments. See also [/docs/](/docs/) a top-level overview of all available markdown documentation. +[/docs/](/docs/) Contains a top-level overview of all available markdown documentation. - [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout. To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open` +See also README files in some source directories, and `rustdoc` style documentation comments. + +Other resources: + +- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture +- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas +- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series + ### Postgres-specific terms Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used. diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index d6f8fae34c..c40d870649 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -5,19 +5,19 @@ edition = "2021" [dependencies] anyhow = "1.0" -chrono = "0.4" +chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = "4.0" env_logger = "0.9" futures = "0.3.13" hyper = { version = "0.14", features = ["full"] } log = { version = "0.4", features = ["std", "serde"] } notify = "5.0.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" serde = { version = "1.0", features = ["derive"] } serde_json = "1" tar = "0.4" tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 7786d7af9c..f3b787209d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -105,7 +105,7 @@ fn main() -> Result<()> { tenant, timeline, pageserver_connstr, - metrics: ComputeMetrics::new(), + metrics: ComputeMetrics::default(), state: RwLock::new(ComputeState::new()), }; let compute = Arc::new(compute_state); diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index b6ba1692f9..ee1605c814 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -5,7 +5,7 @@ use tokio_postgres::NoTls; use crate::compute::ComputeNode; -pub fn create_writablity_check_data(client: &mut Client) -> Result<()> { +pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( id serial primary 
key, diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index bfdd2340ec..c2c9ab2230 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,11 +23,11 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::info; +use log::{info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; -use crate::checker::create_writablity_check_data; +use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; use crate::spec::*; @@ -91,7 +91,7 @@ pub enum ComputeStatus { Failed, } -#[derive(Serialize)] +#[derive(Default, Serialize)] pub struct ComputeMetrics { pub sync_safekeepers_ms: AtomicU64, pub basebackup_ms: AtomicU64, @@ -99,23 +99,6 @@ pub struct ComputeMetrics { pub total_startup_ms: AtomicU64, } -impl ComputeMetrics { - pub fn new() -> Self { - Self { - sync_safekeepers_ms: AtomicU64::new(0), - basebackup_ms: AtomicU64::new(0), - config_ms: AtomicU64::new(0), - total_startup_ms: AtomicU64::new(0), - } - } -} - -impl Default for ComputeMetrics { - fn default() -> Self { - Self::new() - } -} - impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { self.state.write().unwrap().status = status; @@ -175,7 +158,7 @@ impl ComputeNode { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) - .args(&["--sync-safekeepers"]) + .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .stdout(Stdio::piped()) .spawn() @@ -253,7 +236,7 @@ impl ComputeNode { // Run postgres as a child process. let mut pg = Command::new(&self.pgbin) - .args(&["-D", &self.pgdata]) + .args(["-D", &self.pgdata]) .spawn() .expect("cannot start postgres process"); @@ -292,7 +275,7 @@ impl ComputeNode { handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; handle_grants(self, &mut client)?; - create_writablity_check_data(&mut client)?; + create_writability_check_data(&mut client)?; // 'Close' connection drop(client); @@ -328,6 +311,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); + self.check_for_core_dumps() + .expect("failed to check for core dumps"); + Ok(ecode) } @@ -343,4 +329,68 @@ impl ComputeNode { self.prepare_pgdata()?; self.run() } + + // Look for core dumps and collect backtraces. + // + // EKS worker nodes have the following core dump settings: + // /proc/sys/kernel/core_pattern -> core + // /proc/sys/kernel/core_uses_pid -> 1 + // ulimit -c -> unlimited + // which results in core dumps being written to the postgres data directory as core.<pid>. + // + // Use that as a default location and pattern, except on macOS, where core dumps are written + // to the /cores/ directory by default. 
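+    //
+    // Note: gdb is added to the compute node images in this same change (see the
+    // Dockerfile.compute-node-v14/-v15 hunks above), so gdb is the debugger we expect to
+    // find in production; the lldb path below is only a fallback for local runs on macOS.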
+ fn check_for_core_dumps(&self) -> Result<()> { + let core_dump_dir = match std::env::consts::OS { + "macos" => Path::new("/cores/"), + _ => Path::new(&self.pgdata), + }; + + // Collect core dump paths if any + info!("checking for core dumps in {}", core_dump_dir.display()); + let files = fs::read_dir(core_dump_dir)?; + let cores = files.filter_map(|entry| { + let entry = entry.ok()?; + let _ = entry.file_name().to_str()?.strip_prefix("core.")?; + Some(entry.path()) + }); + + // Print backtrace for each core dump + for core_path in cores { + warn!( + "core dump found: {}, collecting backtrace", + core_path.display() + ); + + // Try first with gdb + let backtrace = Command::new("gdb") + .args(["--batch", "-q", "-ex", "bt", &self.pgbin]) + .arg(&core_path) + .output(); + + // Try lldb if no gdb is found -- that is handy for local testing on macOS + let backtrace = match backtrace { + Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => { + warn!("cannot find gdb, trying lldb"); + Command::new("lldb") + .arg("-c") + .arg(&core_path) + .args(["--batch", "-o", "bt all", "-o", "quit"]) + .output() + } + _ => backtrace, + }?; + + warn!( + "core dump backtrace: {}", + String::from_utf8_lossy(&backtrace.stdout) + ); + warn!( + "debugger stderr: {}", + String::from_utf8_lossy(&backtrace.stderr) + ); + } + + Ok(()) + } } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4c8bbc608b..44f83e5003 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,29 +9,11 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode}; use log::{error, info}; use serde_json; -use crate::compute::{ComputeNode, ComputeStatus}; +use crate::compute::ComputeNode; // Service function to handle all available routes. async fn routes(req: Request, compute: Arc) -> Response { match (req.method(), req.uri().path()) { - // Timestamp of the last Postgres activity in the plain text. - // DEPRECATED in favour of /status - (&Method::GET, "/last_activity") => { - info!("serving /last_active GET request"); - let state = compute.state.read().unwrap(); - - // Use RFC3339 format for consistency. - Response::new(Body::from(state.last_active.to_rfc3339())) - } - - // Has compute setup process finished? -> true/false. - // DEPRECATED in favour of /status - (&Method::GET, "/ready") => { - info!("serving /ready GET request"); - let status = compute.get_status(); - Response::new(Body::from(format!("{}", status == ComputeStatus::Running))) - } - // Serialized compute state. 
(&Method::GET, "/status") => { info!("serving /status GET request"); @@ -46,16 +28,6 @@ async fn routes(req: Request, compute: Arc) -> Response Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) } - // DEPRECATED, use POST instead - (&Method::GET, "/check_writability") => { - info!("serving /check_writability GET request"); - let res = crate::checker::check_writability(&compute).await; - match res { - Ok(_) => Response::new(Body::from("true")), - Err(e) => Response::new(Body::from(e.to_string())), - } - } - (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); let res = crate::checker::check_writability(&compute).await; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 9c0f8e3ccd..a857531d26 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,58 +37,7 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" - /ready: - get: - deprecated: true - tags: - - "info" - summary: Check whether compute startup process finished successfully - description: "" - operationId: computeIsReady - responses: - "200": - description: Compute is ready ('true') or not ('false') - content: - text/plain: - schema: - type: string - example: "true" - - /last_activity: - get: - deprecated: true - tags: - - "info" - summary: Get timestamp of the last compute activity - description: "" - operationId: getLastComputeActivityTS - responses: - "200": - description: Timestamp of the last compute activity - content: - text/plain: - schema: - type: string - example: "2022-10-12T07:20:50.52Z" - /check_writability: - get: - deprecated: true - tags: - - "check" - summary: Check that we can write new data on this compute - description: "" - operationId: checkComputeWritabilityDeprecated - responses: - "200": - description: Check result - content: - text/plain: - schema: - type: string - description: Error text or 'true' if check passed - example: "true" - post: tags: - "check" diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 58cdf796bc..1588f5d62e 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -74,10 +74,8 @@ fn watch_compute_activity(compute: &ComputeNode) { } } - // Sort idle backend `state_change` timestamps. The last one corresponds - // to the last activity. - idle_backs.sort(); - if let Some(last) = idle_backs.last() { + // Get idle backend `state_change` with the max timestamp. + if let Some(last) = idle_backs.iter().max() { last_active = *last; } } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 289f223bda..ff422f1cf5 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -119,16 +119,9 @@ pub trait GenericOptionsSearch { impl GenericOptionsSearch for GenericOptions { /// Lookup option by name fn find(&self, name: &str) -> Option { - match &self { - Some(ops) => { - let op = ops.iter().find(|s| s.name == name); - match op { - Some(op) => op.value.clone(), - None => None, - } - } - None => None, - } + let ops = self.as_ref()?; + let op = ops.iter().find(|s| s.name == name)?; + op.value.clone() } } @@ -161,6 +154,14 @@ impl Role { } impl Database { + pub fn new(name: PgIdent, owner: PgIdent) -> Self { + Self { + name, + owner, + options: None, + } + } + /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. 
/// NB: `TEMPLATE` is actually also an identifier, but so far we only need @@ -219,11 +220,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? .iter() - .map(|row| Database { - name: row.get("datname"), - owner: row.get("owner"), - options: None, - }) + .map(|row| Database::new(row.get("datname"), row.get("owner"))) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 24cad4663a..431d9794bc 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -38,4 +38,33 @@ mod pg_helpers_tests { assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\""); } + + #[test] + fn generic_options_search() { + let generic_options: GenericOptions = Some(vec![ + GenericOption { + name: "present_value".into(), + value: Some("value".into()), + vartype: "string".into(), + }, + GenericOption { + name: "missed_value".into(), + value: None, + vartype: "int".into(), + }, + ]); + assert_eq!(generic_options.find("present_value"), Some("value".into())); + assert_eq!(generic_options.find("missed_value"), None); + assert_eq!(generic_options.find("invalid_value"), None); + + let empty_generic_options: GenericOptions = Some(vec![]); + assert_eq!(empty_generic_options.find("present_value"), None); + assert_eq!(empty_generic_options.find("missed_value"), None); + assert_eq!(empty_generic_options.find("invalid_value"), None); + + let none_generic_options: GenericOptions = None; + assert_eq!(none_generic_options.find("present_value"), None); + assert_eq!(none_generic_options.find("missed_value"), None); + assert_eq!(none_generic_options.find("invalid_value"), None); + } } diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 2ab48fa76c..180508a01a 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -10,7 +10,7 @@ comfy-table = "6.1" git-version = "0.3.5" nix = "0.25" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } serde = { version = "1.0", features = ["derive"] } @@ -25,5 +25,7 @@ url = "2.2.2" pageserver_api = { path = "../libs/pageserver_api" } postgres_connection = { path = "../libs/postgres_connection" } safekeeper_api = { path = "../libs/safekeeper_api" } +# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. 
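+# control_plane pulls in storage_broker at least for `storage_broker::DEFAULT_LISTEN_ADDR` (used in neon_local.rs below).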
+storage_broker = { version = "0.1", path = "../storage_broker" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/simple.conf b/control_plane/simple.conf index ae60657400..6014e8dffd 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -10,5 +10,5 @@ id = 1 pg_port = 5454 http_port = 7676 -[etcd_broker] -broker_endpoints = ['http://127.0.0.1:2379'] +[broker] +listen_addr = '127.0.0.1:50051' diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index d21a939cb7..8909e27c94 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,17 +14,19 @@ use std::ffi::OsStr; use std::io::Write; -use std::path::Path; +use std::os::unix::prelude::AsRawFd; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; use std::process::{Child, Command}; use std::time::Duration; use std::{fs, io, thread}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::Context; use nix::errno::Errno; +use nix::fcntl::{FcntlArg, FdFlag}; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; - -use utils::lock_file; +use utils::pid_file::{self, PidFileRead}; // These constants control the loop used to poll for process start / stop. // @@ -49,21 +51,21 @@ pub enum InitialPidFile<'t> { } /// Start a background child process using the parameters given. -pub fn start_process< - F, - S: AsRef<OsStr>, - EI: IntoIterator<Item = (String, String)>, // Not generic AsRef, otherwise empty `envs` prevents type inference ->( +pub fn start_process<F, AI, A, EI>( process_name: &str, datadir: &Path, command: &Path, - args: &[S], + args: AI, envs: EI, initial_pid_file: InitialPidFile, process_status_check: F, ) -> anyhow::Result<Child> where F: Fn() -> anyhow::Result<bool>, + AI: IntoIterator<Item = A>, + A: AsRef<OsStr>, + // Not generic AsRef, otherwise empty `envs` prevents type inference + EI: IntoIterator<Item = (String, String)>, { let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() @@ -86,6 +88,14 @@ where let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); + let pid_file_to_check = match initial_pid_file { + InitialPidFile::Create(path) => { + pre_exec_create_pidfile(filled_cmd, path); + path + } + InitialPidFile::Expect(path) => path, + }; + let mut spawned_process = filled_cmd.spawn().with_context(|| { format!("Could not spawn {process_name}, see console output and log files for details.") })?; @@ -95,29 +105,8 @@ where .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, ); - let pid_file_to_check = match initial_pid_file { - InitialPidFile::Create(target_pid_file_path) => { - match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) { - lock_file::LockCreationResult::Created { .. } => { - // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon - // as this CLI invocation exits, so it's a bit useless, but doesn't any harm either. - } - lock_file::LockCreationResult::AlreadyLocked { .. 
} => { - anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process") - } - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!( - "Failed to create pid file for {process_name} at path {target_pid_file_path:?}" - ))) - } - } - None - } - InitialPidFile::Expect(pid_file_path) => Some(pid_file_path), - }; - for retries in 0..RETRIES { - match process_started(pid, pid_file_to_check, &process_status_check) { + match process_started(pid, Some(pid_file_to_check), &process_status_check) { Ok(true) => { println!("\n{process_name} started, pid: {pid}"); return Ok(spawned_process); @@ -165,12 +154,27 @@ pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<() /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> { - if !pid_file.exists() { - println!("{process_name} is already stopped: no pid file {pid_file:?} is present"); - return Ok(()); - } - let pid = read_pidfile(pid_file)?; + let pid = match pid_file::read(pid_file) + .with_context(|| format!("read pid_file {pid_file:?}"))? + { + PidFileRead::NotExist => { + println!("{process_name} is already stopped: no pid file present at {pid_file:?}"); + return Ok(()); + } + PidFileRead::NotHeldByAnyProcess(_) => { + // Don't try to kill according to file contents because the pid might have been re-used by another process. + // Don't delete the file either, it can race with new pid file creation. + // Read `pid_file` module comment for details. + println!( + "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}" + ); + return Ok(()); + } + PidFileRead::LockedByOtherProcess(pid) => pid, + }; + // XXX the pid could become invalid (and recycled) at any time before the kill() below. + + // send signal let sig = if immediate { print!("Stopping {process_name} with pid {pid} immediately.."); Signal::SIGQUIT @@ -182,8 +186,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any match kill(pid, sig) { Ok(()) => (), Err(Errno::ESRCH) => { + // Again, don't delete the pid file. The unlink can race with a new pid file being created. println!( - "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found" + "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone." ); return Ok(()); } @@ -252,6 +257,69 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +/// Add a `pre_exec` to the cmd that, in between fork() and exec(), +/// 1. Claims a pidfile with a fcntl lock on it and +/// 2. Sets up the pidfile's file descriptor so that it (and the lock) +/// will remain held until the cmd exits. +fn pre_exec_create_pidfile<P>
(cmd: &mut Command, path: P) -> &mut Command +where + P: Into<PathBuf>, +{ + let path: PathBuf = path.into(); + // SAFETY + // pre_exec is marked unsafe because it runs between fork and exec. + // Why is that dangerous in various ways? + // Long answer: https://github.com/rust-lang/rust/issues/39575 + // Short answer: in a multi-threaded program, other threads may have + // been inside of critical sections at the time of fork. In the + // original process, that was all right, assuming they protected + // the critical sections appropriately, e.g., through locks. + // Fork adds another process to the mix that + // 1. Has a single thread T + // 2. Runs in an exact copy of the address space at the time of fork. + // A variety of problems can occur now: + // 1. T tries to grab a lock that was locked at the time of fork. + // It will wait forever since in its address space, the lock + // is in state 'taken' but the thread that would unlock it is + // not there. + // 2. A Rust object that represented some external resource in the + // parent now got implicitly copied by the fork, even though + // the object's type is not `Copy`. The parent program may use + // non-copyability as a way to enforce unique ownership of an + // external resource in the type system. The fork breaks that + // assumption, as now both parent and child process have an + // owned instance of the object that represents the same + // underlying resource. + // While these seem like niche problems, (1) in particular is + // highly relevant. For example, `malloc()` may grab a mutex internally, + // and so, if we forked while another thread was malloc'ing and our + // pre_exec closure allocates as well, it will block on the malloc + // mutex forever. + // + // The proper solution is to only use C library functions that are marked + // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html + // + // With this specific pre_exec() closure, the non-error path doesn't allocate. + // The error path uses `anyhow`, and hence does allocate. + // We take our chances there, hoping that any potential disaster is constrained + // to the child process (e.g., malloc has no state outside of the child process). + // Last, `expect` prints to stderr, and stdio is not async-signal-safe. + // Again, we take our chances, making the same assumptions as for malloc. + unsafe { + cmd.pre_exec(move || { + let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); + // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile + // remains locked after exec. + nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + .expect("remove FD_CLOEXEC"); + // Don't run drop(file), it would close the file before we actually exec. + std::mem::forget(file); + Ok(()) + }); + } + cmd +} + fn process_started( pid: Pid, pid_file_to_check: Option<&Path>, @@ -262,14 +330,11 @@ where { match status_check() { Ok(true) => match pid_file_to_check { - Some(pid_file_path) => { - if pid_file_path.exists() { - let pid_in_file = read_pidfile(pid_file_path)?; - Ok(pid_in_file == pid) - } else { - Ok(false) - } - } + Some(pid_file_path) => match pid_file::read(pid_file_path)? { + PidFileRead::NotExist => Ok(false), + PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid), + PidFileRead::NotHeldByAnyProcess(_) => Ok(false), + }, None => Ok(true), }, Ok(false) => Ok(false), @@ -277,21 +342,6 @@ where } } -/// Read a PID file -/// -/// We expect a file that contains a single integer. 
-fn read_pidfile(pidfile: &Path) -> Result { - let pid_str = fs::read_to_string(pidfile) - .with_context(|| format!("failed to read pidfile {pidfile:?}"))?; - let pid: i32 = pid_str - .parse() - .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?; - if pid < 1 { - bail!("pidfile {pidfile:?} contained bad value '{pid}'"); - } - Ok(Pid::from_raw(pid)) -} - fn process_has_stopped(pid: Pid) -> anyhow::Result { match kill(pid, None) { // Process exists, keep waiting diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 42a9199037..71de741640 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,10 +8,10 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; use control_plane::compute::ComputeControlPlane; -use control_plane::local_env::{EtcdBroker, LocalEnv}; +use control_plane::local_env::LocalEnv; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::{etcd, local_env}; +use control_plane::{broker, local_env}; use pageserver_api::models::TimelineInfo; use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, @@ -22,9 +22,10 @@ use safekeeper_api::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use std::collections::{BTreeSet, HashMap}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::process::exit; use std::str::FromStr; +use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -41,13 +42,12 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "14"; -fn default_conf(etcd_binary_path: &Path) -> String { +fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs -[etcd_broker] -broker_endpoints = ['http://localhost:2379'] -etcd_binary_path = '{etcd_binary_path}' +[broker] +listen_addr = '{DEFAULT_BROKER_ADDR}' [pageserver] id = {DEFAULT_PAGESERVER_ID} @@ -60,7 +60,6 @@ id = {DEFAULT_SAFEKEEPER_ID} pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} "#, - etcd_binary_path = etcd_binary_path.display(), pageserver_auth_type = AuthType::Trust, ) } @@ -298,7 +297,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { })? } else { // Built-in default config - default_conf(&EtcdBroker::locate_etcd()?) 
+ default_conf() }; let pg_version = init_match @@ -324,7 +323,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result { pg_version, ) .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e}"); + eprintln!("pageserver init failed: {e:?}"); exit(1); }); @@ -342,7 +341,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .get_many::("pageserver-config-override") .into_iter() .flatten() - .map(|s| s.as_str()) + .map(String::as_str) .collect() } @@ -550,7 +549,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { table.load_preset(comfy_table::presets::NOTHING); - table.set_header(&[ + table.set_header([ "NODE", "ADDRESS", "TIMELINE", @@ -585,7 +584,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { .map(|name| name.as_str()) .unwrap_or("?"); - table.add_row(&[ + table.add_row([ node_name.as_str(), &node.address.to_string(), &node.timeline_id.to_string(), @@ -748,7 +747,7 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result Resul } fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { - etcd::start_etcd_process(env)?; - let pageserver = PageServerNode::from_env(env); - // Postgres nodes are not started automatically + broker::start_broker_process(env)?; + + let pageserver = PageServerNode::from_env(env); if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) { - eprintln!("pageserver start failed: {e}"); - try_stop_etcd_process(env); + eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e); + try_stop_all(env, true); exit(1); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {e}", safekeeper.id); - try_stop_etcd_process(env); + eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); + try_stop_all(env, false); exit(1); } } @@ -833,35 +832,41 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); + try_stop_all(env, immediate); + + Ok(()) +} + +fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let pageserver = PageServerNode::from_env(env); // Stop all compute nodes - let cplane = ComputeControlPlane::load(env.clone())?; - for (_k, node) in cplane.nodes { - if let Err(e) = node.stop(false) { - eprintln!("postgres stop failed: {}", e); + match ComputeControlPlane::load(env.clone()) { + Ok(cplane) => { + for (_k, node) in cplane.nodes { + if let Err(e) = node.stop(false) { + eprintln!("postgres stop failed: {e:#}"); + } + } + } + Err(e) => { + eprintln!("postgres stop failed, could not restore control plane data from env: {e:#}") } } if let Err(e) = pageserver.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e); } for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); + eprintln!("safekeeper {} stop failed: {:#}", safekeeper.id, e); } } - try_stop_etcd_process(env); - - Ok(()) -} - -fn try_stop_etcd_process(env: &local_env::LocalEnv) { - if let Err(e) = etcd::stop_etcd_process(env) { - eprintln!("etcd stop failed: {e}"); + if let Err(e) = broker::stop_broker_process(env) { + eprintln!("neon broker stop failed: {e:#}"); } } @@ 
-901,6 +906,7 @@ fn cli() -> Command { let stop_mode_arg = Arg::new("stop-mode") .short('m') .value_parser(["fast", "immediate"]) + .default_value("fast") .help("If 'immediate', don't flush repository data at shutdown") .required(false) .value_name("stop-mode"); diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs new file mode 100644 index 0000000000..6c0604a076 --- /dev/null +++ b/control_plane/src/broker.rs @@ -0,0 +1,48 @@ +use anyhow::Context; + +use std::path::PathBuf; + +use crate::{background_process, local_env}; + +pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + let broker = &env.broker; + let listen_addr = &broker.listen_addr; + + print!("Starting neon broker at {}", listen_addr); + + let args = [format!("--listen-addr={listen_addr}")]; + + let client = reqwest::blocking::Client::new(); + background_process::start_process( + "storage_broker", + &env.base_data_dir, + &env.storage_broker_bin(), + args, + [], + background_process::InitialPidFile::Create(&storage_broker_pid_file_path(env)), + || { + let url = broker.client_url(); + let status_url = url.join("status").with_context(|| { + format!("Failed to append /status path to broker endpoint {url}",) + })?; + let request = client + .get(status_url) + .build() + .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?; + match client.execute(request) { + Ok(resp) => Ok(resp.status().is_success()), + Err(_) => Ok(false), + } + }, + ) + .context("Failed to spawn storage_broker subprocess")?; + Ok(()) +} + +pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { + background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env)) +} + +fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { + env.base_data_dir.join("storage_broker.pid") +} diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index 0eec25c51e..8731cf2583 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -44,7 +44,7 @@ impl ComputeControlPlane { let mut nodes = BTreeMap::default(); let pgdatadirspath = &env.pg_data_dirs_path(); - for tenant_dir in fs::read_dir(&pgdatadirspath) + for tenant_dir in fs::read_dir(pgdatadirspath) .with_context(|| format!("failed to list {}", pgdatadirspath.display()))? 
{ let tenant_dir = tenant_dir?; @@ -67,8 +67,8 @@ impl ComputeControlPlane { fn get_port(&mut self) -> u16 { 1 + self .nodes - .iter() - .map(|(_name, node)| node.address.port()) + .values() + .map(|node| node.address.port()) .max() .unwrap_or(self.base_port) } @@ -183,7 +183,7 @@ impl PostgresNode { fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(&pg_path); + let mut cmd = Command::new(pg_path); cmd.arg("--sync-safekeepers") .env_clear() @@ -201,7 +201,7 @@ impl PostgresNode { .stderr(Stdio::piped()); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let sync_handle = cmd @@ -261,7 +261,7 @@ impl PostgresNode { } fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(&self.pgdata()).with_context(|| { + fs::create_dir_all(self.pgdata()).with_context(|| { format!( "could not create data directory {}", self.pgdata().display() @@ -304,17 +304,17 @@ impl PostgresNode { // Set up authentication // - // $ZENITH_AUTH_TOKEN will be replaced with value from environment + // $NEON_AUTH_TOKEN will be replaced with value from environment // variable during compute pg startup. It is done this way because // otherwise user will be able to retrieve the value using SHOW // command or pg_settings let password = if let AuthType::NeonJWT = auth_type { - "$ZENITH_AUTH_TOKEN" + "$NEON_AUTH_TOKEN" } else { "" }; // NOTE avoiding spaces in connection string, because it is less error prone if we forward it somewhere. - // Also note that not all parameters are supported here. Because in compute we substitute $ZENITH_AUTH_TOKEN + // Also note that not all parameters are supported here. Because in compute we substitute $NEON_AUTH_TOKEN // We parse this string and build it back with token from env var, and for simplicity rebuild // uses only needed variables namely host, port, user, password. format!("postgresql://no_user:{password}@{host}:{port}") @@ -323,7 +323,7 @@ impl PostgresNode { conf.append_line(""); conf.append("neon.pageserver_connstring", &pageserver_connstr); if let AuthType::NeonJWT = auth_type { - conf.append("neon.safekeeper_token_env", "$ZENITH_AUTH_TOKEN"); + conf.append("neon.safekeeper_token_env", "$NEON_AUTH_TOKEN"); } conf.append("neon.tenant_id", &self.tenant_id.to_string()); conf.append("neon.timeline_id", &self.timeline_id.to_string()); @@ -448,7 +448,7 @@ impl PostgresNode { self.env.pg_lib_dir(self.pg_version)?.to_str().unwrap(), ); if let Some(token) = auth_token { - cmd.env("ZENITH_AUTH_TOKEN", token); + cmd.env("NEON_AUTH_TOKEN", token); } let pg_ctl = cmd.output().context("pg_ctl failed")?; @@ -478,7 +478,7 @@ impl PostgresNode { postgresql_conf_path.to_str().unwrap() ) })?; - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; self.create_pgdata()?; // 2. 
Bring back config files @@ -514,7 +514,7 @@ impl PostgresNode { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(&self.pgdata())?; + fs::remove_dir_all(self.pgdata())?; } else { self.pg_ctl(&["stop"], &None)?; } diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs deleted file mode 100644 index 031ffa539b..0000000000 --- a/control_plane/src/etcd.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::{fs, path::PathBuf}; - -use anyhow::Context; - -use crate::{background_process, local_env}; - -pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { - let etcd_broker = &env.etcd_broker; - print!( - "Starting etcd broker using {:?}", - etcd_broker.etcd_binary_path - ); - - let etcd_data_dir = env.base_data_dir.join("etcd"); - fs::create_dir_all(&etcd_data_dir) - .with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?; - - let client_urls = etcd_broker.comma_separated_endpoints(); - let args = [ - format!("--data-dir={}", etcd_data_dir.display()), - format!("--listen-client-urls={client_urls}"), - format!("--advertise-client-urls={client_urls}"), - // Set --quota-backend-bytes to keep the etcd virtual memory - // size smaller. Our test etcd clusters are very small. - // See https://github.com/etcd-io/etcd/issues/7910 - "--quota-backend-bytes=100000000".to_string(), - // etcd doesn't compact (vacuum) with default settings, - // enable it to prevent space exhaustion. - "--auto-compaction-mode=revision".to_string(), - "--auto-compaction-retention=1".to_string(), - ]; - - let pid_file_path = etcd_pid_file_path(env); - - let client = reqwest::blocking::Client::new(); - - background_process::start_process( - "etcd", - &etcd_data_dir, - &etcd_broker.etcd_binary_path, - &args, - [], - background_process::InitialPidFile::Create(&pid_file_path), - || { - for broker_endpoint in &etcd_broker.broker_endpoints { - let request = broker_endpoint - .join("health") - .with_context(|| { - format!( - "Failed to append /health path to broker endopint {}", - broker_endpoint - ) - }) - .and_then(|url| { - client.get(&url.to_string()).build().with_context(|| { - format!("Failed to construct request to etcd endpoint {url}") - }) - })?; - if client.execute(request).is_ok() { - return Ok(true); - } - } - - Ok(false) - }, - ) - .context("Failed to spawn etcd subprocess")?; - - Ok(()) -} - -pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { - background_process::stop_process(true, "etcd", &etcd_pid_file_path(env)) -} - -fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf { - env.base_data_dir.join("etcd.pid") -} diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 7c1007b133..6829479ad5 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -8,8 +8,8 @@ // mod background_process; +pub mod broker; pub mod compute; -pub mod etcd; pub mod local_env; pub mod pageserver; pub mod postgresql_conf; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ac4ebd0d1e..ea936640ec 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -4,12 +4,16 @@ //! script which will use local paths. 
use anyhow::{bail, ensure, Context}; + use reqwest::Url; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; use std::env; use std::fs; +use std::net::IpAddr; +use std::net::Ipv4Addr; +use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use utils::{ @@ -62,7 +66,7 @@ pub struct LocalEnv { #[serde(default)] pub private_key_path: PathBuf, - pub etcd_broker: EtcdBroker, + pub broker: NeonBroker, pub pageserver: PageServerConf, @@ -78,67 +82,26 @@ pub struct LocalEnv { branch_name_mappings: HashMap>, } -/// Etcd broker config for cluster internal communication. -#[serde_as] +/// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -pub struct EtcdBroker { - /// A prefix to all to any key when pushing/polling etcd from a node. - #[serde(default)] - pub broker_etcd_prefix: Option, - - /// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'. - #[serde(default)] - #[serde_as(as = "Vec")] - pub broker_endpoints: Vec, - - /// Etcd binary path to use. - #[serde(default)] - pub etcd_binary_path: PathBuf, +#[serde(default)] +pub struct NeonBroker { + /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'. + pub listen_addr: SocketAddr, } -impl EtcdBroker { - pub fn locate_etcd() -> anyhow::Result { - let which_output = Command::new("which") - .arg("etcd") - .output() - .context("Failed to run 'which etcd' command")?; - let stdout = String::from_utf8_lossy(&which_output.stdout); - ensure!( - which_output.status.success(), - "'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}", - which_output.status, - String::from_utf8_lossy(&which_output.stderr) - ); - - let etcd_path = PathBuf::from(stdout.trim()); - ensure!( - etcd_path.is_file(), - "'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}", - etcd_path.display() - ); - - Ok(etcd_path) +// Dummy Default impl to satisfy Deserialize derive. +impl Default for NeonBroker { + fn default() -> Self { + NeonBroker { + listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0), + } } +} - pub fn comma_separated_endpoints(&self) -> String { - self.broker_endpoints - .iter() - .map(|url| { - // URL by default adds a '/' path at the end, which is not what etcd CLI wants. 
- let url_string = url.as_str(); - if url_string.ends_with('/') { - &url_string[0..url_string.len() - 1] - } else { - url_string - } - }) - .fold(String::new(), |mut comma_separated_urls, url| { - if !comma_separated_urls.is_empty() { - comma_separated_urls.push(','); - } - comma_separated_urls.push_str(url); - comma_separated_urls - }) +impl NeonBroker { + pub fn client_url(&self) -> Url { + Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url") } } @@ -234,6 +197,10 @@ impl LocalEnv { self.neon_distrib_dir.join("safekeeper") } + pub fn storage_broker_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("storage_broker") + } + pub fn pg_data_dirs_path(&self) -> PathBuf { self.base_data_dir.join("pgdatadirs").join("tenants") } @@ -437,7 +404,7 @@ impl LocalEnv { } } - fs::create_dir(&base_path)?; + fs::create_dir(base_path)?; // generate keys for jwt // openssl genrsa -out private_key.pem 2048 @@ -446,7 +413,7 @@ impl LocalEnv { private_key_path = base_path.join("auth_private_key.pem"); let keygen_output = Command::new("openssl") .arg("genrsa") - .args(&["-out", private_key_path.to_str().unwrap()]) + .args(["-out", private_key_path.to_str().unwrap()]) .arg("2048") .stdout(Stdio::null()) .output() @@ -463,10 +430,10 @@ impl LocalEnv { // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem let keygen_output = Command::new("openssl") .arg("rsa") - .args(&["-in", private_key_path.to_str().unwrap()]) + .args(["-in", private_key_path.to_str().unwrap()]) .arg("-pubout") - .args(&["-outform", "PEM"]) - .args(&["-out", public_key_path.to_str().unwrap()]) + .args(["-outform", "PEM"]) + .args(["-out", public_key_path.to_str().unwrap()]) .stdout(Stdio::null()) .output() .context("failed to generate auth private key")?; @@ -511,8 +478,8 @@ mod tests { "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" ); - let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']"; - let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']"; + let string_to_replace = "listen_addr = '127.0.0.1:50051'"; + let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); assert!( spoiled_url_toml.contains(spoiled_url_str), diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index d845c9d7e9..68e94b2fdc 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,9 +1,10 @@ +use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Write}; use std::num::NonZeroU64; -use std::path::{Path, PathBuf}; -use std::process::Child; +use std::path::PathBuf; +use std::process::{Child, Command}; use std::{io, result}; use anyhow::{bail, ensure, Context}; @@ -96,13 +97,8 @@ impl PageServerNode { } } - pub fn initialize( - &self, - create_tenant: Option, - initial_timeline_id: Option, - config_overrides: &[&str], - pg_version: u32, - ) -> anyhow::Result { + // pageserver conf overrides defined by neon_local configuration. + fn neon_local_overrides(&self) -> Vec { let id = format!("id={}", self.env.pageserver.id); // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. 
let pg_distrib_dir_param = format!( @@ -117,44 +113,54 @@ impl PageServerNode { ); let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr); - let broker_endpoints_param = format!( - "broker_endpoints=[{}]", - self.env - .etcd_broker - .broker_endpoints - .iter() - .map(|url| format!("'{url}'")) - .collect::>() - .join(",") - ); - let broker_etcd_prefix_param = self - .env - .etcd_broker - .broker_etcd_prefix - .as_ref() - .map(|prefix| format!("broker_etcd_prefix='{prefix}'")); + let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut init_config_overrides = config_overrides.to_vec(); - init_config_overrides.push(&id); - init_config_overrides.push(&pg_distrib_dir_param); - init_config_overrides.push(&authg_type_param); - init_config_overrides.push(&listen_http_addr_param); - init_config_overrides.push(&listen_pg_addr_param); - init_config_overrides.push(&broker_endpoints_param); - - if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() { - init_config_overrides.push(broker_etcd_prefix_param); - } + let mut overrides = vec![ + id, + pg_distrib_dir_param, + authg_type_param, + listen_http_addr_param, + listen_pg_addr_param, + broker_endpoint_param, + ]; if self.env.pageserver.auth_type != AuthType::Trust { - init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'"); + overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned()); } + overrides + } + /// Initializes a pageserver node by creating its config with the overrides provided, + /// and creating an initial tenant and timeline afterwards. + pub fn initialize( + &self, + create_tenant: Option, + initial_timeline_id: Option, + config_overrides: &[&str], + pg_version: u32, + ) -> anyhow::Result { + // First, run `pageserver --init` and wait for it to write a config into FS and exit. + self.pageserver_init(config_overrides).with_context(|| { + format!( + "Failed to run init for pageserver node {}", + self.env.pageserver.id, + ) + })?; + + // Then, briefly start it fully to run HTTP commands on it, + // to create initial tenant and timeline. + // We disable the remote storage, since we stop pageserver right after the timeline creation, + // hence most of the uploads will either aborted or not started: no point to start them at all. + let disabled_remote_storage_override = "remote_storage={}"; let mut pageserver_process = self - .start_node(&init_config_overrides, &self.env.base_data_dir, true) + .start_node( + &[disabled_remote_storage_override], + // Previous overrides will be taken from the config created before, don't overwrite them. 
+ false, + ) .with_context(|| { format!( - "Failed to start a process for pageserver {}", + "Failed to start a process for pageserver node {}", self.env.pageserver.id, ) })?; @@ -215,52 +221,73 @@ impl PageServerNode { } pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, &self.repo_path(), false) + self.start_node(config_overrides, false) } - fn start_node( - &self, - config_overrides: &[&str], - datadir: &Path, - update_config: bool, - ) -> anyhow::Result { - print!( - "Starting pageserver at '{}' in '{}'", + fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + let datadir = self.repo_path(); + let node_id = self.env.pageserver.id; + println!( + "Initializing pageserver node {} at '{}' in {:?}", + node_id, self.pg_connection_config.raw_address(), - datadir.display() + datadir ); io::stdout().flush()?; - let mut args = vec![ - "-D", - datadir.to_str().with_context(|| { - format!("Datadir path {datadir:?} cannot be represented as a unicode string") - })?, - ]; + let datadir_path_str = datadir.to_str().with_context(|| { + format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") + })?; + let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); + args.push(Cow::Borrowed("--init")); + let init_output = Command::new(self.env.pageserver_bin()) + .args(args.iter().map(Cow::as_ref)) + .envs(self.pageserver_env_variables()?) + .output() + .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; + + anyhow::ensure!( + init_output.status.success(), + "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", + node_id, + String::from_utf8_lossy(&init_output.stdout), + String::from_utf8_lossy(&init_output.stderr), + ); + + Ok(()) + } + + fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result { + let mut overrides = self.neon_local_overrides(); + overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); + + let datadir = self.repo_path(); + print!( + "Starting pageserver node {} at '{}' in {:?}", + self.env.pageserver.id, + self.pg_connection_config.raw_address(), + datadir + ); + io::stdout().flush()?; + + let datadir_path_str = datadir.to_str().with_context(|| { + format!( + "Cannot start pageserver node {} in path that has no string representation: {:?}", + self.env.pageserver.id, datadir, + ) + })?; + let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); if update_config { - args.push("--update-config"); + args.push(Cow::Borrowed("--update-config")); } - for config_override in config_overrides { - args.extend(["-c", config_override]); - } - - let envs = if self.env.pageserver.auth_type != AuthType::Trust { - // Generate a token to connect from the pageserver to a safekeeper - let token = self - .env - .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; - vec![("ZENITH_AUTH_TOKEN".to_owned(), token)] - } else { - vec![] - }; background_process::start_process( "pageserver", - datadir, + &datadir, &self.env.pageserver_bin(), - &args, - envs, + args.iter().map(Cow::as_ref), + self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(&self.pid_file()), || match self.check_status() { Ok(()) => Ok(true), @@ -270,6 +297,35 @@ impl PageServerNode { ) } + fn pageserver_basic_args<'a>( + &self, + config_overrides: &'a [&'a str], + datadir_path_str: &'a str, + ) -> Vec> { + let mut args = vec![Cow::Borrowed("-D"), 
Cow::Borrowed(datadir_path_str)]; + + let mut overrides = self.neon_local_overrides(); + overrides.extend(config_overrides.iter().map(|&c| c.to_owned())); + for config_override in overrides { + args.push(Cow::Borrowed("-c")); + args.push(Cow::Owned(config_override)); + } + + args + } + + fn pageserver_env_variables(&self) -> anyhow::Result> { + Ok(if self.env.pageserver.auth_type != AuthType::Trust { + // Generate a token to connect from the pageserver to a safekeeper + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; + vec![("NEON_AUTH_TOKEN".to_owned(), token)] + } else { + Vec::new() + }) + } + /// /// Stop the server. /// diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 583d9709d0..4c0812a5e3 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -131,13 +131,8 @@ impl SafekeeperNode { args.push("--no-sync"); } - let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints(); - if !comma_separated_endpoints.is_empty() { - args.extend(["--broker-endpoints", &comma_separated_endpoints]); - } - if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() { - args.extend(["--broker-etcd-prefix", prefix]); - } + let broker_endpoint = format!("{}", self.env.broker.client_url()); + args.extend(["--broker-endpoint", &broker_endpoint]); let mut backup_threads = String::new(); if let Some(threads) = self.conf.backup_threads { diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 61b53dba41..b24cb80ce4 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,29 +1,6 @@ version: '3' services: - etcd: - restart: always - image: quay.io/coreos/etcd:v3.5.4 - ports: - - 2379:2379 - - 2380:2380 - environment: - # This signifficantly speeds up etcd and we anyway don't data persistency there. 
- ETCD_UNSAFE_NO_FSYNC: "1" - command: - - "etcd" - - "--auto-compaction-mode=revision" - - "--auto-compaction-retention=1" - - "--name=etcd-cluster" - - "--initial-cluster-state=new" - - "--initial-cluster-token=etcd-cluster-1" - - "--initial-cluster=etcd-cluster=http://etcd:2380" - - "--initial-advertise-peer-urls=http://etcd:2380" - - "--advertise-client-urls=http://etcd:2379" - - "--listen-client-urls=http://0.0.0.0:2379" - - "--listen-peer-urls=http://0.0.0.0:2380" - - "--quota-backend-bytes=134217728" # 128 MB - minio: restart: always image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z @@ -56,7 +33,7 @@ services: restart: always image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} environment: - - BROKER_ENDPOINT='http://etcd:2379' + - BROKER_ENDPOINT='http://storage_broker:50051' - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -68,7 +45,7 @@ services: - "-c" command: - "/usr/local/bin/pageserver -D /data/.neon/ - -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"broker_endpoint=$$BROKER_ENDPOINT\" -c \"listen_pg_addr='0.0.0.0:6400'\" -c \"listen_http_addr='0.0.0.0:9898'\" -c \"remote_storage={endpoint='http://minio:9000', @@ -76,7 +53,7 @@ services: bucket_region='eu-north-1', prefix_in_bucket='/pageserver/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper1: @@ -85,7 +62,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -99,14 +76,14 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper2: @@ -115,7 +92,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - SAFEKEEPER_ID=2 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -129,14 +106,14 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets safekeeper3: @@ -145,7 +122,7 @@ services: environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 - - BROKER_ENDPOINT=http://etcd:2379 + - BROKER_ENDPOINT=http://storage_broker:50051 - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password #- RUST_BACKTRACE=1 @@ -159,16 +136,25 @@ services: - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL --listen-http='0.0.0.0:7676' --id=$$SAFEKEEPER_ID - --broker-endpoints=$$BROKER_ENDPOINT + --broker-endpoint=$$BROKER_ENDPOINT -D /data --remote-storage=\"{endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/safekeeper/'}\"" depends_on: - - etcd + - storage_broker - minio_create_buckets + storage_broker: + restart: always + image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + ports: + - 
50051:50051 + command: + - "storage_broker" + - "--listen-addr=0.0.0.0:50051" + compute: restart: always build: diff --git a/docs/authentication.md b/docs/authentication.md index c5c4f02833..e22d7b700f 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -2,7 +2,7 @@ ### Overview We use JWT tokens in communication between almost all components (compute, pageserver, safekeeper, CLI) regardless of the protocol used (HTTP/PostgreSQL). -Etcd currently has no authentication. +storage_broker currently has no authentication. Authentication is optional and is disabled by default for easier debugging. It is used in some tests, though. Note that we do not cover authentication with `pg.neon.tech` here. @@ -65,7 +65,7 @@ There is no administrative API except those provided by PostgreSQL. #### Outgoing connections Compute connects to Pageserver for getting pages. -The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$ZENITH_AUTH_TOKEN@localhost:15028`. +The connection string is configured by the `neon.pageserver_connstring` PostgreSQL GUC, e.g. `postgresql://no_user:$NEON_AUTH_TOKEN@localhost:15028`. The environment variable inside the connection string is substituted with the JWT token. @@ -77,14 +77,14 @@ If the GUC is unset, no token is passed. Note that both tokens can be (and typically are) the same; the scope is the tenant and the token is usually passed through the -`$ZENITH_AUTH_TOKEN` environment variable. +`$NEON_AUTH_TOKEN` environment variable. ### Pageserver #### Overview Pageserver keeps track of multiple tenants, each having multiple timelines. For each timeline, it connects to the corresponding Safekeeper. Information about "corresponding Safekeeper" is published by Safekeepers -in the Etcd, but they do not publish access tokens, otherwise what is +in the storage_broker, but they do not publish access tokens, otherwise what is the point of authentication. Pageserver keeps a connection to some set of Safekeepers, which @@ -114,7 +114,7 @@ either of three values: Pageserver makes a connection to a Safekeeper for each active timeline. As Pageserver may want to access any timeline it has on the disk, it is given a blanket JWT token to access any data on any Safekeeper. -This token is passed through an environment variable called `ZENITH_AUTH_TOKEN` +This token is passed through an environment variable called `NEON_AUTH_TOKEN` (non-configurable as of writing this text). A better way _may be_ to store JWT token for each timeline next to it, diff --git a/docs/docker.md b/docs/docker.md index 42f0048e6f..d264a1a748 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -23,9 +23,9 @@ We build all images after a successful `release` tests run and push automaticall You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. -- etcd x 1 - pageserver x 1 - safekeeper x 3 +- storage_broker x 1 - compute x 1 - MinIO x 1 # This is Amazon S3 compatible object storage @@ -41,7 +41,7 @@ $ cd docker-compose/docker-compose.yml $ docker-compose down # remove the conainers if exists $ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver -Creating dockercompose_etcd3_1 ... +Creating docker-compose_storage_broker_1 ... done (...omit...) 
``` diff --git a/docs/settings.md b/docs/settings.md index 878681fce1..58d32157a3 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -10,7 +10,6 @@ the values in the config file, if any are specified for the same key and get int ```toml # Initial configuration file created by 'pageserver --init' - listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' @@ -25,13 +24,12 @@ max_file_descriptors = '100' # initial superuser role name to use when creating a new tenant initial_superuser_name = 'cloud_admin' -broker_etcd_prefix = 'neon' -broker_endpoints = ['some://etcd'] +broker_endpoint = 'http://127.0.0.1:50051' # [remote_storage] ``` -The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user, +The config above shows default values for all basic pageserver settings, besides `broker_endpoint`: that one has to be set by the user, see the corresponding section below. Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank. Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start. @@ -50,16 +48,10 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage= Note that TOML distinguishes between strings and integers, the former require single or double quotes around them. -#### broker_endpoints +#### broker_endpoint -A list of endpoints (etcd currently) to connect and pull the information from. -Mandatory, does not have a default, since requires etcd to be started as a separate process, -and its connection url should be specified separately. - -#### broker_etcd_prefix - -A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster. -Default is `neon`. +A storage broker endpoint to connect and pull the information from. Default is +`'http://127.0.0.1:50051'`. #### checkpoint_distance diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 309f5a6966..17e47b670c 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -45,9 +45,9 @@ and create new databases and accounts (control plane API in our case). Integration tests, written in Python using the `pytest` framework. -`/vendor/postgres-v14`: +`/vendor/postgres-v14` and `/vendor/postgres-v15`: -PostgreSQL source tree, with the modifications needed for Neon. +PostgreSQL source tree per version, with the modifications needed for Neon. `/pgxn/neon`: diff --git a/libs/etcd_broker/Cargo.toml b/libs/etcd_broker/Cargo.toml deleted file mode 100644 index b18dcbe5a3..0000000000 --- a/libs/etcd_broker/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] - name = "etcd_broker" - version = "0.1.0" - edition = "2021" - - [dependencies] - etcd-client = "0.9.0" - regex = "1.4.5" - serde = { version = "1.0", features = ["derive"] } - serde_json = "1" - serde_with = "2.0" - once_cell = "1.13.0" - - utils = { path = "../utils" } - workspace_hack = { version = "0.1", path = "../../workspace_hack" } - tokio = "1" - tracing = "0.1" - thiserror = "1" diff --git a/libs/etcd_broker/src/lib.rs b/libs/etcd_broker/src/lib.rs deleted file mode 100644 index 8f698977a9..0000000000 --- a/libs/etcd_broker/src/lib.rs +++ /dev/null @@ -1,209 +0,0 @@ -//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent). -//! Intended to connect services to each other, not to store their data. 
- -/// All broker keys, that are used when dealing with etcd. -pub mod subscription_key; -/// All broker values, possible to use when dealing with etcd. -pub mod subscription_value; - -use std::str::FromStr; - -use serde::de::DeserializeOwned; - -use subscription_key::SubscriptionKey; -use tokio::{sync::mpsc, task::JoinHandle}; -use tracing::*; - -use crate::subscription_key::SubscriptionFullKey; - -pub use etcd_client::*; - -/// Default value to use for prefixing to all etcd keys with. -/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster. -pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon"; - -/// A way to control the data retrieval from a certain subscription. -pub struct BrokerSubscription { - /// An unbounded channel to fetch the relevant etcd updates from. - pub value_updates: mpsc::UnboundedReceiver>, - key: SubscriptionKey, - /// A subscription task handle, to allow waiting on it for the task to complete. - /// Both the updates channel and the handle require `&mut`, so it's better to keep - /// both `pub` to allow using both in the same structures without borrow checker complaining. - pub watcher_handle: JoinHandle>, - watcher: Watcher, -} - -impl BrokerSubscription { - /// Cancels the subscription, stopping the data poller and waiting for it to shut down. - pub async fn cancel(mut self) -> Result<(), BrokerError> { - self.watcher.cancel().await.map_err(|e| { - BrokerError::EtcdClient( - e, - format!("Failed to cancel broker subscription, kind: {:?}", self.key), - ) - })?; - match (&mut self.watcher_handle).await { - Ok(res) => res, - Err(e) => { - if e.is_cancelled() { - // don't error on the tasks that are cancelled already - Ok(()) - } else { - Err(BrokerError::InternalError(format!( - "Panicked during broker subscription task, kind: {:?}, error: {e}", - self.key - ))) - } - } - } - } -} - -impl Drop for BrokerSubscription { - fn drop(&mut self) { - // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped, - // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task. - self.watcher_handle.abort(); - } -} - -/// An update from the etcd broker. -pub struct BrokerUpdate { - /// Etcd generation version, the bigger the more actual the data is. - pub etcd_version: i64, - /// Etcd key for the corresponding value, parsed from the broker KV. - pub key: SubscriptionFullKey, - /// Current etcd value, parsed from the broker KV. - pub value: V, -} - -#[derive(Debug, thiserror::Error)] -pub enum BrokerError { - #[error("Etcd client error: {0}. Context: {1}")] - EtcdClient(etcd_client::Error, String), - #[error("Error during parsing etcd key: {0}")] - KeyNotParsed(String), - #[error("Internal error: {0}")] - InternalError(String), -} - -/// Creates a background task to poll etcd for timeline updates from safekeepers. -/// Stops and returns `Err` on any error during etcd communication. -/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle, -/// exiting normally in such cases. -/// Etcd values are parsed as json fukes into a type, specified in the generic patameter. 
-pub async fn subscribe_for_json_values( - client: &mut Client, - key: SubscriptionKey, -) -> Result, BrokerError> -where - V: DeserializeOwned + Send + 'static, -{ - subscribe_for_values(client, key, |_, value_str| { - match serde_json::from_str::(value_str) { - Ok(value) => Some(value), - Err(e) => { - error!("Failed to parse value str '{value_str}': {e}"); - None - } - } - }) - .await -} - -/// Same as [`subscribe_for_json_values`], but allows to specify a custom parser of a etcd value string. -pub async fn subscribe_for_values( - client: &mut Client, - key: SubscriptionKey, - value_parser: P, -) -> Result, BrokerError> -where - V: Send + 'static, - P: Fn(SubscriptionFullKey, &str) -> Option + Send + 'static, -{ - info!("Subscribing to broker value updates, key: {key:?}"); - let subscription_key = key.clone(); - - let (watcher, mut stream) = client - .watch(key.watch_key(), Some(WatchOptions::new().with_prefix())) - .await - .map_err(|e| { - BrokerError::EtcdClient( - e, - format!("Failed to init the watch for subscription {key:?}"), - ) - })?; - - let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel(); - let watcher_handle = tokio::spawn(async move { - while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!( - "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind - )))? { - if resp.canceled() { - info!("Watch for timeline updates subscription was canceled, exiting"); - break; - } - - let events = resp.events(); - debug!("Processing {} events", events.len()); - - for event in events { - if EventType::Put == event.event_type() { - if let Some(new_etcd_kv) = event.kv() { - match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) { - Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate { - etcd_version: new_etcd_kv.version(), - key, - value, - }) { - info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}"); - break; - }, - Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"), - Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"), - Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"), - }; - } - } - } - } - - Ok(()) - }.instrument(info_span!("etcd_broker"))); - - Ok(BrokerSubscription { - key: subscription_key, - value_updates: value_updates_receiver, - watcher_handle, - watcher, - }) -} - -fn parse_etcd_kv( - kv: &KeyValue, - value_parser: &P, - cluster_prefix: &str, -) -> Result, BrokerError> -where - P: Fn(SubscriptionFullKey, &str) -> Option, -{ - let key_str = kv.key_str().map_err(|e| { - BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string()) - })?; - let value_str = kv.value_str().map_err(|e| { - BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string()) - })?; - - if !key_str.starts_with(cluster_prefix) { - return Err(BrokerError::KeyNotParsed(format!( - "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}" - ))); - } - - let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| { - BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}")) - })?; - - Ok(value_parser(key, value_str).map(|value| (key, value))) -} diff --git a/libs/etcd_broker/src/subscription_key.rs b/libs/etcd_broker/src/subscription_key.rs deleted file mode 100644 index a11d2ab106..0000000000 --- a/libs/etcd_broker/src/subscription_key.rs 
+++ /dev/null @@ -1,310 +0,0 @@ -//! Etcd broker keys, used in the project and shared between instances. -//! The keys are split into two categories: -//! -//! * [`SubscriptionFullKey`] full key format: `/////` -//! Always returned from etcd in this form, always start with the user key provided. -//! -//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available. -//! Full key always starts with the user input one, due to etcd subscription properties. - -use std::{fmt::Display, str::FromStr}; - -use once_cell::sync::Lazy; -use regex::{Captures, Regex}; -use utils::id::{NodeId, TenantId, TenantTimelineId}; - -/// The subscription kind to the timeline updates from safekeeper. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct SubscriptionKey { - /// Generic cluster prefix, allowing to use the same etcd instance by multiple logic groups. - pub cluster_prefix: String, - /// The subscription kind. - pub kind: SubscriptionKind, -} - -/// All currently possible key kinds of a etcd broker subscription. -/// Etcd works so, that every key that starts with the subbscription key given is considered matching and -/// returned as part of the subscrption. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SubscriptionKind { - /// Get every update in etcd. - All, - /// Get etcd updates for any timeiline of a certain tenant, affected by any operation from any node kind. - TenantTimelines(TenantId), - /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind. - Timeline(TenantTimelineId), - /// Get etcd timeline updates, specific to a certain node kind. - Node(TenantTimelineId, NodeKind), - /// Get etcd timeline updates for a certain operation on specific nodes. - Operation(TenantTimelineId, NodeKind, OperationKind), -} - -/// All kinds of nodes, able to write into etcd. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum NodeKind { - Safekeeper, - Pageserver, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum OperationKind { - Safekeeper(SkOperationKind), -} - -/// Current operations, running inside the safekeeper node. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SkOperationKind { - TimelineInfo, - WalBackup, -} - -static SUBSCRIPTION_FULL_KEY_REGEX: Lazy = Lazy::new(|| { - Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$") - .expect("wrong subscription full etcd key regex") -}); - -/// Full key, received from etcd during any of the component's work. -/// No other etcd keys are considered during system's work. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct SubscriptionFullKey { - pub id: TenantTimelineId, - pub node_kind: NodeKind, - pub operation: OperationKind, - pub node_id: NodeId, -} - -impl SubscriptionKey { - /// Subscribes for all etcd updates. - pub fn all(cluster_prefix: String) -> Self { - SubscriptionKey { - cluster_prefix, - kind: SubscriptionKind::All, - } - } - - /// Subscribes to a given timeline info updates from safekeepers. - pub fn sk_timeline_info(cluster_prefix: String, timeline: TenantTimelineId) -> Self { - Self { - cluster_prefix, - kind: SubscriptionKind::Operation( - timeline, - NodeKind::Safekeeper, - OperationKind::Safekeeper(SkOperationKind::TimelineInfo), - ), - } - } - - /// Subscribes to all timeine updates during specific operations, running on the corresponding nodes. 
- pub fn operation( - cluster_prefix: String, - timeline: TenantTimelineId, - node_kind: NodeKind, - operation: OperationKind, - ) -> Self { - Self { - cluster_prefix, - kind: SubscriptionKind::Operation(timeline, node_kind, operation), - } - } - - /// Etcd key to use for watching a certain timeline updates from safekeepers. - pub fn watch_key(&self) -> String { - let cluster_prefix = &self.cluster_prefix; - match self.kind { - SubscriptionKind::All => cluster_prefix.to_string(), - SubscriptionKind::TenantTimelines(tenant_id) => { - format!("{cluster_prefix}/{tenant_id}") - } - SubscriptionKind::Timeline(id) => { - format!("{cluster_prefix}/{id}") - } - SubscriptionKind::Node(id, node_kind) => { - format!("{cluster_prefix}/{id}/{node_kind}") - } - SubscriptionKind::Operation(id, node_kind, operation_kind) => { - format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}") - } - } - } -} - -impl Display for OperationKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - OperationKind::Safekeeper(o) => o.fmt(f), - } - } -} - -impl FromStr for OperationKind { - type Err = String; - - fn from_str(operation_kind_str: &str) -> Result { - match operation_kind_str { - "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)), - "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)), - _ => Err(format!("Unknown operation kind: {operation_kind_str}")), - } - } -} - -impl Display for SubscriptionFullKey { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let Self { - id, - node_kind, - operation, - node_id, - } = self; - write!(f, "{id}/{node_kind}/{operation}/{node_id}") - } -} - -impl FromStr for SubscriptionFullKey { - type Err = String; - - fn from_str(subscription_kind_str: &str) -> Result { - let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) { - Some(captures) => captures, - None => { - return Err(format!( - "Subscription kind str does not match a subscription full key regex {}", - SUBSCRIPTION_FULL_KEY_REGEX.as_str() - )); - } - }; - - Ok(Self { - id: TenantTimelineId::new( - parse_capture(&key_captures, 1)?, - parse_capture(&key_captures, 2)?, - ), - node_kind: parse_capture(&key_captures, 3)?, - operation: parse_capture(&key_captures, 4)?, - node_id: NodeId(parse_capture(&key_captures, 5)?), - }) - } -} - -fn parse_capture(caps: &Captures, index: usize) -> Result -where - T: FromStr, - ::Err: Display, -{ - let capture_match = caps - .get(index) - .ok_or_else(|| format!("Failed to get capture match at index {index}"))? 
- .as_str(); - capture_match.parse().map_err(|e| { - format!( - "Failed to parse {} from {capture_match}: {e}", - std::any::type_name::() - ) - }) -} - -impl Display for NodeKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Safekeeper => write!(f, "safekeeper"), - Self::Pageserver => write!(f, "pageserver"), - } - } -} - -impl FromStr for NodeKind { - type Err = String; - - fn from_str(node_kind_str: &str) -> Result { - match node_kind_str { - "safekeeper" => Ok(Self::Safekeeper), - "pageserver" => Ok(Self::Pageserver), - _ => Err(format!("Invalid node kind: {node_kind_str}")), - } - } -} - -impl Display for SkOperationKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::TimelineInfo => write!(f, "timeline_info"), - Self::WalBackup => write!(f, "wal_backup"), - } - } -} - -impl FromStr for SkOperationKind { - type Err = String; - - fn from_str(operation_str: &str) -> Result { - match operation_str { - "timeline_info" => Ok(Self::TimelineInfo), - "wal_backup" => Ok(Self::WalBackup), - _ => Err(format!("Invalid operation: {operation_str}")), - } - } -} - -#[cfg(test)] -mod tests { - use utils::id::TimelineId; - - use super::*; - - #[test] - fn full_cluster_key_parsing() { - let prefix = "neon"; - let node_kind = NodeKind::Safekeeper; - let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup); - let tenant_id = TenantId::generate(); - let timeline_id = TimelineId::generate(); - let id = TenantTimelineId::new(tenant_id, timeline_id); - let node_id = NodeId(1); - - let timeline_subscription_keys = [ - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::All, - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::TenantTimelines(tenant_id), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Timeline(id), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Node(id, node_kind), - }, - SubscriptionKey { - cluster_prefix: prefix.to_string(), - kind: SubscriptionKind::Operation(id, node_kind, operation_kind), - }, - ]; - - let full_key_string = format!( - "{}/{node_id}", - timeline_subscription_keys.last().unwrap().watch_key() - ); - - for key in timeline_subscription_keys { - assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match"); - } - - let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| { - panic!("Failed to parse {full_key_string} as a subscription full key: {e}") - }); - - assert_eq!( - full_key, - SubscriptionFullKey { - id, - node_kind, - operation: operation_kind, - node_id - } - ) - } -} diff --git a/libs/etcd_broker/src/subscription_value.rs b/libs/etcd_broker/src/subscription_value.rs deleted file mode 100644 index 60a5411926..0000000000 --- a/libs/etcd_broker/src/subscription_value.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! Module for the values to put into etcd. - -use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, DisplayFromStr}; -use utils::lsn::Lsn; - -/// Data about safekeeper's timeline. Fields made optional for easy migrations. -#[serde_as] -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct SkTimelineInfo { - /// Term of the last entry. - pub last_log_term: Option, - /// LSN of the last record. 
- #[serde_as(as = "Option")] - #[serde(default)] - pub flush_lsn: Option, - /// Up to which LSN safekeeper regards its WAL as committed. - #[serde_as(as = "Option")] - #[serde(default)] - pub commit_lsn: Option, - /// LSN up to which safekeeper has backed WAL. - #[serde_as(as = "Option")] - #[serde(default)] - pub backup_lsn: Option, - /// LSN of last checkpoint uploaded by pageserver. - #[serde_as(as = "Option")] - #[serde(default)] - pub remote_consistent_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub peer_horizon_lsn: Option, - #[serde_as(as = "Option")] - #[serde(default)] - pub local_start_lsn: Option, - /// A connection string to use for WAL receiving. - #[serde(default)] - pub safekeeper_connstr: Option, -} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9d1ad8a022..d954e5d21f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -117,6 +117,7 @@ impl TenantCreateRequest { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TenantConfigRequest { + #[serde_as(as = "DisplayFromStr")] pub tenant_id: TenantId, #[serde(default)] #[serde_as(as = "Option")] @@ -162,6 +163,8 @@ pub struct TenantInfo { #[serde_as(as = "DisplayFromStr")] pub id: TenantId, pub state: TenantState, + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub has_in_progress_downloads: Option, } @@ -190,9 +193,12 @@ pub struct TimelineInfo { #[serde_as(as = "DisplayFromStr")] pub remote_consistent_lsn: Lsn, pub current_logical_size: Option, // is None when timeline is Unloaded + /// Sum of the size of all layer files. + /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, - pub current_physical_size_non_incremental: Option, + + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, #[serde_as(as = "Option")] @@ -201,32 +207,23 @@ pub struct TimelineInfo { pub last_received_msg_ts: Option, pub pg_version: u32, - pub awaits_download: bool, - pub state: TimelineState, - - // Some of the above fields are duplicated in 'local' and 'remote', for backwards- - // compatility with older clients. 
- pub local: LocalTimelineInfo, - pub remote: RemoteTimelineInfo, } -#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct LocalTimelineInfo { - #[serde_as(as = "Option")] - pub ancestor_timeline_id: Option, - #[serde_as(as = "Option")] - pub ancestor_lsn: Option, - pub current_logical_size: Option, // is None when timeline is Unloaded - pub current_physical_size: Option, // is None when timeline is Unloaded +pub struct DownloadRemoteLayersTaskInfo { + pub task_id: String, + pub state: DownloadRemoteLayersTaskState, + pub total_layer_count: u64, // stable once `completed` + pub successful_download_count: u64, // stable once `completed` + pub failed_download_count: u64, // stable once `completed` } -#[serde_as] #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct RemoteTimelineInfo { - #[serde_as(as = "Option")] - pub remote_consistent_lsn: Option, +pub enum DownloadRemoteLayersTaskState { + Running, + Completed, + ShutDown, } pub type ConfigureFailpointsRequest = Vec; @@ -326,7 +323,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -336,7 +333,7 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -346,7 +343,7 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); @@ -357,7 +354,7 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(if req.latest { 1 } else { 0 }); + bytes.put_u8(u8::from(req.latest)); bytes.put_u64(req.lsn.0); bytes.put_u32(req.dbnode); } diff --git a/libs/postgres_connection/Cargo.toml b/libs/postgres_connection/Cargo.toml index 314f3c6f1c..1924b260fa 100644 --- a/libs/postgres_connection/Cargo.toml +++ b/libs/postgres_connection/Cargo.toml @@ -8,8 +8,8 @@ edition = "2021" [dependencies] anyhow = "1.0" itertools = "0.10.3" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } url = "2.2.2" workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 01ff6ab60e..59eec3de32 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -21,7 +21,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [dev-dependencies] env_logger = "0.9" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } wal_craft = { path = "wal_craft" } [build-dependencies] diff --git 
a/libs/postgres_ffi/src/nonrelfile_utils.rs b/libs/postgres_ffi/src/nonrelfile_utils.rs index 01e5554b8a..5acf90be70 100644 --- a/libs/postgres_ffi/src/nonrelfile_utils.rs +++ b/libs/postgres_ffi/src/nonrelfile_utils.rs @@ -14,8 +14,8 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { status ); - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; @@ -25,13 +25,13 @@ pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) { } pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 { - let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32) - / pg_constants::CLOG_XACTS_PER_BYTE) as usize; + let byteno: usize = + ((xid % pg_constants::CLOG_XACTS_PER_PAGE) / pg_constants::CLOG_XACTS_PER_BYTE) as usize; let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8; - ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8 + (page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK } // See CLOGPagePrecedes in clog.c diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 953723a8f0..272c4d6dcc 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -333,7 +333,7 @@ impl CheckPoint { // We need this segment to start compute node. // pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result { - let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize); + let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE); let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); let hdr = XLogLongPageHeaderData { @@ -574,7 +574,7 @@ mod tests { // Rename file to partial to actually find last valid lsn, then rename it back. 
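// Hedged worked example of the CLOG slot arithmetic changed above, with the usual
// Postgres constants assumed (CLOG_XACTS_PER_BYTE = 4, CLOG_BITS_PER_XACT = 2,
// CLOG_XACTS_PER_PAGE = BLCKSZ * 4 = 32768); the helper name is illustrative.
fn clog_slot_for(xid: u32) -> (usize, u8) {
    const CLOG_XACTS_PER_PAGE: u32 = 32768;
    const CLOG_XACTS_PER_BYTE: u32 = 4;
    const CLOG_BITS_PER_XACT: u32 = 2;
    let byteno = ((xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE) as usize;
    let bshift = ((xid % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT) as u8;
    (byteno, bshift)
}
// clog_slot_for(10) == (2, 4): xid 10's two status bits live at bits 4..=5 of byte 2,
// which is exactly the slot transaction_id_get_status masks with CLOG_XACT_BITMASK.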
fs::rename( - cfg.wal_dir().join(&last_segment), + cfg.wal_dir().join(last_segment), cfg.wal_dir().join(format!("{}.partial", last_segment)), ) .unwrap(); diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml index 4c35c5a650..dd9f82a87a 100644 --- a/libs/postgres_ffi/wal_craft/Cargo.toml +++ b/libs/postgres_ffi/wal_craft/Cargo.toml @@ -11,7 +11,7 @@ clap = "4.0" env_logger = "0.9" log = "0.4" once_cell = "1.13.0" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres_ffi = { path = "../" } tempfile = "3.2" workspace_hack = { version = "0.1", path = "../../../workspace_hack" } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index c4404b37ba..969befc8e7 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -1,7 +1,6 @@ use anyhow::*; use core::time::Duration; use log::*; -use once_cell::sync::Lazy; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -26,15 +25,13 @@ pub struct PostgresServer { client_config: postgres::Config, } -pub static REQUIRED_POSTGRES_CONFIG: Lazy> = Lazy::new(|| { - vec![ - "wal_keep_size=50MB", // Ensure old WAL is not removed - "shared_preload_libraries=neon", // can only be loaded at startup - // Disable background processes as much as possible - "wal_writer_delay=10s", - "autovacuum=off", - ] -}); +pub static REQUIRED_POSTGRES_CONFIG: [&str; 4] = [ + "wal_keep_size=50MB", // Ensure old WAL is not removed + "shared_preload_libraries=neon", // can only be loaded at startup + // Disable background processes as much as possible + "wal_writer_delay=10s", + "autovacuum=off", +]; impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { @@ -84,7 +81,7 @@ impl Conf { .new_pg_command("initdb")? .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-U", "postgres", "--no-instructions", "--no-sync"]) + .args(["-U", "postgres", "--no-instructions", "--no-sync"]) .output()?; debug!("initdb output: {:?}", output); ensure!( @@ -108,12 +105,12 @@ impl Conf { let unix_socket_dir_path = unix_socket_dir.path().to_owned(); let server_process = self .new_pg_command("postgres")? - .args(&["-c", "listen_addresses="]) + .args(["-c", "listen_addresses="]) .arg("-k") .arg(unix_socket_dir_path.as_os_str()) .arg("-D") .arg(self.datadir.as_os_str()) - .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output + .args(["-c", "logging_collector=on"]) // stderr will mess up with tests output .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg])) .stderr(Stdio::from(log_file)) .spawn()?; @@ -145,7 +142,7 @@ impl Conf { ); let output = self .new_pg_command("pg_waldump")? 
- .args(&[ + .args([ &first_segment_file.as_os_str(), &last_segment_file.as_os_str(), ]) diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 4d48e431b4..76d8fbf28d 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,7 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = "1.0.1" pin-project-lite = "0.2.7" -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } rand = "0.8.3" serde = { version = "1.0", features = ["derive"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 2e311dd6e3..278f044c15 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -463,7 +463,10 @@ pub enum BeMessage<'a> { EncryptionResponse(bool), NoData, ParameterDescription, - ParameterStatus(BeParameterStatusMessage<'a>), + ParameterStatus { + name: &'a [u8], + value: &'a [u8], + }, ParseComplete, ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), @@ -472,6 +475,28 @@ pub enum BeMessage<'a> { KeepAlive(WalSndKeepAlive), } +/// Common shorthands. +impl<'a> BeMessage<'a> { + /// A [`BeMessage::ParameterStatus`] holding the client encoding, i.e. UTF-8. + /// This is a sensible default, given that: + /// * rust strings only support this encoding out of the box. + /// * tokio-postgres, postgres-jdbc (and probably more) mandate it. + /// + /// TODO: do we need to report `server_encoding` as well? + pub const CLIENT_ENCODING: Self = Self::ParameterStatus { + name: b"client_encoding", + value: b"UTF8", + }; + + /// Build a [`BeMessage::ParameterStatus`] holding the server version. + pub fn server_version(version: &'a str) -> Self { + Self::ParameterStatus { + name: b"server_version", + value: version.as_bytes(), + } + } +} + #[derive(Debug)] pub enum BeAuthenticationSaslMessage<'a> { Methods(&'a [&'a str]), @@ -485,12 +510,6 @@ pub enum BeParameterStatusMessage<'a> { ServerVersion(&'a str), } -impl BeParameterStatusMessage<'static> { - pub fn encoding() -> BeMessage<'static> { - BeMessage::ParameterStatus(Self::Encoding("UTF8")) - } -} - // One row description in RowDescription packet. #[derive(Debug)] pub struct RowDescriptor<'a> { @@ -587,14 +606,15 @@ fn write_body(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R { } /// Safe write of s into buf as cstring (String in the protocol). -fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { - if s.contains(&0) { +fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), io::Error> { + let bytes = s.as_ref(); + if bytes.contains(&0) { return Err(io::Error::new( io::ErrorKind::InvalidInput, "string contains embedded null", )); } - buf.put_slice(s); + buf.put_slice(bytes); buf.put_u8(0); Ok(()) } @@ -644,7 +664,7 @@ impl<'a> BeMessage<'a> { Methods(methods) => { buf.put_i32(10); // Specifies that SASL auth method is used. 
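// Hedged usage sketch for the new ParameterStatus helpers above: the startup greeting
// can be expressed as plain BeMessage values instead of the removed
// BeParameterStatusMessage wrapper. The version string and function name are illustrative.
fn startup_parameter_messages(version: &str) -> [BeMessage<'_>; 2] {
    [
        BeMessage::CLIENT_ENCODING,         // client_encoding = UTF8
        BeMessage::server_version(version), // server_version reported to the client
    ]
}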
for method in methods.iter() { - write_cstr(method.as_bytes(), buf)?; + write_cstr(method, buf)?; } buf.put_u8(0); // zero terminator for the list } @@ -759,7 +779,7 @@ impl<'a> BeMessage<'a> { buf.put_slice(b"CXX000\0"); buf.put_u8(b'M'); // the message - write_cstr(error_msg.as_bytes(), buf)?; + write_cstr(error_msg, buf)?; buf.put_u8(0); // terminator Ok::<_, io::Error>(()) @@ -799,24 +819,12 @@ impl<'a> BeMessage<'a> { buf.put_u8(response); } - BeMessage::ParameterStatus(param) => { - use std::io::{IoSlice, Write}; - use BeParameterStatusMessage::*; - - let [name, value] = match param { - Encoding(name) => [b"client_encoding", name.as_bytes()], - ServerVersion(version) => [b"server_version", version.as_bytes()], - }; - - // Parameter names and values are passed as null-terminated strings - let iov = &mut [name, b"\0", value, b"\0"].map(IoSlice::new); - let mut buffer = [0u8; 64]; // this should be enough - let cnt = buffer.as_mut().write_vectored(iov).unwrap(); - + BeMessage::ParameterStatus { name, value } => { buf.put_u8(b'S'); write_body(buf, |buf| { - buf.put_slice(&buffer[..cnt]); - }); + write_cstr(name, buf)?; + write_cstr(value, buf) + })?; } BeMessage::ParameterDescription => { @@ -873,7 +881,7 @@ impl<'a> BeMessage<'a> { buf.put_u8(b'k'); buf.put_u64(req.sent_ptr); buf.put_i64(req.timestamp); - buf.put_u8(if req.request_reply { 1 } else { 0 }); + buf.put_u8(u8::from(req.request_reply)); }); } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 4bdd2b9608..1091a8bd5c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -7,10 +7,11 @@ //! mod local_fs; mod s3_bucket; +mod simulate_failures; use std::{ collections::HashMap, - fmt::{Debug, Display}, + fmt::Debug, num::{NonZeroU32, NonZeroUsize}, ops::Deref, path::{Path, PathBuf}, @@ -24,7 +25,7 @@ use tokio::io; use toml_edit::Item; use tracing::info; -pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket}; +pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper}; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -41,44 +42,27 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; -#[derive(Clone, PartialEq, Eq)] -pub struct RemoteObjectId(String); +/// Path on the remote storage, relative to some inner prefix. +/// The prefix is an implementation detail, that allows representing local paths +/// as the remote ones, stripping the local storage prefix away. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct RemotePath(PathBuf); + +impl RemotePath { + pub fn new(relative_path: &Path) -> anyhow::Result { + anyhow::ensure!( + relative_path.is_relative(), + "Path {relative_path:?} is not relative" + ); + Ok(Self(relative_path.to_path_buf())) + } + + pub fn with_base(&self, base_path: &Path) -> PathBuf { + base_path.join(&self.0) + } -/// -/// A key that refers to an object in remote storage. It works much like a Path, -/// but it's a separate datatype so that you don't accidentally mix local paths -/// and remote keys. -/// -impl RemoteObjectId { - // Needed to retrieve last component for RemoteObjectId. 
- // In other words a file name - /// Turn a/b/c or a/b/c/ into c pub fn object_name(&self) -> Option<&str> { - // corner case, char::to_string is not const, thats why this is more verbose than it needs to be - // see https://github.com/rust-lang/rust/issues/88674 - if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR { - return None; - } - - if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1) - } else { - self.0 - .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR) - .map(|(_, last)| last) - } - } -} - -impl Debug for RemoteObjectId { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - Debug::fmt(&self.0, fmt) - } -} - -impl Display for RemoteObjectId { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Display::fmt(&self.0, fmt) + self.0.file_name().and_then(|os_str| os_str.to_str()) } } @@ -87,14 +71,8 @@ impl Display for RemoteObjectId { /// providing basic CRUD operations for storage files. #[async_trait::async_trait] pub trait RemoteStorage: Send + Sync + 'static { - /// Attempts to derive the storage path out of the local path, if the latter is correct. - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result; - - /// Gets the download path of the given storage file. - fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result; - /// Lists all items the storage has right now. - async fn list(&self) -> anyhow::Result>; + async fn list(&self) -> anyhow::Result>; /// Lists all top level subdirectories for a given prefix /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id @@ -102,39 +80,34 @@ pub trait RemoteStorage: Send + Sync + 'static { /// so this method doesnt need to. async fn list_prefixes( &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result>; + prefix: Option<&RemotePath>, + ) -> Result, DownloadError>; /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. - from_size_bytes: usize, - to: &RemoteObjectId, + data_size_bytes: usize, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()>; /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &RemoteObjectId) -> Result; + async fn download(&self, from: &RemotePath) -> Result; /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result; - async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>; - - /// Downcast to LocalFs implementation. For tests. 
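// Hedged usage sketch for the new RemotePath type above (Unix-style paths assumed,
// and the concrete path strings are illustrative).
use std::path::{Path, PathBuf};

fn remote_path_example() -> anyhow::Result<()> {
    let p = RemotePath::new(Path::new("timelines/some_timeline/000000010000000000000001"))?;
    assert_eq!(p.object_name(), Some("000000010000000000000001"));
    assert_eq!(
        p.with_base(Path::new("/storage/root")),
        PathBuf::from("/storage/root/timelines/some_timeline/000000010000000000000001")
    );
    // Absolute inputs are rejected, which keeps local paths and remote paths from mixing.
    assert!(RemotePath::new(Path::new("/absolute/path")).is_err());
    Ok(())
}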
- fn as_local(&self) -> Option<&LocalFs> { - None - } + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; } pub struct Download { @@ -168,7 +141,7 @@ impl std::fmt::Display for DownloadError { write!(f, "Failed to download a remote file due to user input: {e}") } DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } } @@ -178,38 +151,43 @@ impl std::error::Error for DownloadError {} /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. #[derive(Clone)] -pub struct GenericRemoteStorage(Arc); +pub enum GenericRemoteStorage { + LocalFs(LocalFs), + AwsS3(Arc), + Unreliable(Arc), +} impl Deref for GenericRemoteStorage { type Target = dyn RemoteStorage; fn deref(&self) -> &Self::Target { - self.0.as_ref() + match self { + GenericRemoteStorage::LocalFs(local_fs) => local_fs, + GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), + GenericRemoteStorage::Unreliable(s) => s.as_ref(), + } } } impl GenericRemoteStorage { - pub fn new(storage: impl RemoteStorage) -> Self { - Self(Arc::new(storage)) - } - - pub fn from_config( - working_directory: PathBuf, - storage_config: &RemoteStorageConfig, - ) -> anyhow::Result { + pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { Ok(match &storage_config.storage { RemoteStorageKind::LocalFs(root) => { info!("Using fs root '{}' as a remote storage", root.display()); - GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?) + Self::LocalFs(LocalFs::new(root.clone())?) } RemoteStorageKind::AwsS3(s3_config) => { info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } }) } + pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self { + Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) + } + /// Takes storage object contents and its size and uploads to remote storage, /// mapping `from_path` to the corresponding remote object id in the storage. 
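// Hedged construction sketch for the enum-based GenericRemoteStorage above: a
// LocalFs-backed storage, optionally wrapped to inject simulated failures in tests.
// The function name and root path are illustrative.
use std::path::PathBuf;

fn build_test_storage(root: PathBuf) -> anyhow::Result<GenericRemoteStorage> {
    let storage = GenericRemoteStorage::LocalFs(LocalFs::new(root)?);
    // Fail the first attempt of each remote operation, then let retries through.
    Ok(GenericRemoteStorage::unreliable_wrapper(storage, 1))
}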
/// @@ -219,23 +197,12 @@ impl GenericRemoteStorage { &self, from: Box, from_size_bytes: usize, - from_path: &Path, + to: &RemotePath, ) -> anyhow::Result<()> { - let target_storage_path = self.remote_object_id(from_path).with_context(|| { - format!( - "Failed to get the storage path for source local path '{}'", - from_path.display() - ) - })?; - - self.upload(from, from_size_bytes, &target_storage_path, None) + self.upload(from, from_size_bytes, to, None) .await .with_context(|| { - format!( - "Failed to upload from '{}' to storage path '{:?}'", - from_path.display(), - target_storage_path - ) + format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") }) } @@ -244,24 +211,11 @@ impl GenericRemoteStorage { pub async fn download_storage_object( &self, byte_range: Option<(u64, Option)>, - to_path: &Path, + from: &RemotePath, ) -> Result { - let remote_object_path = self - .remote_object_id(to_path) - .with_context(|| { - format!( - "Failed to get the storage path for target local path '{}'", - to_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - match byte_range { - Some((start, end)) => { - self.download_byte_range(&remote_object_path, start, end) - .await - } - None => self.download(&remote_object_path).await, + Some((start, end)) => self.download_byte_range(from, start, end).await, + None => self.download(from).await, } } } @@ -271,23 +225,6 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); -fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> { - if prefix == path { - anyhow::bail!( - "Prefix and the path are equal, cannot strip: '{}'", - prefix.display() - ) - } else { - path.strip_prefix(prefix).with_context(|| { - format!( - "Path '{}' is not prefixed with '{}'", - path.display(), - prefix.display(), - ) - }) - } -} - /// External backup storage configuration, enough for creating a client for that storage. 
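// Hedged round-trip sketch for the upload/download helpers above; storage construction
// and the payload are illustrative, and std::io::Cursor is used because tokio
// implements AsyncRead for it.
async fn storage_round_trip(storage: &GenericRemoteStorage, to: &RemotePath) -> anyhow::Result<()> {
    let payload = b"layer file bytes".to_vec();
    let len = payload.len();
    storage
        .upload_storage_object(Box::new(std::io::Cursor::new(payload)), len, to)
        .await?;
    // None means "the whole object"; a Some((start, end)) range maps to download_byte_range.
    let _download = storage.download_storage_object(None, to).await?;
    Ok(())
}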
#[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { @@ -343,7 +280,7 @@ impl Debug for S3Config { } impl RemoteStorageConfig { - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); @@ -367,7 +304,8 @@ impl RemoteStorageConfig { .context("Failed to parse 'concurrency_limit' as a positive integer")?; let storage = match (local_path, bucket_name, bucket_region) { - (None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"), + // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled + (None, None, None) => return Ok(None), (_, Some(_), None) => { bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") } @@ -393,11 +331,11 @@ impl RemoteStorageConfig { (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), }; - Ok(RemoteStorageConfig { + Ok(Some(RemoteStorageConfig { max_concurrent_syncs, max_sync_errors, storage, - }) + })) } } @@ -431,21 +369,24 @@ mod tests { use super::*; #[test] - fn object_name() { - let k = RemoteObjectId("a/b/c".to_owned()); + fn test_object_name() { + let k = RemotePath::new(Path::new("a/b/c")).unwrap(); assert_eq!(k.object_name(), Some("c")); - let k = RemoteObjectId("a/b/c/".to_owned()); + let k = RemotePath::new(Path::new("a/b/c/")).unwrap(); assert_eq!(k.object_name(), Some("c")); - let k = RemoteObjectId("a/".to_owned()); + let k = RemotePath::new(Path::new("a/")).unwrap(); assert_eq!(k.object_name(), Some("a")); // XXX is it impossible to have an empty key? - let k = RemoteObjectId("".to_owned()); - assert_eq!(k.object_name(), None); - - let k = RemoteObjectId("/".to_owned()); + let k = RemotePath::new(Path::new("")).unwrap(); assert_eq!(k.object_name(), None); } + + #[test] + fn rempte_path_cannot_be_created_from_absolute_ones() { + let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths"); + assert_eq!(err.to_string(), "Path \"/\" is not relative"); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 2f824cc453..f1289569ae 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -5,6 +5,7 @@ //! volume is mounted to the local FS. use std::{ + borrow::Cow, future::Future, path::{Path, PathBuf}, pin::Pin, @@ -18,60 +19,33 @@ use tokio::{ use tracing::*; use utils::crashsafe::path_with_suffix_extension; -use crate::{Download, DownloadError, RemoteObjectId}; +use crate::{Download, DownloadError, RemotePath}; -use super::{strip_path_prefix, RemoteStorage, StorageMetadata}; +use super::{RemoteStorage, StorageMetadata}; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; -/// Convert a Path in the remote storage into a RemoteObjectId -fn remote_object_id_from_path(path: &Path) -> anyhow::Result { - Ok(RemoteObjectId( - path.to_str() - .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))? - .to_string(), - )) -} - +#[derive(Debug, Clone)] pub struct LocalFs { - working_directory: PathBuf, storage_root: PathBuf, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. 
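// Hedged wiring sketch for the new from_toml contract above: Ok(None) now means
// "remote storage disabled" rather than an error, so callers can thread the Option
// straight into from_config. The function name is illustrative.
fn remote_storage_from_toml(item: &toml_edit::Item) -> anyhow::Result<Option<GenericRemoteStorage>> {
    match RemoteStorageConfig::from_toml(item)? {
        Some(config) => Ok(Some(GenericRemoteStorage::from_config(&config)?)),
        // No 'local_path' and no 'bucket_name': remote storage is simply disabled.
        None => Ok(None),
    }
}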
- pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result { - if !root.exists() { - std::fs::create_dir_all(&root).with_context(|| { - format!( - "Failed to create all directories in the given root path '{}'", - root.display(), - ) + /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). + pub fn new(mut storage_root: PathBuf) -> anyhow::Result { + if !storage_root.exists() { + std::fs::create_dir_all(&storage_root).with_context(|| { + format!("Failed to create all directories in the given root path {storage_root:?}") })?; } - Ok(Self { - working_directory, - storage_root: root, - }) - } - - /// - /// Get the absolute path in the local filesystem to given remote object. - /// - /// This is public so that it can be used in tests. Should not be used elsewhere. - /// - pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { - let path = PathBuf::from(&remote_object_id.0); - if path.is_relative() { - Ok(self.storage_root.join(path)) - } else if path.starts_with(&self.storage_root) { - Ok(path) - } else { - bail!( - "Path '{}' does not belong to the current storage", - path.display() - ) + if !storage_root.is_absolute() { + storage_root = storage_root.canonicalize().with_context(|| { + format!("Failed to represent path {storage_root:?} as an absolute path") + })?; } + + Ok(Self { storage_root }) } async fn read_storage_metadata( @@ -103,45 +77,52 @@ impl LocalFs { #[async_trait::async_trait] impl RemoteStorage for LocalFs { - /// Convert a "local" path into a "remote path" - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - let path = self.storage_root.join( - strip_path_prefix(&self.working_directory, local_path) - .context("local path does not belong to this storage")?, - ); - remote_object_id_from_path(&path) - } - - fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result { - let storage_path = PathBuf::from(&remote_object_id.0); - let relative_path = strip_path_prefix(&self.storage_root, &storage_path) - .context("local path does not belong to this storage")?; - Ok(self.working_directory.join(relative_path)) - } - - async fn list(&self) -> anyhow::Result> { - get_all_files(&self.storage_root, true).await + async fn list(&self) -> anyhow::Result> { + Ok(get_all_files(&self.storage_root, true) + .await? + .into_iter() + .map(|path| { + path.strip_prefix(&self.storage_root) + .context("Failed to strip storage root prefix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ) + }) + .collect()) } async fn list_prefixes( &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result> { + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { let path = match prefix { - Some(prefix) => Path::new(&prefix.0), - None => &self.storage_root, + Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), + None => Cow::Borrowed(&self.storage_root), }; - get_all_files(path, false).await + Ok(get_all_files(path.as_ref(), false) + .await + .map_err(DownloadError::Other)? 
+ .into_iter() + .map(|path| { + path.strip_prefix(&self.storage_root) + .context("Failed to strip preifix") + .and_then(RemotePath::new) + .expect( + "We list files for storage root, hence should be able to remote the prefix", + ) + }) + .collect()) } async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, - from_size_bytes: usize, - to: &RemoteObjectId, + data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data_size_bytes: usize, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { - let target_file_path = self.resolve_in_storage(to)?; + let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; // We need this dance with sort of durable rename (without fsyncs) // to prevent partial uploads. This was really hit when pageserver shutdown @@ -162,8 +143,8 @@ impl RemoteStorage for LocalFs { })?, ); - let from_size_bytes = from_size_bytes as u64; - let mut buffer_to_read = from.take(from_size_bytes); + let from_size_bytes = data_size_bytes as u64; + let mut buffer_to_read = data.take(from_size_bytes); let bytes_read = io::copy(&mut buffer_to_read, &mut destination) .await @@ -220,27 +201,22 @@ impl RemoteStorage for LocalFs { Ok(()) } - async fn download(&self, from: &RemoteObjectId) -> Result { - let file_path = self - .resolve_in_storage(from) - .map_err(DownloadError::BadInput)?; - if file_exists(&file_path).map_err(DownloadError::BadInput)? { + async fn download(&self, from: &RemotePath) -> Result { + let target_path = from.with_base(&self.storage_root); + if file_exists(&target_path).map_err(DownloadError::BadInput)? { let source = io::BufReader::new( fs::OpenOptions::new() .read(true) - .open(&file_path) + .open(&target_path) .await .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) + format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?, ); let metadata = self - .read_storage_metadata(&file_path) + .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; Ok(Download { @@ -254,7 +230,7 @@ impl RemoteStorage for LocalFs { async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -266,20 +242,15 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let file_path = self - .resolve_in_storage(from) - .map_err(DownloadError::BadInput)?; - if file_exists(&file_path).map_err(DownloadError::BadInput)? { + let target_path = from.with_base(&self.storage_root); + if file_exists(&target_path).map_err(DownloadError::BadInput)? 
{ let mut source = io::BufReader::new( fs::OpenOptions::new() .read(true) - .open(&file_path) + .open(&target_path) .await .with_context(|| { - format!( - "Failed to open source file '{}' to use in the download", - file_path.display() - ) + format!("Failed to open source file {target_path:?} to use in the download") }) .map_err(DownloadError::Other)?, ); @@ -289,7 +260,7 @@ impl RemoteStorage for LocalFs { .context("Failed to seek to the range start in a local storage file") .map_err(DownloadError::Other)?; let metadata = self - .read_storage_metadata(&file_path) + .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; @@ -308,21 +279,14 @@ impl RemoteStorage for LocalFs { } } - async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> { - let file_path = self.resolve_in_storage(path)?; + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + let file_path = path.with_base(&self.storage_root); if file_path.exists() && file_path.is_file() { Ok(fs::remove_file(file_path).await?) } else { - bail!( - "File '{}' either does not exist or is not a file", - file_path.display() - ) + bail!("File {file_path:?} either does not exist or is not a file") } } - - fn as_local(&self) -> Option<&LocalFs> { - Some(self) - } } fn storage_metadata_path(original_path: &Path) -> PathBuf { @@ -332,7 +296,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf { fn get_all_files<'a, P>( directory_path: P, recursive: bool, -) -> Pin>> + Send + Sync + 'a>> +) -> Pin>> + Send + Sync + 'a>> where P: AsRef + Send + Sync + 'a, { @@ -346,20 +310,20 @@ where let file_type = dir_entry.file_type().await?; let entry_path = dir_entry.path(); if file_type.is_symlink() { - debug!("{:?} us a symlink, skipping", entry_path) + debug!("{entry_path:?} us a symlink, skipping") } else if file_type.is_dir() { if recursive { paths.extend(get_all_files(&entry_path, true).await?.into_iter()) } else { - paths.push(remote_object_id_from_path(&dir_entry.path())?) 
+ paths.push(entry_path) } } else { - paths.push(remote_object_id_from_path(&dir_entry.path())?); + paths.push(entry_path); } } Ok(paths) } else { - bail!("Path '{}' is not a directory", directory_path.display()) + bail!("Path {directory_path:?} is not a directory") } } else { Ok(Vec::new()) @@ -394,173 +358,6 @@ fn file_exists(file_path: &Path) -> anyhow::Result { } } -#[cfg(test)] -mod pure_tests { - use tempfile::tempdir; - - use super::*; - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root: storage_root.clone(), - }; - - let local_path = workdir - .join("timelines") - .join("some_timeline") - .join("file_name"); - let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?); - - let actual_path = PathBuf::from( - storage - .remote_object_id(&local_path) - .expect("Matching path should map to storage path normally") - .0, - ); - assert_eq!( - expected_path, - actual_path, - "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String { - match storage.remote_object_id(mismatching_path) { - Ok(wrong_path) => panic!( - "Expected path '{}' to error, but got storage path: {:?}", - mismatching_path.display(), - wrong_path, - ), - Err(e) => format!("{:?}", e), - } - } - - let workdir = tempdir()?.path().to_owned(); - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root, - }; - - let error_string = storage_path_error(&storage, &workdir); - assert!(error_string.contains("does not belong to this storage")); - assert!(error_string.contains(workdir.to_str().unwrap())); - - let mismatching_path_str = "/something/else"; - let error_message = storage_path_error(&storage, Path::new(mismatching_path_str)); - assert!( - error_message.contains(mismatching_path_str), - "Error should mention wrong path" - ); - assert!( - error_message.contains(workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!(error_message.contains("does not belong to this storage")); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: workdir.clone(), - storage_root: storage_root.clone(), - }; - - let name = "not a metadata"; - let local_path = workdir.join("timelines").join("some_timeline").join(name); - assert_eq!( - local_path, - storage - .local_path(&remote_object_id_from_path( - &storage_root.join(local_path.strip_prefix(&workdir)?) - )?) 
- .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let local_metadata_path = workdir - .join("timelines") - .join("some_timeline") - .join("metadata"); - let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?; - assert_eq!( - local_metadata_path, - storage - .local_path(&remote_metadata_path) - .expect("For a valid input, valid local path should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn local_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String { - match storage.local_path(storage_path) { - Ok(wrong_path) => panic!( - "Expected local path input {:?} to cause an error, but got file path: {:?}", - storage_path, wrong_path, - ), - Err(e) => format!("{:?}", e), - } - } - - let storage_root = PathBuf::from("somewhere").join("else"); - let storage = LocalFs { - working_directory: tempdir()?.path().to_owned(), - storage_root, - }; - - let totally_wrong_path = "wrong_wrong_wrong"; - let error_message = - local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string())); - assert!(error_message.contains(totally_wrong_path)); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let original_path = workdir - .join("timelines") - .join("some_timeline") - .join("some name"); - - let storage_root = PathBuf::from("somewhere").join("else"); - let dummy_storage = LocalFs { - working_directory: workdir, - storage_root, - }; - - let storage_path = dummy_storage.remote_object_id(&original_path)?; - let download_destination = dummy_storage.local_path(&storage_path)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } -} - #[cfg(test)] mod fs_tests { use super::*; @@ -572,7 +369,7 @@ mod fs_tests { storage: &LocalFs, #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements - remote_storage_path: &RemoteObjectId, + remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { let mut download = storage @@ -595,41 +392,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); let storage = create_storage()?; - let (file, size) = create_file_for_upload( - &storage.working_directory.join("whatever"), - "whatever_contents", - ) - .await?; - let target_path = "/somewhere/else"; - match storage - .upload( - Box::new(file), - size, - &RemoteObjectId(target_path.to_string()), - None, - ) - .await - { - Ok(()) => panic!("Should not allow storing files with wrong target path"), - Err(e) => { - let message = format!("{:?}", e); - assert!(message.contains(target_path)); - assert!(message.contains("does not belong to the current storage")); - } - } - assert!(storage.list().await?.is_empty()); - - let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; assert_eq!( storage.list().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = 
upload_dummy_file(&workdir, &storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -643,7 +415,7 @@ mod fs_tests { async fn upload_file_negatives() -> anyhow::Result<()> { let storage = create_storage()?; - let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?; + let id = RemotePath::new(Path::new("dummy"))?; let content = std::io::Cursor::new(b"12345"); // Check that you get an error if the size parameter doesn't match the actual @@ -668,16 +440,14 @@ mod fs_tests { } fn create_storage() -> anyhow::Result { - LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned()) + LocalFs::new(tempdir()?.path().to_owned()) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; assert_eq!( @@ -687,7 +457,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemoteObjectId(non_existing_path.to_string())).await { + match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -696,11 +466,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let full_range_download_contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; @@ -766,11 +534,9 @@ mod fs_tests { #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; let start = 1_000_000_000; let end = start + 1; @@ -812,11 +578,9 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None).await?; storage.delete(&upload_target).await?; assert!(storage.list().await?.is_empty()); @@ -826,7 +590,8 @@ mod fs_tests { Err(e) => { let error_string = e.to_string(); assert!(error_string.contains("does not exist")); - assert!(error_string.contains(&upload_target.0)); + let expected_path = upload_target.with_base(&storage.storage_root); + assert!(error_string.contains(expected_path.to_str().unwrap())); } } Ok(()) @@ -834,8 +599,6 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let workdir = 
tempdir()?.path().to_owned(); - let storage = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ @@ -843,7 +606,7 @@ mod fs_tests { ("two".to_string(), "2".to_string()), ])); let upload_target = - upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; let full_range_download_contents = read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; @@ -883,23 +646,32 @@ mod fs_tests { } async fn upload_dummy_file( - workdir: &Path, storage: &LocalFs, name: &str, metadata: Option, - ) -> anyhow::Result { - let timeline_path = workdir.join("timelines").join("some_timeline"); - let relative_timeline_path = timeline_path.strip_prefix(&workdir)?; - let storage_path = storage.storage_root.join(relative_timeline_path).join(name); - let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string()); - - let from_path = storage.working_directory.join(name); + ) -> anyhow::Result { + let from_path = storage + .storage_root + .join("timelines") + .join("some_timeline") + .join(name); let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?; + let relative_path = from_path + .strip_prefix(&storage.storage_root) + .context("Failed to strip storage root prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {:?} for base {:?}", + from_path, storage.storage_root + ) + })?; + storage - .upload(Box::new(file), size, &remote_object_id, metadata) + .upload(Box::new(file), size, &relative_path, metadata) .await?; - remote_object_id_from_path(&storage_path) + Ok(relative_path) } async fn create_file_for_upload( @@ -924,7 +696,7 @@ mod fs_tests { format!("contents for {name}") } - async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { + async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result> { let mut files = storage.list().await?; files.sort_by(|a, b| a.0.cmp(&b.0)); Ok(files) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c721560c29..18a2c5dedd 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,15 +4,13 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. 
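// Hedged illustration of the prefixed key layout described above; `to_s3_key` is a
// simplified stand-in for the real mapping helpers introduced later in this patch
// (relative_path_to_s3_object / s3_object_to_relative_path).
fn to_s3_key(prefix_in_bucket: Option<&str>, relative_path: &str) -> String {
    let mut key = prefix_in_bucket.unwrap_or_default().to_string();
    for segment in relative_path.split('/') {
        key.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
        key.push_str(segment);
    }
    key
}
// to_s3_key(Some("pageserver/v1"), "tenants/t1/timelines/tl1/layer")
//   == "pageserver/v1/tenants/t1/timelines/tl1/layer"
// Different prefixes therefore never collide inside the same bucket.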
-use std::env::var; -use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::time::Duration; use anyhow::Context; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, imds, - imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn, + environment::credentials::EnvironmentVariableCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, }; use aws_sdk_s3::{ config::Config, @@ -21,7 +19,6 @@ use aws_sdk_s3::{ Client, Endpoint, Region, }; use aws_smithy_http::body::SdkBody; -use aws_types::credentials::{CredentialsError, ProvideCredentials}; use hyper::Body; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; @@ -29,12 +26,9 @@ use tracing::debug; use super::StorageMetadata; use crate::{ - strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config, - REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); - pub(super) mod metrics { use metrics::{register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; @@ -100,31 +94,8 @@ pub(super) mod metrics { } } -fn download_destination( - id: &RemoteObjectId, - workdir: &Path, - prefix_to_strip: Option<&str>, -) -> PathBuf { - let path_without_prefix = match prefix_to_strip { - Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| { - panic!( - "Could not strip prefix '{}' from S3 object key '{}'", - prefix, id.0 - ) - }), - None => &id.0, - }; - - workdir.join( - path_without_prefix - .split(REMOTE_STORAGE_PREFIX_SEPARATOR) - .collect::(), - ) -} - /// AWS S3 storage. pub struct S3Bucket { - workdir: PathBuf, client: Client, bucket_name: String, prefix_in_bucket: Option, @@ -142,35 +113,28 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result { + pub fn new(aws_config: &S3Config) -> anyhow::Result { debug!( "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); + + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let env_creds = EnvironmentVariableCredentialsProvider::new(); + // uses imds v2 + let imds = ImdsCredentialsProvider::builder().build(); + + // finally add caching. 
+ // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 + LazyCachingCredentialsProvider::builder() + .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) + .build() + }; + let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) - .credentials_provider(provide_credentials_fn(|| async { - match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() { - true => { - EnvironmentVariableCredentialsProvider::new() - .provide_credentials() - .await - } - false => { - let imds_client = imds::Client::builder() - .connect_timeout(DEFAULT_IMDS_TIMEOUT) - .read_timeout(DEFAULT_IMDS_TIMEOUT) - .build() - .await - .map_err(CredentialsError::unhandled)?; - ImdsCredentialsProvider::builder() - .imds_client(imds_client) - .build() - .provide_credentials() - .await - } - } - })); + .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { let endpoint = Endpoint::immutable( @@ -196,13 +160,39 @@ impl S3Bucket { }); Ok(Self { client, - workdir, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), }) } + fn s3_object_to_relative_path(&self, key: &str) -> RemotePath { + let relative_path = + match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) { + Some(stripped) => stripped, + // we rely on AWS to return properly prefixed paths + // for requests with a certain prefix + None => panic!( + "Key {} does not start with bucket prefix {:?}", + key, self.prefix_in_bucket + ), + }; + RemotePath( + relative_path + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect(), + ) + } + + fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { + let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default(); + for segment in path.0.iter() { + full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + full_path.push_str(segment.to_str().unwrap_or_default()); + } + full_path + } + async fn download_object(&self, request: GetObjectRequest) -> Result { let _guard = self .concurrency_limiter @@ -252,25 +242,7 @@ impl S3Bucket { #[async_trait::async_trait] impl RemoteStorage for S3Bucket { - fn remote_object_id(&self, local_path: &Path) -> anyhow::Result { - let relative_path = strip_path_prefix(&self.workdir, local_path)?; - let mut key = self.prefix_in_bucket.clone().unwrap_or_default(); - for segment in relative_path { - key.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - key.push_str(&segment.to_string_lossy()); - } - Ok(RemoteObjectId(key)) - } - - fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result { - Ok(download_destination( - storage_path, - &self.workdir, - self.prefix_in_bucket.as_deref(), - )) - } - - async fn list(&self) -> anyhow::Result> { + async fn list(&self) -> anyhow::Result> { let mut document_keys = Vec::new(); let mut continuation_token = None; @@ -300,7 +272,7 @@ impl RemoteStorage for S3Bucket { .contents .unwrap_or_default() .into_iter() - .filter_map(|o| Some(RemoteObjectId(o.key?))), + .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), ); match fetch_response.continuation_token { @@ -316,11 +288,11 @@ impl RemoteStorage for S3Bucket { /// Note: it wont include empty "directories" async fn list_prefixes( &self, - prefix: Option<&RemoteObjectId>, - ) -> anyhow::Result> { + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { // get the passed prefix or if it is not set use prefix_in_bucket 
value let list_prefix = prefix - .map(|p| p.0.clone()) + .map(|p| self.relative_path_to_s3_object(p)) .or_else(|| self.prefix_in_bucket.clone()) .map(|mut p| { // required to end with a separator @@ -339,7 +311,8 @@ impl RemoteStorage for S3Bucket { .concurrency_limiter .acquire() .await - .context("Concurrency limiter semaphore got closed during S3 list")?; + .context("Concurrency limiter semaphore got closed during S3 list") + .map_err(DownloadError::Other)?; metrics::inc_list_objects(); @@ -355,14 +328,16 @@ impl RemoteStorage for S3Bucket { .map_err(|e| { metrics::inc_list_objects_fail(); e - })?; + }) + .context("Failed to list S3 prefixes") + .map_err(DownloadError::Other)?; document_keys.extend( fetch_response .common_prefixes .unwrap_or_default() .into_iter() - .filter_map(|o| Some(RemoteObjectId(o.prefix?))), + .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); match fetch_response.continuation_token { @@ -378,7 +353,7 @@ impl RemoteStorage for S3Bucket { &self, from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, from_size_bytes: usize, - to: &RemoteObjectId, + to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { let _guard = self @@ -395,7 +370,7 @@ impl RemoteStorage for S3Bucket { self.client .put_object() .bucket(self.bucket_name.clone()) - .key(to.0.to_owned()) + .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) @@ -408,10 +383,10 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemoteObjectId) -> Result { + async fn download(&self, from: &RemotePath) -> Result { self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.0.to_owned(), + key: self.relative_path_to_s3_object(from), ..GetObjectRequest::default() }) .await @@ -419,7 +394,7 @@ impl RemoteStorage for S3Bucket { async fn download_byte_range( &self, - from: &RemoteObjectId, + from: &RemotePath, start_inclusive: u64, end_exclusive: Option, ) -> Result { @@ -427,19 +402,19 @@ impl RemoteStorage for S3Bucket { // and needs both ends to be exclusive let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); let range = Some(match end_inclusive { - Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive), - None => format!("bytes={}-", start_inclusive), + Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"), + None => format!("bytes={start_inclusive}-"), }); self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), - key: from.0.to_owned(), + key: self.relative_path_to_s3_object(from), range, }) .await } - async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { let _guard = self .concurrency_limiter .acquire() @@ -451,7 +426,7 @@ impl RemoteStorage for S3Bucket { self.client .delete_object() .bucket(self.bucket_name.clone()) - .key(remote_object_id.0.to_owned()) + .key(self.relative_path_to_s3_object(path)) .send() .await .map_err(|e| { @@ -461,181 +436,3 @@ impl RemoteStorage for S3Bucket { Ok(()) } } - -#[cfg(test)] -mod tests { - use tempfile::tempdir; - - use super::*; - - #[test] - fn test_download_destination() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let local_path = workdir.join("one").join("two").join("test_name"); - let relative_path = local_path.strip_prefix(&workdir)?; - - let key = RemoteObjectId(format!( - "{}{}", - 
REMOTE_STORAGE_PREFIX_SEPARATOR, - relative_path - .iter() - .map(|segment| segment.to_str().unwrap()) - .collect::>() - .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()), - )); - - assert_eq!( - local_path, - download_destination(&key, &workdir, None), - "Download destination should consist of s3 path joined with the workdir prefix" - ); - - Ok(()) - } - - #[test] - fn storage_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - - let segment_1 = "matching"; - let segment_2 = "file"; - let local_path = &workdir.join(segment_1).join(segment_2); - - let storage = dummy_storage(workdir); - - let expected_key = RemoteObjectId(format!( - "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}", - storage.prefix_in_bucket.as_deref().unwrap_or_default(), - )); - - let actual_key = storage - .remote_object_id(local_path) - .expect("Matching path should map to S3 path normally"); - assert_eq!( - expected_key, - actual_key, - "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator" - ); - - Ok(()) - } - - #[test] - fn storage_path_negatives() -> anyhow::Result<()> { - #[track_caller] - fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String { - match storage.remote_object_id(mismatching_path) { - Ok(wrong_key) => panic!( - "Expected path '{}' to error, but got S3 key: {:?}", - mismatching_path.display(), - wrong_key, - ), - Err(e) => e.to_string(), - } - } - - let workdir = tempdir()?.path().to_owned(); - let storage = dummy_storage(workdir.clone()); - - let error_message = storage_path_error(&storage, &workdir); - assert!( - error_message.contains("Prefix and the path are equal"), - "Message '{}' does not contain the required string", - error_message - ); - - let mismatching_path = PathBuf::from("somewhere").join("else"); - let error_message = storage_path_error(&storage, &mismatching_path); - assert!( - error_message.contains(mismatching_path.to_str().unwrap()), - "Error should mention wrong path" - ); - assert!( - error_message.contains(workdir.to_str().unwrap()), - "Error should mention server workdir" - ); - assert!( - error_message.contains("is not prefixed with"), - "Message '{}' does not contain a required string", - error_message - ); - - Ok(()) - } - - #[test] - fn local_path_positive() -> anyhow::Result<()> { - let workdir = tempdir()?.path().to_owned(); - let storage = dummy_storage(workdir.clone()); - let timeline_dir = workdir.join("timelines").join("test_timeline"); - let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?; - - let s3_key = create_s3_key( - &relative_timeline_path.join("not a metadata"), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote delta file" - ); - - let s3_key = create_s3_key( - &relative_timeline_path.join("metadata"), - storage.prefix_in_bucket.as_deref(), - ); - assert_eq!( - download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()), - storage - .local_path(&s3_key) - .expect("For a valid input, valid S3 info should be parsed"), - "Should be able to parse metadata out of the correctly named remote metadata file" - ); - - Ok(()) - } - - #[test] - fn download_destination_matches_original_path() -> anyhow::Result<()> { - let 
workdir = tempdir()?.path().to_owned(); - let original_path = workdir - .join("timelines") - .join("some_timeline") - .join("some name"); - - let dummy_storage = dummy_storage(workdir); - - let key = dummy_storage.remote_object_id(&original_path)?; - let download_destination = dummy_storage.local_path(&key)?; - - assert_eq!( - original_path, download_destination, - "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path" - ); - - Ok(()) - } - - fn dummy_storage(workdir: PathBuf) -> S3Bucket { - S3Bucket { - workdir, - client: Client::new(&aws_config::SdkConfig::builder().build()), - bucket_name: "dummy-bucket".to_string(), - prefix_in_bucket: Some("dummy_prefix/".to_string()), - concurrency_limiter: Semaphore::new(1), - } - } - - fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId { - RemoteObjectId(relative_file_path.iter().fold( - prefix.unwrap_or_default().to_string(), - |mut path_string, segment| { - path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - path_string.push_str(segment.to_str().unwrap()); - path_string - }, - )) - } -} diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs new file mode 100644 index 0000000000..643bb99dce --- /dev/null +++ b/libs/remote_storage/src/simulate_failures.rs @@ -0,0 +1,129 @@ +//! This module provides a wrapper around a real RemoteStorage implementation that +//! causes the first N attempts at each upload or download operatio to fail. For +//! testing purposes. +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::sync::Mutex; + +use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata}; + +pub struct UnreliableWrapper { + inner: crate::GenericRemoteStorage, + + // This many attempts of each operation will fail, then we let it succeed. + attempts_to_fail: u64, + + // Tracks how many failed attempts of each operation has been made. + attempts: Mutex>, +} + +/// Used to identify retries of different unique operation. +#[derive(Debug, Hash, Eq, PartialEq)] +enum RemoteOp { + List, + ListPrefixes(Option), + Upload(RemotePath), + Download(RemotePath), + Delete(RemotePath), +} + +impl UnreliableWrapper { + pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { + assert!(attempts_to_fail > 0); + UnreliableWrapper { + inner, + attempts_to_fail, + attempts: Mutex::new(HashMap::new()), + } + } + + /// + /// Common functionality for all operations. + /// + /// On the first attempts of this operation, return an error. After 'attempts_to_fail' + /// attempts, let the operation go ahead, and clear the counter. + /// + fn attempt(&self, op: RemoteOp) -> Result { + let mut attempts = self.attempts.lock().unwrap(); + + match attempts.entry(op) { + Entry::Occupied(mut e) => { + let attempts_before_this = { + let p = e.get_mut(); + *p += 1; + *p + }; + + if attempts_before_this >= self.attempts_to_fail { + // let it succeed + e.remove(); + Ok(attempts_before_this) + } else { + let error = + anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + Err(DownloadError::Other(error)) + } + } + Entry::Vacant(e) => { + let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); + e.insert(1); + Err(DownloadError::Other(error)) + } + } + } +} + +#[async_trait::async_trait] +impl RemoteStorage for UnreliableWrapper { + /// Lists all items the storage has right now. 
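// A minimal usage sketch (illustrative, not from the patch) for the wrapper defined above:
// tests wrap an already-built GenericRemoteStorage so that the first attempt(s) of each
// distinct operation fail before the real storage is reached. The function name and the
// choice of `1` are placeholders.
fn wrap_for_failure_injection(base_storage: crate::GenericRemoteStorage) -> UnreliableWrapper {
    // Fail the first attempt of every distinct upload/download/delete, then let it through.
    UnreliableWrapper::new(base_storage, 1)
}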
+ async fn list(&self) -> anyhow::Result> { + self.attempt(RemoteOp::List)?; + self.inner.list().await + } + + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { + self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; + self.inner.list_prefixes(prefix).await + } + + async fn upload( + &self, + data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + // S3 PUT request requires the content length to be specified, + // otherwise it starts to fail with the concurrent connection count increasing. + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.upload(data, data_size_bytes, to, metadata).await + } + + async fn download(&self, from: &RemotePath) -> Result { + self.attempt(RemoteOp::Download(from.clone()))?; + self.inner.download(from).await + } + + async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + // Note: We treat any download_byte_range as an "attempt" of the same + // operation. We don't pay attention to the ranges. That's good enough + // for now. + self.attempt(RemoteOp::Download(from.clone()))?; + self.inner + .download_byte_range(from, start_inclusive, end_exclusive) + .await + } + + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + self.attempt(RemoteOp::Delete(path.clone()))?; + self.inner.delete(path).await + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 85c6439367..0c1310eef9 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -22,3 +22,40 @@ pub struct TimelineCreateRequest { // If not passed, it is assigned to the beginning of commit_lsn segment. pub local_start_lsn: Option, } + +fn lsn_invalid() -> Lsn { + Lsn::INVALID +} + +/// Data about safekeeper's timeline, mirrors broker.proto. +#[serde_as] +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SkTimelineInfo { + /// Term of the last entry. + pub last_log_term: Option, + /// LSN of the last record. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub commit_lsn: Lsn, + /// LSN up to which safekeeper has backed WAL. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub backup_lsn: Lsn, + /// LSN of last checkpoint uploaded by pageserver. + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub remote_consistent_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub peer_horizon_lsn: Lsn, + #[serde_as(as = "DisplayFromStr")] + #[serde(default = "lsn_invalid")] + pub local_start_lsn: Lsn, + /// A connection string to use for WAL receiving. 
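// Sketch (illustrative, not from the patch) of what the serde attributes on SkTimelineInfo
// buy us: LSNs travel in their "hi/lo" string form, and any LSN field missing from the
// payload falls back to Lsn::INVALID via the lsn_invalid() default. The JSON is invented.
fn decode_example() -> anyhow::Result<()> {
    let info: SkTimelineInfo = serde_json::from_str(
        r#"{"last_log_term": 1, "flush_lsn": "0/16B3748", "commit_lsn": "0/16B3748"}"#,
    )?;
    assert_eq!(info.backup_lsn, Lsn::INVALID); // absent above, so it took the default
    Ok(())
}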
+ #[serde(default)] + pub safekeeper_connstr: Option, +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 36a379b47a..47639e8205 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +sentry = "0.29.0" async-trait = "0.1" anyhow = "1.0" bincode = "1.3" diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 3726779cb2..2c7e6e20ab 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -157,34 +157,34 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::AlreadyExists); let invalid_dir_path = file_path.join("folder"); - create_dir_all(&invalid_dir_path).unwrap_err(); + create_dir_all(invalid_dir_path).unwrap_err(); } #[test] fn test_path_with_suffix_extension() { let p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp").to_string_lossy(), "/foo/bar.temp" ); let p = PathBuf::from("/foo/bar"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(), + &path_with_suffix_extension(p, "temp.temp").to_string_lossy(), "/foo/bar.baz.temp.temp" ); let p = PathBuf::from("/foo/bar.baz"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar.baz..temp" ); let p = PathBuf::from("/foo/bar/dir/"); assert_eq!( - &path_with_suffix_extension(&p, ".temp").to_string_lossy(), + &path_with_suffix_extension(p, ".temp").to_string_lossy(), "/foo/bar/dir..temp" ); } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index e1c9a373e5..6d35fd9f7b 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -34,6 +34,7 @@ pub mod sock_split; pub mod logging; pub mod lock_file; +pub mod pid_file; // Misc pub mod accum; @@ -46,6 +47,7 @@ pub mod tcp_listener; pub mod nonblock; // Default signal handling +pub mod sentry_init; pub mod signals; pub mod fs_ext; diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 4fef65852b..adbf47eb7a 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,81 +1,133 @@ -//! A module to create and read lock files. A lock file ensures that only one -//! process is running at a time, in a particular directory. +//! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`], which means that holding the -//! lock on file only prevents acquiring another lock on it; all other -//! operations are still possible on files. Other process can still open, read, -//! write, or remove the file, for example. -//! If the file is removed while a process is holding a lock on it, -//! the process that holds the lock does not get any error or notification. -//! Furthermore, you can create a new file with the same name and lock the new file, -//! while the old process is still running. -//! Deleting the lock file while the locking process is still running is a bad idea! +//! File locking is done using [`fcntl::flock`] exclusive locks. +//! The only consumer of this module is currently [`pid_file`]. +//! See the module-level comment there for potential pitfalls +//! with lock files that are used to store PIDs (pidfiles). 
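// A small sketch (illustrative, not from the patch) of the flock(2) behaviour this module
// relies on: a second non-blocking exclusive lock on the same file fails with EAGAIN
// instead of blocking, which is how "already locked" is detected below. Path is invented.
fn flock_demo() -> anyhow::Result<()> {
    use nix::fcntl::{flock, FlockArg};
    use std::os::unix::prelude::AsRawFd;

    let holder = std::fs::File::create("/tmp/flock-demo.lock")?;
    flock(holder.as_raw_fd(), FlockArg::LockExclusiveNonblock)?; // first lock succeeds

    let contender = std::fs::File::open("/tmp/flock-demo.lock")?;
    let second = flock(contender.as_raw_fd(), FlockArg::LockExclusiveNonblock);
    assert_eq!(second, Err(nix::errno::Errno::EAGAIN)); // already held via `holder`
    Ok(())
}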
-use std::{fs, os::unix::prelude::AsRawFd, path::Path}; +use std::{ + fs, + io::{Read, Write}, + ops::Deref, + os::unix::prelude::AsRawFd, + path::{Path, PathBuf}, +}; use anyhow::Context; -use nix::fcntl; +use nix::{errno::Errno::EAGAIN, fcntl}; use crate::crashsafe; -pub enum LockCreationResult { - Created { - new_lock_contents: String, - file: fs::File, - }, - AlreadyLocked { - existing_lock_contents: String, - }, - CreationFailed(anyhow::Error), +/// A handle to an open and unlocked, but not-yet-written lock file. +/// Returned by [`create_exclusive`]. +#[must_use] +pub struct UnwrittenLockFile { + path: PathBuf, + file: fs::File, } -/// Creates a lock file in the path given and writes the given contents into the file. -/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program. -pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult { - let lock_file = match fs::OpenOptions::new() +/// Returned by [`UnwrittenLockFile::write_content`]. +#[must_use] +pub struct LockFileGuard(fs::File); + +impl Deref for LockFileGuard { + type Target = fs::File; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl UnwrittenLockFile { + /// Replace the content of this lock file with the byte representation of `contents`. + pub fn write_content(mut self, contents: String) -> anyhow::Result { + self.file + .set_len(0) + .context("Failed to truncate lockfile")?; + self.file + .write_all(contents.as_bytes()) + .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?; + crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?; + Ok(LockFileGuard(self.file)) + } +} + +/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns +/// a handle that allows overwriting the locked file's content. +/// +/// The exclusive lock is released when dropping the returned handle. +/// +/// It is not an error if the file already exists. +/// It is an error if the file is already locked. 
+pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result { + let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT .write(true) .open(lock_file_path) - .context("Failed to open lock file") - { - Ok(file) => file, - Err(e) => return LockCreationResult::CreationFailed(e), - }; + .context("open lock file")?; - match fcntl::flock( + let res = fcntl::flock( lock_file.as_raw_fd(), fcntl::FlockArg::LockExclusiveNonblock, - ) { - Ok(()) => { - match lock_file - .set_len(0) - .context("Failed to truncate lockfile") - .and_then(|()| { - fs::write(lock_file_path, &contents).with_context(|| { - format!("Failed to write '{contents}' contents into lockfile") - }) - }) - .and_then(|()| { - crashsafe::fsync_file_and_parent(lock_file_path) - .context("Failed to fsync lockfile") - }) { - Ok(()) => LockCreationResult::Created { - new_lock_contents: contents, - file: lock_file, - }, - Err(e) => LockCreationResult::CreationFailed(e), - } - } - Err(nix::errno::Errno::EAGAIN) => { - match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") { - Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked { - existing_lock_contents, - }, - Err(e) => LockCreationResult::CreationFailed(e), - } - } - Err(e) => { - LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}")) - } + ); + match res { + Ok(()) => Ok(UnwrittenLockFile { + path: lock_file_path.to_owned(), + file: lock_file, + }), + Err(EAGAIN) => anyhow::bail!("file is already locked"), + Err(e) => Err(e).context("flock error"), + } +} + +/// Returned by [`read_and_hold_lock_file`]. +/// Check out the [`pid_file`] module for what the variants mean +/// and potential caveats if the lock files that are used to store PIDs. +pub enum LockFileRead { + /// No file exists at the given path. + NotExist, + /// No other process held the lock file, so we grabbed an flock + /// on it and read its contents. + /// Release the flock by dropping the [`LockFileGuard`]. + NotHeldByAnyProcess(LockFileGuard, String), + /// The file exists but another process was holding an flock on it. + LockedByOtherProcess { + not_locked_file: fs::File, + content: String, + }, +} + +/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to +/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked. +/// Check the [`LockFileRead`] variants for details. +pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result { + let res = fs::OpenOptions::new().read(true).open(path); + let mut lock_file = match res { + Ok(f) => f, + Err(e) => match e.kind() { + std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), + _ => return Err(e).context("open lock file"), + }, + }; + let res = fcntl::flock( + lock_file.as_raw_fd(), + fcntl::FlockArg::LockExclusiveNonblock, + ); + // We need the content regardless of lock success / failure. + // But, read it after flock so that, if it succeeded, the content is consistent. 
+    let mut content = String::new();
+    lock_file
+        .read_to_string(&mut content)
+        .context("read lock file")?;
+    match res {
+        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
+            LockFileGuard(lock_file),
+            content,
+        )),
+        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
+            not_locked_file: lock_file,
+            content,
+        }),
+        Err(e) => Err(e).context("flock error"),
     }
 }
diff --git a/libs/utils/src/pid_file.rs b/libs/utils/src/pid_file.rs
new file mode 100644
index 0000000000..e634b08f2a
--- /dev/null
+++ b/libs/utils/src/pid_file.rs
@@ -0,0 +1,165 @@
+//! Abstraction to create & read pidfiles.
+//!
+//! A pidfile is a file in the filesystem that stores a process's PID.
+//! Its purpose is to implement a singleton behavior where only
+//! one process of some "kind" is supposed to be running at a given time.
+//! The "kind" is identified by the pidfile.
+//!
+//! During process startup, the process that is supposed to be a singleton
+//! must [claim][`claim_for_current_process`] the pidfile first.
+//! If that is unsuccessful, the process must not act as the singleton, i.e.,
+//! it must not access any of the resources that only the singleton may access.
+//!
+//! A common need is to signal a running singleton process, e.g., to make
+//! it shut down and exit.
+//! For that, we have to [`read`] the pidfile. The result of the `read` operation
+//! tells us if there is any singleton process, and if so, what PID it has.
+//! We can then proceed to signal it, although some caveats still apply.
+//! Read the function-level documentation of [`read`] for that.
+//!
+//! ## Never Remove Pidfiles
+//!
+//! It would be natural to assume that the process that claimed the pidfile
+//! should remove it upon exit to avoid leaving a stale pidfile in place.
+//! However, we already have a reliable way to detect staleness of the pidfile,
+//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
+//!
+//! And further, removing pidfiles would introduce a **catastrophic race condition**
+//! where two processes are running that are supposed to be singletons.
+//! Suppose we were to remove our pidfile during process shutdown.
+//! Here is how the race plays out:
+//! - Suppose we have a service called `myservice` with pidfile `myservice.pidfile`.
+//! - Process `A` starts to shut down.
+//! - Process `B` is just starting up
+//!   - It `open("myservice.pid", O_WRONLY|O_CREAT)`s the file
+//!   - It blocks on `flock`
+//! - Process `A` removes the pidfile as the last step of its shutdown procedure
+//!   - `unlink("myservice.pid")`
+//! - Process `A` exits
+//!   - This releases its `flock` and unblocks `B`
+//! - Process `B` still has the file descriptor for `myservice.pid` open
+//! - Process `B` writes its PID into `myservice.pid`.
+//! - But the `myservice.pid` file has been unlinked, so there is no `myservice.pid`
+//!   in the directory.
+//! - Process `C` starts
+//!   - It `open("myservice.pid", O_WRONLY|O_CREAT)`s, which creates a new file (new inode)
+//!   - It `flock`s the file, which, since it's a different file, does not block
+//!   - It writes its PID into the file
+//!
+//! At this point, `B` and `C` are running, which is hazardous.
+//! Moral of the story: don't unlink pidfiles, ever.
+
+use std::{ops::Deref, path::Path};
+
+use anyhow::Context;
+use nix::unistd::Pid;
+
+use crate::lock_file::{self, LockFileRead};
+
+/// Keeps a claim on a pidfile alive until it is dropped.
+/// Returned by [`claim_for_current_process`].
+#[must_use]
+pub struct PidFileGuard(lock_file::LockFileGuard);
+
+impl Deref for PidFileGuard {
+    type Target = lock_file::LockFileGuard;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+/// Try to claim `path` as a pidfile for the current process.
+///
+/// If another process has already claimed the pidfile, and it is still running,
+/// this function returns an error.
+/// Otherwise, the function `flock`s the file and updates its contents to the
+/// current process's PID.
+/// If the update fails, the flock is released and an error returned.
+/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
+///
+/// ### Maintaining A Claim
+///
+/// It is the caller's responsibility to maintain the claim.
+/// The claim ends as soon as the returned guard object is dropped.
+/// To maintain the claim for the remaining lifetime of the current process,
+/// use [`std::mem::forget`] or similar.
+pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
+    let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
+    // if any of the next steps fail, we drop the file descriptor and thereby release the lock
+    let guard = unwritten_lock_file
+        .write_content(Pid::this().to_string())
+        .context("write pid to lock file")?;
+    Ok(PidFileGuard(guard))
+}
+
+/// Returned by [`read`].
+pub enum PidFileRead {
+    /// No file exists at the given path.
+    NotExist,
+    /// The given pidfile is currently not claimed by any process.
+    /// To determine this, the [`read`] operation acquired
+    /// an exclusive flock on the file. The lock is still held and responsibility
+    /// to release it is returned through the guard object.
+    /// Before releasing it, other [`claim_for_current_process`] or [`read`] calls
+    /// will fail.
+    ///
+    /// ### Caveats
+    ///
+    /// Do not unlink the pidfile from the filesystem. See the module comment for why.
+    NotHeldByAnyProcess(PidFileGuard),
+    /// The given pidfile is still claimed by another process whose PID is given
+    /// as part of this variant.
+    ///
+    /// ### Caveats
+    ///
+    /// 1. The other process might exit at any time, turning the given PID stale.
+    /// 2. There is a small window in which `claim_for_current_process` has already
+    ///    locked the file but not yet updated its contents. [`read`] will return
+    ///    this variant here, but with the old file contents, i.e., a stale PID.
+    ///
+    /// The kernel is free to recycle a PID once it has been `wait(2)`ed upon by
+    /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
+    /// system call on it, bears the risk of killing an unrelated process.
+    /// This is an inherent limitation of using pidfiles.
+    /// The only race-free solution is to have a supervisor-process with a lifetime
+    /// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
+    LockedByOtherProcess(Pid),
+}
+
+/// Try to read the file at the given path as a pidfile that was previously created
+/// through [`claim_for_current_process`].
+///
+/// On success, this function returns a [`PidFileRead`].
+/// Check its docs for a description of the meaning of its different variants.
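// A minimal end-to-end sketch (illustrative, not from the patch) of this API: a daemon
// claims the pidfile at startup and leaks the guard, while an inspection tool (normally
// a different process) reads it later. Path and function name are invented.
fn claim_then_inspect() -> anyhow::Result<()> {
    let pidfile = std::path::Path::new("/tmp/myservice.pid");

    // Daemon side: keep the claim for the remaining lifetime of the process.
    let guard = claim_for_current_process(pidfile)?;
    std::mem::forget(guard);

    // Inspection side: find out whether (and under which PID) the daemon is running.
    match read(pidfile)? {
        PidFileRead::NotExist => println!("not running: no pidfile"),
        PidFileRead::NotHeldByAnyProcess(_guard) => println!("stale pidfile, not running"),
        PidFileRead::LockedByOtherProcess(pid) => println!("running with PID {pid}"),
    }
    Ok(())
}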
+pub fn read(pidfile: &Path) -> anyhow::Result { + let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?; + let ret = match res { + LockFileRead::NotExist => PidFileRead::NotExist, + LockFileRead::NotHeldByAnyProcess(guard, _) => { + PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard)) + } + LockFileRead::LockedByOtherProcess { + not_locked_file: _not_locked_file, + content, + } => { + // XXX the read races with the write in claim_pid_file_for_pid(). + // But pids are smaller than a page, so the kernel page cache will lock for us. + // The only problem is that we might get the old contents here. + // Can only fix that by implementing some scheme that downgrades the + // exclusive lock to shared lock in claim_pid_file_for_pid(). + PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?) + } + }; + Ok(ret) +} + +fn parse_pidfile_content(content: &str) -> anyhow::Result { + let pid: i32 = content + .parse() + .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?; + if pid < 1 { + anyhow::bail!("bad value in pidfile '{pid}'"); + } + Ok(Pid::from_raw(pid)) +} diff --git a/libs/utils/src/postgres_backend.rs b/libs/utils/src/postgres_backend.rs index 89f7197718..5b34c7adfb 100644 --- a/libs/utils/src/postgres_backend.rs +++ b/libs/utils/src/postgres_backend.rs @@ -6,7 +6,7 @@ use crate::sock_split::{BidiStream, ReadStream, WriteStream}; use anyhow::{bail, ensure, Context, Result}; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rand::Rng; use serde::{Deserialize, Serialize}; use std::fmt; @@ -361,11 +361,9 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? + .write_message_noflush(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -413,7 +411,7 @@ impl PostgresBackend { } } self.write_message_noflush(&BeMessage::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&BeMessage::CLIENT_ENCODING)? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } diff --git a/libs/utils/src/postgres_backend_async.rs b/libs/utils/src/postgres_backend_async.rs index 376819027b..a22774c69e 100644 --- a/libs/utils/src/postgres_backend_async.rs +++ b/libs/utils/src/postgres_backend_async.rs @@ -6,7 +6,7 @@ use crate::postgres_backend::AuthType; use anyhow::{bail, Context, Result}; use bytes::{Bytes, BytesMut}; -use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket}; use rand::Rng; use std::future::Future; use std::net::SocketAddr; @@ -331,11 +331,9 @@ impl PostgresBackend { match self.auth_type { AuthType::Trust => { self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? // The async python driver requires a valid server_version - .write_message(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion("14.1"), - ))? 
+ .write_message(&BeMessage::server_version("14.1"))? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } @@ -384,7 +382,7 @@ impl PostgresBackend { } } self.write_message(&BeMessage::AuthenticationOk)? - .write_message(&BeParameterStatusMessage::encoding())? + .write_message(&BeMessage::CLIENT_ENCODING)? .write_message(&BeMessage::ReadyForQuery)?; self.state = ProtoState::Established; } diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs new file mode 100644 index 0000000000..992cb5c671 --- /dev/null +++ b/libs/utils/src/sentry_init.rs @@ -0,0 +1,29 @@ +use sentry::ClientInitGuard; +use std::borrow::Cow; +use std::env; + +pub use sentry::release_name; + +#[must_use] +pub fn init_sentry( + release_name: Option>, + extra_options: &[(&str, &str)], +) -> Option { + let dsn = env::var("SENTRY_DSN").ok()?; + let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into()); + + let guard = sentry::init(( + dsn, + sentry::ClientOptions { + release: release_name, + environment: Some(environment.into()), + ..Default::default() + }, + )); + sentry::configure_scope(|scope| { + for &(key, value) in extra_options { + scope.set_extra(key, value.into()); + } + }); + Some(guard) +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index bf330a482c..e3f0b505da 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -11,11 +11,13 @@ use tokio::time::timeout; /// An error happened while waiting for a number #[derive(Debug, PartialEq, Eq, thiserror::Error)] -#[error("SeqWaitError")] pub enum SeqWaitError { /// The wait timeout was reached + #[error("seqwait timeout was reached")] Timeout, + /// [`SeqWait::shutdown`] was called + #[error("SeqWait::shutdown was called")] Shutdown, } diff --git a/libs/utils/src/sock_split.rs b/libs/utils/src/sock_split.rs index 5e4598daf1..b0e5a0bf6a 100644 --- a/libs/utils/src/sock_split.rs +++ b/libs/utils/src/sock_split.rs @@ -50,7 +50,7 @@ impl BufStream { /// Returns a reference to the underlying TcpStream. 
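// Sketch of a call site for init_sentry as defined above: reporting only activates when
// SENTRY_DSN is set, SENTRY_ENVIRONMENT defaults to "development", and the extra
// "node_id" scope value mirrors how the pageserver binary uses it (the value here is
// illustrative, not from the patch).
fn main() {
    let _sentry_guard = utils::sentry_init::init_sentry(
        utils::sentry_init::release_name!(),
        &[("node_id", "123")],
    );
    // ... run the service; keep the guard alive so pending events are flushed on drop ...
}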
fn get_ref(&self) -> &TcpStream { - &*self.0.get_ref().0 + &self.0.get_ref().0 } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 6ada2c5cb1..12fe0705cf 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -18,7 +18,7 @@ async-stream = "0.3" async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -chrono = "0.4.19" +chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] } clap = { version = "4.0", features = ["string"] } close_fds = "0.3.2" const_format = "0.2.21" @@ -36,40 +36,41 @@ nix = "0.25" num-traits = "0.2.15" once_cell = "1.13.0" pin-project-lite = "0.2.7" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true } rand = "0.8.3" regex = "1.4.5" rstar = "0.9.3" scopeguard = "1.1.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1" +serde_json = { version = "1.0", features = ["raw_value"] } serde_with = "2.0" signal-hook = "0.3.10" svg_fmt = "0.4.1" tar = "0.4.33" thiserror = "1.0" tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-util = { version = "0.7.3", features = ["io", "io-util"] } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.36" url = "2" walkdir = "2.3.2" -etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } pageserver_api = { path = "../libs/pageserver_api" } postgres_connection = { path = "../libs/postgres_connection" } postgres_ffi = { path = "../libs/postgres_ffi" } pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } +storage_broker = { version = "0.1", path = "../storage_broker" } tenant_size_model = { path = "../libs/tenant_size_model" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } rpds = "0.12.0" +reqwest = "0.11.13" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6f68ebd662..7f659be46f 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -4,9 +4,8 @@ use pageserver::repository::{Key, Value}; use pageserver::tenant::bst_layer_map::RetroactiveLayerMap; use pageserver::tenant::filename::{DeltaFileName, ImageFileName}; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::Layer; -use pageserver::tenant::storage_layer::ValueReconstructResult; 
-use pageserver::tenant::storage_layer::ValueReconstructState; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; +use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; @@ -16,7 +15,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use utils::id::{TenantId, TimelineId}; + use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; @@ -27,14 +26,6 @@ struct DummyDelta { } impl Layer for DummyDelta { - fn get_tenant_id(&self) -> TenantId { - TenantId::from_str("00000000000000000000000000000000").unwrap() - } - - fn get_timeline_id(&self) -> TimelineId { - TimelineId::from_str("00000000000000000000000000000000").unwrap() - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -42,15 +33,6 @@ impl Layer for DummyDelta { fn get_lsn_range(&self) -> Range { self.lsn_range.clone() } - - fn filename(&self) -> PathBuf { - todo!() - } - - fn local_path(&self) -> Option { - todo!() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -64,24 +46,12 @@ impl Layer for DummyDelta { true } - fn is_in_memory(&self) -> bool { - false - } - - fn iter(&self) -> Box> + '_> { - panic!() - } - - fn key_iter(&self) -> Box + '_> { - panic!("Not implemented") - } - - fn delete(&self) -> Result<()> { - panic!() - } - fn dump(&self, _verbose: bool) -> Result<()> { - todo!() + unimplemented!() + } + + fn short_id(&self) -> String { + unimplemented!() } } @@ -91,14 +61,6 @@ struct DummyImage { } impl Layer for DummyImage { - fn get_tenant_id(&self) -> TenantId { - TenantId::from_str("00000000000000000000000000000000").unwrap() - } - - fn get_timeline_id(&self) -> TimelineId { - TimelineId::from_str("00000000000000000000000000000000").unwrap() - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -108,14 +70,6 @@ impl Layer for DummyImage { self.lsn..(self.lsn + 1) } - fn filename(&self) -> PathBuf { - todo!() - } - - fn local_path(&self) -> Option { - todo!() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -129,29 +83,17 @@ impl Layer for DummyImage { false } - fn is_in_memory(&self) -> bool { - false - } - - fn iter(&self) -> Box> + '_> { - panic!() - } - - fn key_iter(&self) -> Box + '_> { - panic!("Not implemented") - } - - fn delete(&self) -> Result<()> { - panic!() - } - fn dump(&self, _verbose: bool) -> Result<()> { - todo!() + unimplemented!() + } + + fn short_id(&self) -> String { + unimplemented!() } } -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); @@ -188,7 +130,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. 
This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -224,7 +166,7 @@ fn bench_from_captest_env(c: &mut Criterion) { c.bench_function("captest_uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -277,7 +219,7 @@ fn bench_from_real_project(c: &mut Criterion) { group.bench_function("current_code", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); @@ -342,7 +284,7 @@ fn bench_sequential(c: &mut Criterion) { group.bench_function("current_code", |b| { b.iter(|| { for q in queries.clone().into_iter() { - layer_map.search(q.0, q.1).unwrap(); + layer_map.search(q.0, q.1); } }); }); diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 85caa565fe..61011c9f36 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -84,7 +84,7 @@ fn add_multithreaded_walredo_requesters( barrier.wait(); - execute_all(input, &*manager).unwrap(); + execute_all(input, &manager).unwrap(); barrier.wait(); } @@ -431,7 +431,7 @@ fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { struct Request { key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 973c3cd3a6..36664e119e 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,7 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::tenant::Timeline; +use crate::task_mgr; +use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; @@ -130,7 +131,7 @@ where // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { - let header = new_tar_header_dir(*dir)?; + let header = new_tar_header_dir(dir)?; self.ar.append(&header, &mut io::empty())?; } @@ -152,23 +153,29 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + for segno in + with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))? + { self.add_slru_segment(kind, segno)?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + for ((spcnode, dbnode), has_relmap_file) in + with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))? + { self.add_dbdir(spcnode, dbnode, has_relmap_file)?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? { + for rel in with_ondemand_download_sync(|| { + self.timeline.list_rels(spcnode, dbnode, self.lsn) + })? { self.add_rel(rel)?; } } } - for xid in self.timeline.list_twophase_files(self.lsn)? { + for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? 
{ self.add_twophase_file(xid)?; } @@ -185,7 +192,8 @@ where } fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?; + let nblocks = + with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?; // Function that adds relation segment data to archive let mut add_file = |segment_index, data: &Vec| -> anyhow::Result<()> { @@ -208,7 +216,8 @@ where for blknum in blocks { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?; + .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .no_ondemand_download()?; segment_data.extend_from_slice(&img[..]); } @@ -222,13 +231,16 @@ where // Generate SLRU segment files from repository. // fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; + let nblocks = with_ondemand_download_sync(|| { + self.timeline.get_slru_segment_size(slru, segno, self.lsn) + })?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + })?; if slru == SlruKind::Clog { ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); @@ -260,7 +272,9 @@ where has_relmap_file: bool, ) -> anyhow::Result<()> { let relmap_img = if has_relmap_file { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + let img = with_ondemand_download_sync(|| { + self.timeline.get_relmap_file(spcnode, dbnode, self.lsn) + })?; ensure!(img.len() == 512); Some(img) } else { @@ -295,7 +309,8 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn)? + .list_rels(spcnode, dbnode, self.lsn) + .no_ondemand_download()? .is_empty() { return Ok(()); @@ -327,7 +342,7 @@ where // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn)?; + let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -361,14 +376,12 @@ where zenith_signal.as_bytes(), )?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn) - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn) - .context("failed get control bytes")?; + let checkpoint_bytes = + with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn)) + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = + with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn)) + .context("failed get control bytes")?; let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( &pg_control_bytes, @@ -490,3 +503,11 @@ where } } } + +fn with_ondemand_download_sync(f: F) -> anyhow::Result +where + F: Send + Fn() -> PageReconstructResult, + T: Send, +{ + task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f)) +} diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs index ea1ff7f3c7..da13ee452c 100644 --- a/pageserver/src/bin/draw_timeline_dir.rs +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -11,8 +11,8 @@ //! //! Example use: //! ``` -//! 
$ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE -//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg //! $ firefox out.svg //! ``` //! @@ -25,6 +25,8 @@ use anyhow::Result; use pageserver::repository::Key; use std::cmp::Ordering; use std::io::{self, BufRead}; +use std::path::PathBuf; +use std::str::FromStr; use std::{ collections::{BTreeMap, BTreeSet}, ops::Range, @@ -65,7 +67,11 @@ fn main() -> Result<()> { let mut ranges: Vec<(Range, Range)> = vec![]; let stdin = io::stdin(); for line in stdin.lock().lines() { - let range = parse_filename(&line.unwrap()); + let line = line.unwrap(); + let line = PathBuf::from_str(&line).unwrap(); + let filename = line.file_name().unwrap(); + let filename = filename.to_str().unwrap(); + let range = parse_filename(filename); ranges.push(range); } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 32d3fca47c..b3d9b0f809 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -7,7 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr}; use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; -use nix::unistd::Pid; +use remote_storage::GenericRemoteStorage; use tracing::*; use metrics::set_build_info_metric; @@ -18,14 +18,15 @@ use pageserver::{ task_mgr::{ BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, }, - tenant_mgr, virtual_file, + tenant::mgr, + virtual_file, }; -use remote_storage::GenericRemoteStorage; use utils::{ auth::JwtAuth, - lock_file, logging, + logging, postgres_backend::AuthType, project_git_version, + sentry_init::{init_sentry, release_name}, signals::{self, Signal}, tcp_listener, }; @@ -85,6 +86,9 @@ fn main() -> anyhow::Result<()> { } }; + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| { @@ -124,7 +128,7 @@ fn initialize_config( ); } // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| { + let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| { format!( "Failed to read pageserver config at '{}'", cfg_file_path.display() @@ -178,7 +182,7 @@ fn initialize_config( if update_config { info!("Writing pageserver config to '{}'", cfg_file_path.display()); - std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { + std::fs::write(cfg_file_path, toml.to_string()).with_context(|| { format!( "Failed to write pageserver config to '{}'", cfg_file_path.display() @@ -198,8 +202,12 @@ fn initialize_config( } fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { + // Initialize logging logging::init(conf.log_format)?; + + // Print version to the log, and expose it as a prometheus metric too. info!("version: {}", version()); + set_build_info_metric(GIT_VERSION); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes @@ -215,53 +223,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ) } + // Create and lock PID file. 
This ensures that there cannot be more than one + // pageserver process running at the same time. let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { - lock_file::LockCreationResult::Created { - new_lock_contents, - file, - } => { - info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); - file - } - lock_file::LockCreationResult::AlreadyLocked { - existing_lock_contents, - } => anyhow::bail!( - "Could not lock pid file; pageserver is already running in {:?} with PID {}", - conf.workdir, - existing_lock_contents - ), - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) - } - }; - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone - let _ = Box::leak(Box::new(lock_file)); + let lock_file = + utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("Claimed pid file at {lock_file_path:?}"); - // TODO: Check that it looks like a valid repository before going further + // Ensure that the lock file is held even if the main thread of the process panics. + // We need to release the lock file only when the process exits. + std::mem::forget(lock_file); - // bind sockets before daemonizing so we report errors early and do not return until we are listening - info!( - "Starting pageserver http handler on {}", - conf.listen_http_addr - ); - let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?; + // Bind the HTTP and libpq ports early, so that if they are in use by some other + // process, we error out early. + let http_addr = &conf.listen_http_addr; + info!("Starting pageserver http handler on {http_addr}"); + let http_listener = tcp_listener::bind(http_addr)?; - info!( - "Starting pageserver pg protocol handler on {}", - conf.listen_pg_addr - ); - let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?; + let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); + let pageserver_listener = tcp_listener::bind(pg_addr)?; + // Install signal handlers let signals = signals::install_shutdown_handlers()?; - // start profiler (if enabled) + // Start profiler (if enabled) let profiler_guard = profiling::init_profiler(conf); - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?; + // Launch broker client + WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; - // initialize authentication for incoming connections + // Initialize authentication for incoming connections let auth = match &conf.auth_type { AuthType::Trust | AuthType::MD5 => None, AuthType::NeonJWT => { @@ -272,46 +264,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }; info!("Using auth: {:#?}", conf.auth_type); - match var("ZENITH_AUTH_TOKEN") { - Ok(v) => { + // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration. 
+ match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) { + (old, Ok(v)) => { info!("Loaded JWT token for authentication with Safekeeper"); + if let Ok(v_old) = old { + warn!( + "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated" + ); + if v_old != v { + warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN"); + } + } pageserver::config::SAFEKEEPER_AUTH_TOKEN .set(Arc::new(v)) .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; } - Err(VarError::NotPresent) => { + (Ok(v), _) => { + info!("Loaded JWT token for authentication with Safekeeper"); + warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN"); + pageserver::config::SAFEKEEPER_AUTH_TOKEN + .set(Arc::new(v)) + .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?; + } + (_, Err(VarError::NotPresent)) => { info!("No JWT token for authentication with Safekeeper detected"); } - Err(e) => { + (_, Err(e)) => { return Err(e).with_context(|| { - "Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable" + "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable" }) } }; - let remote_storage = conf - .remote_storage_config - .as_ref() - .map(|storage_config| { - GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config) - }) - .transpose() - .context("Failed to init generic remote storage")?; - { - let _rt_guard = BACKGROUND_RUNTIME.enter(); - tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())? - }; + // Set up remote storage client + let remote_storage = create_remote_storage_client(conf)?; - // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME. - // bind before launching separate thread so the error reported before startup exits + // Scan the local 'tenants/' directory and start loading the tenants + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; - // Create a Service from the router above to handle incoming requests. + // Start up the service to handle HTTP mgmt API request. We created the + // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, auth.clone(), remote_storage)?; - let service = - utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap(); + let router = http::make_router(conf, auth.clone(), remote_storage)? + .build() + .map_err(|err| anyhow!(err))?; + let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) .with_graceful_shutdown(task_mgr::shutdown_watcher()); @@ -328,10 +328,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { Ok(()) }, ); + + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + pageserver::consumption_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + conf.id, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); + } } // Spawn a task to listen for libpq connections. It will spawn further tasks - // for each connection. + // for each connection. We created the listener earlier already. 
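// Condensed sketch (illustrative, not from the patch) of the token precedence implemented
// earlier in this function: NEON_AUTH_TOKEN always wins, ZENITH_AUTH_TOKEN is only a
// deprecated fallback, and "not set at all" is fine. Non-unicode env errors are glossed
// over in this sketch.
fn safekeeper_auth_token() -> Option<String> {
    match (
        std::env::var("ZENITH_AUTH_TOKEN"),
        std::env::var("NEON_AUTH_TOKEN"),
    ) {
        (_, Ok(new)) => Some(new),      // new variable takes priority
        (Ok(old), Err(_)) => Some(old), // deprecated fallback, warned about above
        (Err(_), Err(_)) => None,       // no token configured
    }
}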
task_mgr::spawn( COMPUTE_REQUEST_RUNTIME.handle(), TaskKind::LibpqEndpointListener, @@ -344,8 +365,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }, ); - set_build_info_metric(GIT_VERSION); - // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { Signal::Quit => { @@ -369,6 +388,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { }) } +fn create_remote_storage_client( + conf: &'static PageServerConf, +) -> anyhow::Result> { + let config = if let Some(config) = &conf.remote_storage_config { + config + } else { + // No remote storage configured. + return Ok(None); + }; + + // Create the client + let mut remote_storage = GenericRemoteStorage::from_config(config)?; + + // If `test_remote_failures` is non-zero, wrap the client with a + // wrapper that simulates failures. + if conf.test_remote_failures > 0 { + if !cfg!(feature = "testing") { + anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"); + } + info!( + "Simulating remote failures for first {} attempts of each op", + conf.test_remote_failures + ); + remote_storage = + GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); + } + + Ok(Some(remote_storage)) +} + fn cli() -> Command { Command::new("Neon page server") .about("Materializes WAL stream to pages and serves them to the postgres") diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs index b1484ac45a..9da173c873 100644 --- a/pageserver/src/bin/pageserver_binutils.rs +++ b/pageserver/src/bin/pageserver_binutils.rs @@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> { } fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?; + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; println!("{control_file:?}"); let control_file_initdb = Lsn(control_file.checkPoint); println!( @@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> { } fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(&path)?; + let metadata_bytes = std::fs::read(path)?; let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; @@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an if update_meta { let metadata_bytes = meta.to_bytes()?; - std::fs::write(&path, &metadata_bytes)?; + std::fs::write(path, metadata_bytes)?; } Ok(()) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 1ac07f6ebc..deb79531a4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,12 +5,14 @@ //! See also `settings.md` for better description on every parameter. 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use remote_storage::RemoteStorageConfig; +use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; +use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; use utils::id::ConnectionId; use once_cell::sync::OnceCell; +use reqwest::Url; use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -18,25 +20,29 @@ use std::sync::Arc; use std::time::Duration; use toml_edit; use toml_edit::{Document, Item}; -use url::Url; + use utils::{ id::{NodeId, TenantId, TimelineId}, logging::LogFormat, postgres_backend::AuthType, }; +use crate::tenant::config::TenantConf; +use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; -use crate::tenant_config::{TenantConf, TenantConfOpt}; -use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX}; +use crate::{ + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, +}; pub mod defaults { - use crate::tenant_config::defaults::*; + use crate::tenant::config::defaults::*; use const_format::formatcp; pub use pageserver_api::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; @@ -51,13 +57,14 @@ pub mod defaults { pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; /// /// Default built-in configuration file. /// pub const DEFAULT_CONFIG_FILE: &str = formatcp!( r###" # Initial configuration file created by 'pageserver --init' - #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' @@ -69,10 +76,14 @@ pub mod defaults { # initial superuser role name to use when creating a new tenant #initial_superuser_name = '{DEFAULT_SUPERUSER}' +#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' + #log_format = '{DEFAULT_LOG_FORMAT}' #concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' +#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -130,24 +141,26 @@ pub struct PageServerConf { pub profiling: ProfilingConfig, pub default_tenant_conf: TenantConf, - /// A prefix to add in etcd brokers before every key. - /// Can be used for isolating different pageserver groups within the same etcd cluster. - pub broker_etcd_prefix: String, - - /// Etcd broker endpoints to connect to. - pub broker_endpoints: Vec, + /// Storage broker endpoints to connect to. + pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub log_format: LogFormat, /// Number of concurrent [`Tenant::gather_size_inputs`] allowed. pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore, + + // How often to collect metrics and send them to the metrics endpoint. 
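// Sketch (illustrative, not from the patch) of how the human-readable defaults above
// ("10 min", "60 s", ...) become Durations: the config builder feeds them through
// humantime::parse_duration, e.g.:
fn default_metric_collection_interval() -> std::time::Duration {
    humantime::parse_duration(DEFAULT_METRIC_COLLECTION_INTERVAL)
        .expect("cannot parse default metric collection interval")
}
// For "10 min" this yields Duration::from_secs(600).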
+ pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + + pub test_remote_failures: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged /// and/or serialized at a whim, while the token is secret. Currently this token is the /// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in -/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see -/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic. +/// the future, more tokens and auth may arrive for storage broker, completely changing the logic. /// Hence, we resort to a global variable for now instead of passing the token from the /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); @@ -214,12 +227,17 @@ struct PageServerConfigBuilder { id: BuilderValue, profiling: BuilderValue, - broker_etcd_prefix: BuilderValue, - broker_endpoints: BuilderValue>, + broker_endpoint: BuilderValue, + broker_keepalive_interval: BuilderValue, log_format: BuilderValue, concurrent_tenant_size_logical_size_queries: BuilderValue, + + metric_collection_interval: BuilderValue, + metric_collection_endpoint: BuilderValue>, + + test_remote_failures: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -245,11 +263,23 @@ impl Default for PageServerConfigBuilder { remote_storage_config: Set(None), id: NotSet, profiling: Set(ProfilingConfig::Disabled), - broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()), - broker_endpoints: Set(Vec::new()), + broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: Set(humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()), + metric_collection_interval: Set(humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + + test_remote_failures: Set(0), } } } @@ -306,12 +336,12 @@ impl PageServerConfigBuilder { self.remote_storage_config = BuilderValue::Set(remote_storage_config) } - pub fn broker_endpoints(&mut self, broker_endpoints: Vec) { - self.broker_endpoints = BuilderValue::Set(broker_endpoints) + pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { + self.broker_endpoint = BuilderValue::Set(broker_endpoint) } - pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) { - self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix) + pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { + self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) } pub fn id(&mut self, node_id: NodeId) { @@ -330,11 +360,19 @@ impl PageServerConfigBuilder { self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); } - pub fn build(self) -> anyhow::Result { - let broker_endpoints = self - .broker_endpoints - .ok_or(anyhow!("No broker endpoints provided"))?; + pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { + self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) + } + pub fn 
metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) {
+ self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
+ }
+
+ pub fn test_remote_failures(&mut self, fail_first: u64) {
+ self.test_remote_failures = BuilderValue::Set(fail_first);
+ }
+
+ pub fn build(self) -> anyhow::Result {
 Ok(PageServerConf {
 listen_pg_addr: self
 .listen_pg_addr
@@ -370,16 +408,27 @@ impl PageServerConfigBuilder {
 profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
 // TenantConf is handled separately
 default_tenant_conf: TenantConf::default(),
- broker_endpoints,
- broker_etcd_prefix: self
- .broker_etcd_prefix
- .ok_or(anyhow!("missing broker_etcd_prefix"))?,
+ broker_endpoint: self
+ .broker_endpoint
+ .ok_or(anyhow!("No broker endpoints provided"))?,
+ broker_keepalive_interval: self
+ .broker_keepalive_interval
+ .ok_or(anyhow!("No broker keepalive interval provided"))?,
 log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
 concurrent_tenant_size_logical_size_queries: self
 .concurrent_tenant_size_logical_size_queries
 .ok_or(anyhow!(
 "missing concurrent_tenant_size_logical_size_queries"
 ))?,
+ metric_collection_interval: self
+ .metric_collection_interval
+ .ok_or(anyhow!("missing metric_collection_interval"))?,
+ metric_collection_endpoint: self
+ .metric_collection_endpoint
+ .ok_or(anyhow!("missing metric_collection_endpoint"))?,
+ test_remote_failures: self
+ .test_remote_failures
+ .ok_or(anyhow!("missing test_remote_failures"))?,
 })
 }
 }
@@ -402,6 +451,10 @@ impl PageServerConf {
 .join(TENANT_ATTACHING_MARKER_FILENAME)
 }
 
+ pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
+ self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
+ }
+
 /// Points to a place in pageserver's local directory,
 /// where certain tenant's tenantconf file should be located.
 pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
@@ -450,6 +503,28 @@ impl PageServerConf {
 .join(METADATA_FILE_NAME)
 }
 
+ /// Files on the remote storage are stored with paths relative to the workdir.
+ /// That path includes both the tenant and timeline ids, which makes the remote storage path unique.
+ ///
+ /// Errors if the provided path does not start with pageserver's workdir.
+ pub fn remote_path(&self, local_path: &Path) -> anyhow::Result {
+ local_path
+ .strip_prefix(&self.workdir)
+ .context("Failed to strip workdir prefix")
+ .and_then(RemotePath::new)
+ .with_context(|| {
+ format!(
+ "Failed to resolve remote part of path {:?} for base {:?}",
+ local_path, self.workdir
+ )
+ })
+ }
+
+ /// Turns a file's remote storage path into its local path.
+ pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
+ remote_path.with_base(&self.workdir)
+ }
+
 //
 // Postgres distribution paths
 //
@@ -486,7 +561,7 @@ impl PageServerConf {
 let mut builder = PageServerConfigBuilder::default();
 builder.workdir(workdir.to_owned());
 
- let mut t_conf: TenantConfOpt = Default::default();
+ let mut t_conf = TenantConfOpt::default();
 
 for (key, item) in toml.iter() {
 match key {
@@ -507,24 +582,15 @@ impl PageServerConf {
 )),
 "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
 "remote_storage" => {
- builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
+ builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
} "tenant_config" => { t_conf = Self::parse_toml_tenant_conf(item)?; } "id" => builder.id(NodeId(parse_toml_u64(key, item)?)), "profiling" => builder.profiling(parse_toml_from_str(key, item)?), - "broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?), - "broker_endpoints" => builder.broker_endpoints( - parse_toml_array(key, item)? - .into_iter() - .map(|endpoint_str| { - endpoint_str.parse::().with_context(|| { - format!("Array item {endpoint_str} for key {key} is not a valid url endpoint") - }) - }) - .collect::>()?, - ), + "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), + "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), "log_format" => builder.log_format( LogFormat::from_config(&parse_toml_string(key, item)?)? ), @@ -534,6 +600,13 @@ impl PageServerConf { let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?; ConfigurableSemaphore::new(permits) }), + "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), + "metric_collection_endpoint" => { + let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; + builder.metric_collection_endpoint(Some(endpoint)); + }, + + "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -617,6 +690,12 @@ impl PageServerConf { if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") { t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?); } + if let Some(trace_read_requests) = item.get("trace_read_requests") { + t_conf.trace_read_requests = + Some(trace_read_requests.as_bool().with_context(|| { + "configure option trace_read_requests is not a bool".to_string() + })?); + } Ok(t_conf) } @@ -644,11 +723,14 @@ impl PageServerConf { auth_validation_public_key_path: None, remote_storage_config: None, profiling: ProfilingConfig::Disabled, - default_tenant_conf: TenantConf::dummy_conf(), - broker_endpoints: Vec::new(), - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + default_tenant_conf: TenantConf::default(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5000), log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(60), + metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + test_remote_failures: 0, } } } @@ -698,22 +780,6 @@ where }) } -fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result> { - let array = item - .as_array() - .with_context(|| format!("configure option {name} is not an array"))?; - - array - .iter() - .map(|value| { - value - .as_str() - .map(str::to_string) - .with_context(|| format!("Array item {value:?} for key {name} is not a string")) - }) - .collect() -} - /// Configurable semaphore permits setting. 
/// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -795,6 +861,8 @@ max_file_descriptors = 333 initial_superuser_name = 'zzzz' id = 10 +metric_collection_interval = '222 s' +metric_collection_endpoint = 'http://localhost:80/metrics' log_format = 'json' "#; @@ -803,10 +871,10 @@ log_format = 'json' fn parse_defaults() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; + let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; // we have to create dummy values to overcome the validation errors let config_string = format!( - "pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']", + "pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'", pg_distrib_dir.display() ); let toml = config_string.parse()?; @@ -832,12 +900,17 @@ log_format = 'json' remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), - broker_endpoints: vec![broker_endpoint - .parse() - .expect("Failed to parse a valid broker endpoint URL")], - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL + )?, log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: humantime::parse_duration( + defaults::DEFAULT_METRIC_COLLECTION_INTERVAL + )?, + metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + test_remote_failures: 0, }, "Correct defaults should be used when no config values are provided" ); @@ -849,10 +922,10 @@ log_format = 'json' fn parse_basic_config() -> anyhow::Result<()> { let tempdir = tempdir()?; let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; + let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']", + "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'", pg_distrib_dir.display() ); let toml = config_string.parse()?; @@ -878,12 +951,13 @@ log_format = 'json' remote_storage_config: None, profiling: ProfilingConfig::Disabled, default_tenant_conf: TenantConf::default(), - broker_endpoints: vec![broker_endpoint - .parse() - .expect("Failed to parse a valid broker endpoint URL")], - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), + broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), + broker_keepalive_interval: Duration::from_secs(5), log_format: LogFormat::Json, concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), + metric_collection_interval: Duration::from_secs(222), + metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + test_remote_failures: 0, }, "Should be able to parse all basic config values correctly" ); @@ -915,7 +989,7 @@ local_path = '{}'"#, let config_string = format!( r#"{ALL_BASE_VALUES_TOML} pg_distrib_dir='{}' -broker_endpoints = ['{broker_endpoint}'] +broker_endpoint = '{broker_endpoint}' {remote_storage_config_str}"#, pg_distrib_dir.display(), @@ -982,7 +1056,7 @@ concurrency_limit = {s3_concurrency_limit}"# let config_string = format!( 
r#"{ALL_BASE_VALUES_TOML}
 pg_distrib_dir='{}'
-broker_endpoints = ['{broker_endpoint}']
+broker_endpoint = '{broker_endpoint}'
 
 {remote_storage_config_str}"#,
 pg_distrib_dir.display(),
@@ -1016,6 +1090,35 @@ broker_endpoints = ['{broker_endpoint}']
 Ok(())
 }
 
+ #[test]
+ fn parse_tenant_config() -> anyhow::Result<()> {
+ let tempdir = tempdir()?;
+ let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
+
+ let broker_endpoint = "http://127.0.0.1:7777";
+ let trace_read_requests = true;
+
+ let config_string = format!(
+ r#"{ALL_BASE_VALUES_TOML}
+pg_distrib_dir='{}'
+broker_endpoint = '{broker_endpoint}'
+
+[tenant_config]
+trace_read_requests = {trace_read_requests}"#,
+ pg_distrib_dir.display(),
+ );
+
+ let toml = config_string.parse()?;
+
+ let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
+ assert_eq!(
+ conf.default_tenant_conf.trace_read_requests, trace_read_requests,
+ "Tenant config from pageserver config file should be parsed and updated values used as defaults for all tenants",
+ );
+
+ Ok(())
+ }
+
 fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
 let tempdir_path = tempdir.path();
 
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
new file mode 100644
index 0000000000..c411a9e025
--- /dev/null
+++ b/pageserver/src/consumption_metrics.rs
@@ -0,0 +1,324 @@
+//!
+//! Periodically collect consumption metrics for all active tenants
+//! and push them to an HTTP endpoint.
+//! Cache metrics to send only the updated ones.
+//!
+
+use anyhow;
+use tracing::*;
+use utils::id::NodeId;
+use utils::id::TimelineId;
+
+use crate::task_mgr;
+use crate::tenant::mgr;
+use pageserver_api::models::TenantState;
+use utils::id::TenantId;
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use std::collections::HashMap;
+use std::fmt;
+use std::str::FromStr;
+use std::time::Duration;
+
+use chrono::{DateTime, Utc};
+use rand::Rng;
+use reqwest::Url;
+
+/// ConsumptionMetric struct that defines the format for one metric entry
+/// i.e.
+///
+/// ```json
+/// {
+/// "metric": "remote_storage_size",
+/// "type": "absolute",
+/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
+/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
+/// "time": "2022-12-28T11:07:19.317310284Z",
+/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
+/// "value": 12345454,
+/// }
+/// ```
+#[serde_as]
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct ConsumptionMetric {
+ pub metric: ConsumptionMetricKind,
+ #[serde(rename = "type")]
+ pub metric_type: &'static str,
+ #[serde_as(as = "DisplayFromStr")]
+ pub tenant_id: TenantId,
+ #[serde_as(as = "Option")]
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub timeline_id: Option,
+ pub time: DateTime,
+ pub idempotency_key: String,
+ pub value: u64,
+}
+
+impl ConsumptionMetric {
+ pub fn new_absolute(
+ metric: ConsumptionMetricKind,
+ tenant_id: TenantId,
+ timeline_id: Option,
+ value: u64,
+ node_id: NodeId,
+ rng: &mut R,
+ ) -> Self {
+ Self {
+ metric,
+ metric_type: "absolute",
+ tenant_id,
+ timeline_id,
+ time: Utc::now(),
+ // key that allows metric collector to distinguish unique events
+ idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
+ value,
+ }
+ }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ConsumptionMetricKind {
+ /// Amount of WAL produced by a timeline, i.e. last_record_lsn
+ /// This is an absolute, per-timeline metric.
+ WrittenSize,
+ /// Size of all tenant branches including WAL
+ /// This is an absolute, per-tenant metric.
+ /// This is the same metric that the tenant/tenant_id/size endpoint returns.
+ SyntheticStorageSize,
+ /// Size of all the layer files in the tenant's directory on disk on the pageserver.
+ /// This is an absolute, per-tenant metric.
+ /// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
+ ResidentSize,
+ /// Size of the remote storage (S3) directory.
+ /// This is an absolute, per-tenant metric.
+ RemoteStorageSize, + /// Logical size of the data in the timeline + /// This is an absolute, per-timeline metric + TimelineLogicalSize, +} + +impl FromStr for ConsumptionMetricKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "written_size" => Ok(Self::WrittenSize), + "synthetic_storage_size" => Ok(Self::SyntheticStorageSize), + "resident_size" => Ok(Self::ResidentSize), + "remote_storage_size" => Ok(Self::RemoteStorageSize), + "timeline_logical_size" => Ok(Self::TimelineLogicalSize), + _ => anyhow::bail!("invalid value \"{s}\" for metric type"), + } + } +} + +impl fmt::Display for ConsumptionMetricKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + ConsumptionMetricKind::WrittenSize => "written_size", + ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size", + ConsumptionMetricKind::ResidentSize => "resident_size", + ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size", + ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size", + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ConsumptionMetricsKey { + tenant_id: TenantId, + timeline_id: Option, + metric: ConsumptionMetricKind, +} + +#[derive(serde::Serialize)] +struct EventChunk<'a> { + events: &'a [ConsumptionMetric], +} + +/// Main thread that serves metrics collection +pub async fn collect_metrics( + metric_collection_endpoint: &Url, + metric_collection_interval: Duration, + node_id: NodeId, +) -> anyhow::Result<()> { + let mut ticker = tokio::time::interval(metric_collection_interval); + + info!("starting collect_metrics"); + + // define client here to reuse it for all requests + let client = reqwest::Client::new(); + let mut cached_metrics: HashMap = HashMap::new(); + + loop { + tokio::select! { + _ = task_mgr::shutdown_watcher() => { + info!("collect_metrics received cancellation request"); + return Ok(()); + }, + _ = ticker.tick() => { + collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?; + } + } + } +} + +/// One iteration of metrics collection +/// +/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`. +/// Cache metrics to avoid sending the same metrics multiple times. +pub async fn collect_metrics_task( + client: &reqwest::Client, + cached_metrics: &mut HashMap, + metric_collection_endpoint: &reqwest::Url, + node_id: NodeId, +) -> anyhow::Result<()> { + let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new(); + trace!( + "starting collect_metrics_task. 
metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + // get list of tenants + let tenants = mgr::list_tenants().await; + + // iterate through list of Active tenants and collect metrics + for (tenant_id, tenant_state) in tenants { + if tenant_state != TenantState::Active { + continue; + } + + let tenant = mgr::get_tenant(tenant_id, true).await?; + + let mut tenant_resident_size = 0; + + // iterate through list of timelines in tenant + for timeline in tenant.list_timelines().iter() { + // collect per-timeline metrics only for active timelines + if timeline.is_active() { + let timeline_written_size = u64::from(timeline.get_last_record_lsn()); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::WrittenSize, + }, + timeline_written_size, + )); + + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + // Only send timeline logical size when it is fully calculated. + if is_exact { + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline.timeline_id), + metric: ConsumptionMetricKind::TimelineLogicalSize, + }, + timeline_logical_size, + )); + } + } + + let timeline_resident_size = timeline.get_resident_physical_size(); + tenant_resident_size += timeline_resident_size; + } + + let tenant_remote_size = tenant.get_remote_size().await?; + debug!( + "collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}", + tenant_id, tenant_state, tenant_resident_size, tenant_remote_size + ); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: ConsumptionMetricKind::ResidentSize, + }, + tenant_resident_size, + )); + + current_metrics.push(( + ConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: ConsumptionMetricKind::RemoteStorageSize, + }, + tenant_remote_size, + )); + + // TODO add SyntheticStorageSize metric + } + + // Filter metrics + current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) { + Some(val) => val != curr_val, + None => true, + }); + + if current_metrics.is_empty() { + trace!("no new metrics to send"); + return Ok(()); + } + + // Send metrics. 
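+ // Illustrative note (a sketch of the wire format, assuming the `EventChunk` serialization above):
+ // each POST body is a JSON object of the form
+ // {"events": [{"metric": "written_size", "type": "absolute", "tenant_id": "...", "timeline_id": "...", "time": "...", "idempotency_key": "...", "value": 12345454}, ...]}
+ // where "timeline_id" is omitted for per-tenant metrics and each request carries at most CHUNK_SIZE events.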
+ // Split into chunks of 1000 metrics to avoid exceeding the max request size
+ const CHUNK_SIZE: usize = 1000;
+ let chunks = current_metrics.chunks(CHUNK_SIZE);
+
+ let mut chunk_to_send: Vec = Vec::with_capacity(1000);
+
+ for chunk in chunks {
+ chunk_to_send.clear();
+
+ // this code block is needed to convince the compiler
+ // that rng is not held across an await point
+ {
+ // enrich metrics with timestamp and metric_kind before sending
+ let mut rng = rand::thread_rng();
+ chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
+ ConsumptionMetric::new_absolute(
+ curr_key.metric,
+ curr_key.tenant_id,
+ curr_key.timeline_id,
+ *curr_val,
+ node_id,
+ &mut rng,
+ )
+ }));
+ }
+
+ let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+ events: &chunk_to_send,
+ })
+ .expect("ConsumptionMetric should not fail serialization");
+
+ let res = client
+ .post(metric_collection_endpoint.clone())
+ .json(&chunk_json)
+ .send()
+ .await;
+
+ match res {
+ Ok(res) => {
+ if res.status().is_success() {
+ // update cached metrics after they were sent successfully
+ for (curr_key, curr_val) in chunk.iter() {
+ cached_metrics.insert(curr_key.clone(), *curr_val);
+ }
+ } else {
+ error!("metrics endpoint refused the sent metrics: {:?}", res);
+ }
+ }
+ Err(err) => {
+ error!("failed to send metrics: {:?}", err);
+ }
+ }
+ }
+
+ Ok(())
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index b8f467cd02..f9b8a81dad 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -77,16 +77,6 @@ paths:
 schema:
 type: string
 format: hex
- - name: include-non-incremental-logical-size
- in: query
- schema:
- type: string
- description: Controls calculation of current_logical_size_non_incremental
- - name: include-non-incremental-physical-size
- in: query
- schema:
- type: string
- description: Controls calculation of current_physical_size_non_incremental
 get:
 description: Get timelines for tenant
 responses:
@@ -139,17 +129,6 @@ paths:
 format: hex
 get:
 description: Get info about the timeline
- parameters:
- - name: include-non-incremental-logical-size
- in: query
- schema:
- type: string
- description: Controls calculation of current_logical_size_non_incremental
- - name: include-non-incremental-physical-size
- in: query
- schema:
- type: string
- description: Controls calculation of current_physical_size_non_incremental
 responses:
 "200":
 description: TimelineInfo
@@ -274,6 +253,7 @@ paths:
 schema:
 type: string
 format: hex
+
 post:
 description: Schedules attach operation to happen in the background for given tenant
 responses:
@@ -325,7 +305,9 @@ paths:
 type: string
 format: hex
 post:
- description: Detach local tenant
+ description: |
+ Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
+ Files on the remote storage are not affected.
 responses:
 "200":
 description: Tenant detached
@@ -354,6 +336,92 @@ paths:
 schema:
 $ref: "#/components/schemas/Error"
+ /v1/tenant/{tenant_id}/ignore:
+ parameters:
+ - name: tenant_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: hex
+ post:
+ description: |
+ Remove tenant data (including all corresponding timelines) from pageserver's memory.
+ Files on local disk and remote storage are not affected.
+
+ Future pageserver restarts won't load the data back until `load` is called on such a tenant.
+ responses: + "200": + description: Tenant ignored + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_id}/load: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + format: hex + post: + description: | + Schedules an operation that attempts to load a tenant from the local disk and + synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load. + If the tenant was ignored before, removes the ignore mark and continues with load scheduling. + + Errors if the tenant is absent on disk, already present in memory or fails to schedule its load. + Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness. + responses: + "202": + description: Tenant scheduled to load successfully + "400": + description: Error when no tenant id found in path parameters + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/tenant/{tenant_id}/size: parameters: - name: tenant_id @@ -659,7 +727,6 @@ components: - tenant_id - last_record_lsn - disk_consistent_lsn - - awaits_download - state - latest_gc_cutoff_lsn properties: @@ -691,10 +758,6 @@ components: type: integer current_physical_size: type: integer - current_logical_size_non_incremental: - type: integer - current_physical_size_non_incremental: - type: integer wal_source_connstr: type: string last_received_msg_lsn: @@ -702,44 +765,11 @@ components: format: hex last_received_msg_ts: type: integer - awaits_download: - type: boolean state: type: string latest_gc_cutoff_lsn: type: string format: hex - - # These 'local' and 'remote' fields just duplicate some of the fields - # above. They are kept for backwards-compatibility. They can be removed, - # when the control plane has been updated to look at the above fields - # directly. 
- local: - $ref: "#/components/schemas/LocalTimelineInfo" - remote: - $ref: "#/components/schemas/RemoteTimelineInfo" - - LocalTimelineInfo: - type: object - properties: - ancestor_timeline_id: - type: string - format: hex - ancestor_lsn: - type: string - format: hex - current_logical_size: - type: integer - current_physical_size: - type: integer - RemoteTimelineInfo: - type: object - required: - - remote_consistent_lsn - properties: - remote_consistent_lsn: - type: string - format: hex Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 32f96b3c5c..4f4c397abe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3,19 +3,18 @@ use std::sync::Arc; use anyhow::{anyhow, Context, Result}; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; -use pageserver_api::models::TenantState; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; use tracing::*; use super::models::{ - LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest, - TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, + TimelineCreateRequest, TimelineInfo, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::tenant::Timeline; -use crate::tenant_config::TenantConfOpt; -use crate::{config::PageServerConf, tenant_mgr}; +use crate::tenant::config::TenantConfOpt; +use crate::tenant::{with_ondemand_download, Timeline}; +use crate::{config::PageServerConf, tenant::mgr}; use utils::{ auth::JwtAuth, http::{ @@ -32,8 +31,6 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; -#[cfg(feature = "testing")] -use crate::CheckpointConfig; struct State { conf: &'static PageServerConf, @@ -81,28 +78,28 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res } // Helper function to construct a TimelineInfo struct for a timeline -fn build_timeline_info( - tenant_state: TenantState, +async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, - include_non_incremental_physical_size: bool, ) -> anyhow::Result { - let mut info = build_timeline_info_common(tenant_state, timeline)?; + let mut info = build_timeline_info_common(timeline)?; if include_non_incremental_logical_size { - info.current_logical_size_non_incremental = - Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?); - } - if include_non_incremental_physical_size { - info.current_physical_size_non_incremental = - Some(timeline.get_physical_size_non_incremental()?) + // XXX we should be using spawn_ondemand_logical_size_calculation here. + // Otherwise, if someone deletes the timeline / detaches the tenant while + // we're executing this function, we will outlive the timeline on-disk state. 
+ info.current_logical_size_non_incremental = Some( + timeline + .get_current_logical_size_non_incremental( + info.last_record_lsn, + CancellationToken::new(), + ) + .await?, + ); } Ok(info) } -fn build_timeline_info_common( - tenant_state: TenantState, - timeline: &Arc, -) -> anyhow::Result { +fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -123,13 +120,13 @@ fn build_timeline_info_common( lsn @ Lsn(_) => Some(lsn), }; let current_logical_size = match timeline.get_current_logical_size() { - Ok(size) => Some(size), + Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); None } }; - let current_physical_size = Some(timeline.get_physical_size()); + let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok()); let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); @@ -146,29 +143,13 @@ fn build_timeline_info_common( current_logical_size, current_physical_size, current_logical_size_non_incremental: None, - current_physical_size_non_incremental: None, + timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, last_received_msg_ts, pg_version: timeline.pg_version, state, - - // XXX bring back tracking of downloads per timeline, or, introduce - // an 'Attaching' state for the timeline and get rid of this field. - awaits_download: tenant_state == TenantState::Attaching, - - // Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility - // with the control plane. - local: LocalTimelineInfo { - ancestor_timeline_id, - ancestor_lsn, - current_logical_size, - current_physical_size, - }, - remote: RemoteTimelineInfo { - remote_consistent_lsn: Some(remote_consistent_lsn), - }, }; Ok(info) } @@ -189,7 +170,9 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -213,30 +196,30 @@ async fn timeline_list_handler(request: Request) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size = query_param_present(&request, "include-non-incremental-logical-size"); - let include_non_incremental_physical_size = - query_param_present(&request, "include-non-incremental-physical-size"); check_permission(&request, Some(tenant_id))?; - let _entered = info_span!("timeline_list", tenant = %tenant_id).entered(); + let response_data = async { + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timelines = tenant.list_timelines(); - let (tenant_state, timelines) = { - let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?; - (tenant.current_state(), tenant.list_timelines()) - }; + let mut response_data = Vec::with_capacity(timelines.len()); + for timeline in timelines { + let timeline_info = + build_timeline_info(&timeline, include_non_incremental_logical_size) + .await + .context( + "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", + ) + .map_err(ApiError::InternalServerError)?; - let mut response_data = Vec::with_capacity(timelines.len()); - for timeline in timelines { - let timeline_info = build_timeline_info( - tenant_state, - &timeline, - include_non_incremental_logical_size, - include_non_incremental_physical_size, - ) - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") - .map_err(ApiError::InternalServerError)?; - - response_data.push(timeline_info); + response_data.push(timeline_info); + } + Ok(response_data) } + .instrument(info_span!("timeline_list", tenant = %tenant_id)) + .await?; json_response(StatusCode::OK, response_data) } @@ -276,31 +259,21 @@ async fn timeline_detail_handler(request: Request) -> Result(timeline_info) } @@ -321,13 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result format!("{lsn}"), LsnForTimestamp::Future(_lsn) => "future".into(), LsnForTimestamp::Past(_lsn) => "past".into(), @@ -347,13 +322,13 @@ async fn tenant_attach_handler(request: Request) -> Result, if let Some(remote_storage) = &state.remote_storage { // FIXME: distinguish between "Tenant already exists" and other errors - tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await .map_err(ApiError::InternalServerError)?; } else { return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is possible because pageserver was configured without remote storage" + "attach_tenant is not possible because pageserver was configured without remote storage" ))); } @@ -365,7 +340,7 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, let state = get_state(&request); let conf = state.conf; - tenant_mgr::detach_tenant(conf, tenant_id) + mgr::detach_tenant(conf, tenant_id) .instrument(info_span!("tenant_detach", tenant = %tenant_id)) .await // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors. 
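A hypothetical client-side sketch of driving the ignore/load endpoints added in the following hunk (reqwest-based; the listen address and tenant id are placeholders, and deployments with auth enabled would also need a token):

// Placeholder values only; see the route table and OpenAPI descriptions for the real contract.
use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
    let client = reqwest::Client::new();
    let base = "http://127.0.0.1:9898"; // placeholder pageserver HTTP listen address
    let tenant = "5d07d9ce9237c4cd845ea7918c0afa7d"; // placeholder tenant id

    // Drop the tenant from pageserver memory; local disk and remote storage stay intact.
    client
        .post(format!("{base}/v1/tenant/{tenant}/ignore"))
        .send()
        .await?
        .error_for_status()?;

    // Schedule it to be loaded back; this clears the ignore mark and repeats the
    // restart-time load logic (returns 202 Accepted, so poll tenant status afterwards).
    client
        .post(format!("{base}/v1/tenant/{tenant}/load"))
        .send()
        .await?
        .error_for_status()?;

    Ok(())
}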
@@ -392,23 +367,49 @@ async fn tenant_detach_handler(request: Request) -> Result,
 json_response(StatusCode::OK, ())
 }
 
+async fn tenant_load_handler(request: Request) -> Result, ApiError> {
+ let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+ check_permission(&request, Some(tenant_id))?;
+
+ let state = get_state(&request);
+ mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
+ .instrument(info_span!("load", tenant = %tenant_id))
+ .await
+ .map_err(ApiError::InternalServerError)?;
+
+ json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn tenant_ignore_handler(request: Request) -> Result, ApiError> {
+ let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+ check_permission(&request, Some(tenant_id))?;
+
+ let state = get_state(&request);
+ let conf = state.conf;
+ mgr::ignore_tenant(conf, tenant_id)
+ .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
+ .await
+ // FIXME: Errors from `ignore_tenant` can be caused by both user and internal errors.
+ // Replace this with better handling once the error type permits it.
+ .map_err(ApiError::InternalServerError)?;
+
+ json_response(StatusCode::OK, ())
+}
+
 async fn tenant_list_handler(request: Request) -> Result, ApiError> {
 check_permission(&request, None)?;
 
- let response_data = tokio::task::spawn_blocking(move || {
- let _enter = info_span!("tenant_list").entered();
- tenant_mgr::list_tenants()
- .iter()
- .map(|(id, state)| TenantInfo {
- id: *id,
- state: *state,
- current_physical_size: None,
- has_in_progress_downloads: Some(state.has_in_progress_downloads()),
- })
- .collect::>()
- })
- .await
- .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
+ let response_data = mgr::list_tenants()
+ .instrument(info_span!("tenant_list"))
+ .await
+ .iter()
+ .map(|(id, state)| TenantInfo {
+ id: *id,
+ state: *state,
+ current_physical_size: None,
+ has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+ })
+ .collect::>();
 json_response(StatusCode::OK, response_data)
 }
@@ -417,28 +418,25 @@ async fn tenant_status(request: Request) -> Result, ApiErro
 let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
 check_permission(&request, Some(tenant_id))?;
 
- let tenant_info = tokio::task::spawn_blocking(move || {
- let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
- let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
+ let tenant_info = async {
+ let tenant = mgr::get_tenant(tenant_id, false).await?;
 
 // Calculate total physical size of all timelines
 let mut current_physical_size = 0;
 for timeline in tenant.list_timelines().iter() {
- current_physical_size += timeline.get_physical_size();
+ current_physical_size += timeline.layer_size_sum().approximate_is_ok();
 }
 
 let state = tenant.current_state();
- let tenant_info = TenantInfo {
+ Ok(TenantInfo {
 id: tenant_id,
 state,
 current_physical_size: Some(current_physical_size),
 has_in_progress_downloads: Some(state.has_in_progress_downloads()),
- };
-
- Ok::<_, anyhow::Error>(tenant_info)
- })
+ })
+ }
+ .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
+ .await
- .map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
.map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, tenant_info) @@ -448,7 +446,9 @@ async fn tenant_size_handler(request: Request) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?; + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::InternalServerError)?; // this can be long operation, it currently is not backed by any request coalescing or similar let inputs = tenant @@ -565,22 +565,19 @@ async fn tenant_create_handler(mut request: Request) -> Result { @@ -671,17 +668,13 @@ async fn tenant_config_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + match timeline.spawn_download_all_remote_layers().await { + Ok(st) => json_response(StatusCode::ACCEPTED, st), + Err(st) => json_response(StatusCode::CONFLICT, st), + } +} + +async fn timeline_download_remote_layers_handler_get( + request: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_id))?; + + let tenant = mgr::get_tenant(tenant_id, true) + .await + .map_err(ApiError::NotFound)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(ApiError::NotFound)?; + let info = timeline + .get_download_all_remote_layers_task_info() + .context("task never started since last pageserver process start") + .map_err(ApiError::NotFound)?; + json_response(StatusCode::OK, info) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -838,6 +878,8 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler) .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler) .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler) + .post("/v1/tenant/:tenant_id/load", tenant_load_handler) + .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_detail_handler, @@ -858,6 +900,14 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", testing_api!("run timeline checkpoint", timeline_checkpoint_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_post, + ) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + timeline_download_remote_layers_handler_get, + ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id", timeline_delete_handler, diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 642e41765b..588b92c13f 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -187,13 +187,13 @@ fn import_slru( path: &Path, mut reader: Reader, len: usize, -) -> Result<()> { - trace!("importing slru file {}", path.display()); +) -> anyhow::Result<()> { + info!("importing slru file {path:?}"); let mut buf: 
[u8; 8192] = [0u8; 8192];
 
 let filename = &path
 .file_name()
- .expect("missing slru filename")
+ .with_context(|| format!("missing slru filename for path {path:?}"))?
 .to_string_lossy();
 let segno = u32::from_str_radix(filename, 16)?;
@@ -237,14 +237,19 @@ fn import_slru(
 
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
+fn import_wal(
+ walpath: &Path,
+ tline: &Timeline,
+ startpoint: Lsn,
+ endpoint: Lsn,
+) -> anyhow::Result<()> {
 let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
 
 let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
 let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
 let mut last_lsn = startpoint;
 
- let mut walingest = WalIngest::new(tline, startpoint)?;
+ let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
 
 while last_lsn <= endpoint {
 // FIXME: assume postgresql tli 1 for now
@@ -267,7 +272,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
 }
 
 let nread = file.read_to_end(&mut buf)?;
- if nread != WAL_SEGMENT_SIZE - offset as usize {
+ if nread != WAL_SEGMENT_SIZE - offset {
 // Maybe allow this for .partial files?
 error!("read only {} bytes from WAL file", nread);
 }
@@ -279,7 +284,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
 let mut decoded = DecodedWALRecord::default();
 while last_lsn <= endpoint {
 if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
- walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+ walingest
+ .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+ .no_ondemand_download()?;
 
 last_lsn = lsn;
 
 nrecords += 1;
@@ -360,7 +367,7 @@ pub fn import_wal_from_tar(
 let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
 let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
 let mut last_lsn = start_lsn;
- let mut walingest = WalIngest::new(tline, start_lsn)?;
+ let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
 
 // Ingest wal until end_lsn
 info!("importing wal until {}", end_lsn);
@@ -405,7 +412,9 @@ pub fn import_wal_from_tar(
 let mut decoded = DecodedWALRecord::default();
 while last_lsn <= end_lsn {
 if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
- walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
+ walingest
+ .ingest_record(recdata, lsn, &mut modification, &mut decoded)
+ .no_ondemand_download()?;
 
 last_lsn = lsn;
 
 debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -440,16 +449,22 @@ fn import_file(
 reader: Reader,
 len: usize,
 ) -> Result> {
+ let file_name = match file_path.file_name() {
+ Some(name) => name.to_string_lossy(),
+ None => return Ok(None),
+ };
+
+ if file_name.starts_with('.') {
+ // tar archives on macOS created without the COPYFILE_DISABLE=1 env var
+ // will contain "fork files"; skip them.
+ return Ok(None); + } + if file_path.starts_with("global") { let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; let dbnode = 0; - match file_path - .file_name() - .expect("missing filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_control" => { let bytes = read_all_bytes(reader)?; @@ -485,12 +500,7 @@ fn import_file( .to_string_lossy() .parse()?; - match file_path - .file_name() - .expect("missing base filename") - .to_string_lossy() - .as_ref() - { + match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader)?; modification.put_relmap_file(spcnode, dbnode, bytes)?; @@ -520,11 +530,7 @@ fn import_file( import_slru(modification, slru, file_path, reader, len)?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { - let file_name = &file_path - .file_name() - .expect("missing twophase filename") - .to_string_lossy(); - let xid = u32::from_str_radix(file_name, 16)?; + let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader)?; modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5147bd26bb..2f78c199b9 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,6 +1,7 @@ mod auth; pub mod basebackup; pub mod config; +pub mod consumption_metrics; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -10,13 +11,8 @@ pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; pub mod repository; -pub mod storage_sync2; -pub use storage_sync2 as storage_sync; pub mod task_mgr; pub mod tenant; -pub mod tenant_config; -pub mod tenant_mgr; -pub mod tenant_tasks; pub mod trace; pub mod virtual_file; pub mod walingest; @@ -26,9 +22,8 @@ pub mod walredo; use std::path::Path; -use tracing::info; - use crate::task_mgr::TaskKind; +use tracing::info; /// Current storage format version /// @@ -47,15 +42,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); -/// Config for the Repository checkpointer -#[derive(Debug, Clone, Copy)] -pub enum CheckpointConfig { - // Flush all in-memory data - Flush, - // Flush all in-memory data and reconstruct all page images - Forced, -} - pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -66,7 +52,7 @@ pub async fn shutdown_pageserver(exit_code: i32) { // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. - tenant_mgr::shutdown_all_tenants().await; + tenant::mgr::shutdown_all_tenants().await; // Stop syncing with remote storage. // @@ -99,7 +85,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { } } -fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { if n == 0 { 0.0 } else { @@ -125,6 +111,13 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp"; /// Full path: `tenants//timelines/___uninit`. pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +/// A marker file to prevent pageserver from loading a certain tenant on restart. 
+/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
+/// `ignore` management API command, which expects the ignored tenant to be properly loaded
+/// into pageserver's memory before being ignored.
+/// Full path: `tenants//___ignored_tenant`.
+pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
+
 pub fn is_temporary(path: &Path) -> bool {
 match path.file_name() {
 Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 454ff01f0e..205ee0ffad 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| {
 .expect("failed to define a metric")
 });
 
-// Metrics for determining timeline's physical size.
-// A layered timeline's physical is defined as the total size of
-// (delta/image) layer files on disk.
-static CURRENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
+static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
 register_uint_gauge_vec!(
- "pageserver_current_physical_size",
- "Current physical size grouped by timeline",
+ "pageserver_resident_physical_size",
+ "The size of the layer files present in the pageserver's filesystem.",
+ &["tenant_id", "timeline_id"]
+ )
+ .expect("failed to define a metric")
+});
+
+static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| {
+ register_uint_gauge_vec!(
+ "pageserver_remote_physical_size",
+ "The size of the layer files present in the remote storage that are listed in the remote index_part.json.",
+ // Corollary: If any files are missing from the index part, they won't be included here.
 &["tenant_id", "timeline_id"]
 )
 .expect("failed to define a metric")
@@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 1.0, // 1 sec
 ];
 
-const STORAGE_IO_TIME_OPERATIONS: &[&str] =
- &["open", "close", "read", "write", "seek", "fsync", "gc"];
+const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
+ "open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
+];
 
 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
 
@@ -201,7 +209,7 @@ pub static NUM_ONDISK_LAYERS: Lazy = Lazy::new(|| {
 
 // remote storage metrics
 
-pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| {
+static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|| {
 register_int_gauge_vec!(
 "pageserver_remote_upload_queue_unfinished_tasks",
 "Number of tasks in the upload queue that are not finished yet.",
@@ -210,14 +218,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy = Lazy::new(|
 .expect("failed to define a metric")
 });
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
 Upload,
 Download,
 Delete,
 }
 impl RemoteOpKind {
- pub fn as_str(&self) -> &str {
+ pub fn as_str(&self) -> &'static str {
 match self {
 Self::Upload => "upload",
 Self::Download => "download",
@@ -226,13 +234,13 @@ impl RemoteOpKind {
 }
 }
 
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub enum RemoteOpFileKind {
 Layer,
 Index,
 }
 impl RemoteOpFileKind {
- pub fn as_str(&self) -> &str {
+ pub fn as_str(&self) -> &'static str {
 match self {
 Self::Layer => "layer",
 Self::Index => "index",
@@ -365,7 +373,7 @@ pub struct TimelineMetrics {
 pub load_layer_map_histo: Histogram,
 pub last_record_gauge: IntGauge,
 pub wait_lsn_time_histo: Histogram,
- pub current_physical_size_gauge: UIntGauge,
+ pub resident_physical_size_gauge: UIntGauge,
 /// copy of LayeredTimeline.current_logical_size
 pub
current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -406,7 +414,7 @@ impl TimelineMetrics { let wait_lsn_time_histo = WAIT_LSN_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE + let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); let current_logical_size_gauge = CURRENT_LOGICAL_SIZE @@ -432,7 +440,7 @@ impl TimelineMetrics { load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, - current_physical_size_gauge, + resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, persistent_bytes_written, @@ -448,7 +456,7 @@ impl Drop for TimelineMetrics { let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); - let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); @@ -491,10 +499,114 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) { use futures::Future; use pin_project_lite::pin_project; +use std::collections::HashMap; use std::pin::Pin; +use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::Instant; +pub struct RemoteTimelineClientMetrics { + tenant_id: String, + timeline_id: String, + remote_physical_size_gauge: Mutex>, + remote_operation_time: Mutex>, + unfinished_tasks: Mutex>, +} + +impl RemoteTimelineClientMetrics { + pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { + RemoteTimelineClientMetrics { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + remote_operation_time: Mutex::new(HashMap::default()), + unfinished_tasks: Mutex::new(HashMap::default()), + remote_physical_size_gauge: Mutex::new(None), + } + } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { + let mut guard = self.remote_physical_size_gauge.lock().unwrap(); + guard + .get_or_insert_with(|| { + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + ]) + .unwrap() + }) + .clone() + } + pub fn remote_operation_time( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + status: &'static str, + ) -> Histogram { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.remote_operation_time.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str(), status); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_OPERATION_TIME + .get_metric_with_label_values(&[ + &self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + key.2, + ]) + .unwrap() + }); + metric.clone() + } + pub fn unfinished_tasks( + &self, + file_kind: &RemoteOpFileKind, + op_kind: &RemoteOpKind, + ) -> IntGauge { + // XXX would be nice to have an upgradable RwLock + let mut guard = self.unfinished_tasks.lock().unwrap(); + let key = (file_kind.as_str(), op_kind.as_str()); + let metric = guard.entry(key).or_insert_with(move || { + REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS + .get_metric_with_label_values(&[ + 
&self.tenant_id.to_string(), + &self.timeline_id.to_string(), + key.0, + key.1, + ]) + .unwrap() + }); + metric.clone() + } +} + +impl Drop for RemoteTimelineClientMetrics { + fn drop(&mut self) { + let RemoteTimelineClientMetrics { + tenant_id, + timeline_id, + remote_physical_size_gauge, + remote_operation_time, + unfinished_tasks, + } = self; + for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { + let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); + } + for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() { + let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[ + tenant_id, + timeline_id, + a, + b, + ]); + } + { + let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above + let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } + } +} + /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. pub trait MeasureRemoteOp: Sized { @@ -504,6 +616,7 @@ pub trait MeasureRemoteOp: Sized { timeline_id: TimelineId, file_kind: RemoteOpFileKind, op: RemoteOpKind, + metrics: Arc, ) -> MeasuredRemoteOp { let start = Instant::now(); MeasuredRemoteOp { @@ -513,6 +626,7 @@ pub trait MeasureRemoteOp: Sized { file_kind, op, start, + metrics, } } } @@ -529,6 +643,7 @@ pin_project! { file_kind: RemoteOpFileKind, op: RemoteOpKind, start: Instant, + metrics: Arc, } } @@ -541,15 +656,8 @@ impl>, O, E> Future for MeasuredRemoteOp { if let Poll::Ready(ref res) = poll_result { let duration = this.start.elapsed(); let status = if res.is_ok() { &"success" } else { &"failure" }; - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[ - &this.tenant_id.to_string(), - &this.timeline_id.to_string(), - this.file_kind.as_str(), - this.op.as_str(), - status, - ]) - .unwrap() + this.metrics + .remote_operation_time(this.file_kind, this.op, status) .observe(duration.as_secs_f64()); } poll_result diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fe52124805..b84b2694f4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -48,10 +48,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::mgr; use crate::tenant::{Tenant, Timeline}; -use crate::tenant_mgr; use crate::trace::Tracer; -use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -445,9 +444,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream)); - tokio::task::block_in_place(|| { - import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn) - })?; + tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?; info!("wal import complete"); // Drain the rest of the Copy data @@ -466,7 +463,7 @@ impl PageServerHandler { // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. 
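Before continuing with the page_service hunk, a minimal sketch of the measure-then-label idea that the `MeasuredRemoteOp` change above routes through `RemoteTimelineClientMetrics`. This is not the real wrapper (which uses pin_project); it is a plain async helper, and `observe` is a hypothetical stand-in for the histogram handle, assuming the tokio runtime already used throughout this patch.

use std::time::{Duration, Instant};

// Hypothetical stand-in for a metrics sink such as
// remote_operation_time(file_kind, op, status).observe(secs).
fn observe(op: &str, status: &str, elapsed: Duration) {
    println!("{op} {status} took {elapsed:?}");
}

// Time any fallible future and label the outcome "success"/"failure",
// analogous to what the MeasuredRemoteOp future does on Poll::Ready.
async fn measured<T, E, F>(op: &'static str, fut: F) -> Result<T, E>
where
    F: std::future::Future<Output = Result<T, E>>,
{
    let start = Instant::now();
    let res = fut.await;
    let status = if res.is_ok() { "success" } else { "failure" };
    observe(op, status, start.elapsed());
    res
}

#[tokio::main]
async fn main() {
    let ok: Result<u32, String> = measured("upload", async { Ok(42) }).await;
    assert_eq!(ok, Ok(42));
}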
info!("flushing layers"); - timeline.checkpoint(CheckpointConfig::Flush).await?; + timeline.freeze_and_flush().await?; info!("done"); Ok(()) @@ -542,7 +539,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?; + let exists = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_exists(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, @@ -559,7 +559,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?; + let n_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_rel_size(req.rel, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -576,9 +579,10 @@ impl PageServerHandler { let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) .await?; - let total_blocks = - timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?; - + let total_blocks = crate::tenant::with_ondemand_download(|| { + timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + }) + .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse { @@ -604,11 +608,14 @@ impl PageServerHandler { } */ - // FIXME: this profiling now happens at different place than it used to. The - // current profiling is based on a thread-local variable, so it doesn't work - // across awaits - let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); - let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?; + let page = crate::tenant::with_ondemand_download(|| { + // FIXME: this profiling now happens at different place than it used to. The + // current profiling is based on a thread-local variable, so it doesn't work + // across awaits + let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests); + timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + }) + .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, @@ -649,7 +656,7 @@ impl PageServerHandler { tokio::task::block_in_place(|| { let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?; - tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str()); + tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str()); basebackup.send_tarball() })?; pgb.write_message(&BeMessage::CopyDone)?; @@ -941,7 +948,7 @@ impl postgres_backend_async::Handler for PageServerHandler { /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result> { - let tenant = tenant_mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_id, false).await?; match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { Ok(wait_result) => wait_result // no .context(), the error message is good enough and some tests depend on it diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0e334a63df..82b1576145 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,11 +6,12 @@ //! 
walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! +use super::tenant::PageReconstructResult; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; -use crate::tenant::Timeline; +use crate::tenant::{with_ondemand_download, Timeline}; use crate::walrecord::NeonWalRecord; -use anyhow::{bail, ensure, Result}; +use crate::{repository::*, try_no_ondemand_download}; +use anyhow::Context; use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::Range; +use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -33,6 +35,14 @@ pub enum LsnForTimestamp { NoData(Lsn), } +#[derive(Debug, thiserror::Error)] +pub enum CalculateLogicalSizeError { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + /// /// This impl provides all the functionality to store PostgreSQL relations, SLRUs, /// and other special kinds of files, in a versioned key-value store. The @@ -88,16 +98,18 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, - ) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } - let nblocks = self.get_rel_size(tag, lsn, latest)?; + let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest)); if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, lsn, nblocks ); - return Ok(ZERO_PAGE.clone()); + return PageReconstructResult::Success(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); @@ -105,38 +117,51 @@ impl Timeline { } // Get size of a database in blocks - pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result { + pub fn get_db_size( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn)?; + let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn)); for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest)?; + let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest)); total_blocks += n_blocks as usize; } - Ok(total_blocks) + PageReconstructResult::Success(total_blocks) } /// Get size of a relation file - pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_size( + &self, + tag: RelTag, + lsn: Lsn, + latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(nblocks); + return PageReconstructResult::Success(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest)? + && !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest)) { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, // without extending it. 
Tolerate that by claiming that // any non-existent FSM fork has size 0. - return Ok(0); + return PageReconstructResult::Success(0); } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn)?; + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); let nblocks = buf.get_u32_le(); if latest { @@ -149,43 +174,62 @@ impl Timeline { // associated with most recent value of LSN. self.update_cached_rel_size(tag, lsn, nblocks); } - Ok(nblocks) + PageReconstructResult::Success(nblocks) } /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result { - ensure!(tag.relnode != 0, "invalid relnode"); + pub fn get_rel_exists( + &self, + tag: RelTag, + lsn: Lsn, + _latest: bool, + ) -> PageReconstructResult { + if tag.relnode == 0 { + return PageReconstructResult::from(anyhow::anyhow!("invalid relnode")); + } // first try to lookup relation in cache if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { - return Ok(true); + return PageReconstructResult::Success(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - Ok(exists) + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + pub fn list_rels( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); + match RelDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); - Ok(rels) + PageReconstructResult::Success(rels) + } + Err(e) => PageReconstructResult::from(e), + } } /// Look up given SLRU page version. 
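The hunks in pgdatadir_mapping.rs convert reads to return `PageReconstructResult` and thread it through `try_no_ondemand_download!`, which unwraps the success case and propagates everything else to the caller. The real enum and macro live in `pageserver/src/tenant/timeline.rs` and are not shown in this diff, so the following is a hypothetical, simplified stand-in that only illustrates the early-return shape; variant names and payloads may differ from the real code.

// Hypothetical, simplified stand-in for the real PageReconstructResult.
enum PageReconstructResult<T> {
    Success(T),
    NeedsDownload, // caller must run the on-demand download path and retry
    Error(String),
}

impl<T> From<String> for PageReconstructResult<T> {
    fn from(e: String) -> Self {
        PageReconstructResult::Error(e)
    }
}

// Early-return counterpart of `?` for the enum above, mirroring how
// try_no_ondemand_download! is used in the hunks in this file.
macro_rules! try_no_ondemand_download {
    ($e:expr) => {
        match $e {
            PageReconstructResult::Success(v) => v,
            PageReconstructResult::NeedsDownload => return PageReconstructResult::NeedsDownload,
            PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
        }
    };
}

fn get_size() -> PageReconstructResult<u32> {
    PageReconstructResult::Success(8)
}

fn get_db_size() -> PageReconstructResult<u64> {
    // Unwraps the inner value, or propagates NeedsDownload / Error to the caller.
    let blocks = try_no_ondemand_download!(get_size());
    PageReconstructResult::Success(blocks as u64 * 8192)
}

fn main() {
    match get_db_size() {
        PageReconstructResult::Success(bytes) => println!("{bytes} bytes"),
        _ => println!("needs download or failed"),
    }
}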
@@ -195,7 +239,7 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, - ) -> Result { + ) -> PageReconstructResult { let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn) } @@ -206,21 +250,30 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, - ) -> Result { + ) -> PageReconstructResult { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn)?; - Ok(buf.get_u32_le()) + let mut buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf.get_u32_le()) } /// Get size of an SLRU segment - pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + pub fn get_slru_segment_exists( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> PageReconstructResult { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(key, lsn)); - let exists = dir.segments.get(&segno).is_some(); - Ok(exists) + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => { + let exists = dir.segments.get(&segno).is_some(); + PageReconstructResult::Success(exists) + } + Err(e) => PageReconstructResult::from(e), + } } /// Locate LSN, such that all transactions that committed before @@ -230,7 +283,10 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result { + pub fn find_lsn_for_timestamp( + &self, + search_timestamp: TimestampTz, + ) -> PageReconstructResult { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; let max_lsn = self.get_last_record_lsn(); @@ -246,12 +302,12 @@ impl Timeline { // cannot overflow, high and low are both smaller than u64::MAX / 2 let mid = (high + low) / 2; - let cmp = self.is_latest_commit_timestamp_ge_than( + let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than( search_timestamp, Lsn(mid * 8), &mut found_smaller, &mut found_larger, - )?; + )); if cmp { high = mid; @@ -263,15 +319,15 @@ impl Timeline { (false, false) => { // This can happen if no commit records have been processed yet, e.g. // just after importing a cluster. - Ok(LsnForTimestamp::NoData(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn)) } (true, false) => { // Didn't find any commit timestamps larger than the request - Ok(LsnForTimestamp::Future(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn)) } (false, true) => { // Didn't find any commit timestamps smaller than the request - Ok(LsnForTimestamp::Past(max_lsn)) + PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn)) } (true, true) => { // low is the LSN of the first commit record *after* the search_timestamp, @@ -281,7 +337,7 @@ impl Timeline { // Otherwise, if you restore to the returned LSN, the database will // include physical changes from later commits that will be marked // as aborted, and will need to be vacuumed away. - Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8))) + PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8))) } } } @@ -299,12 +355,20 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, - ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? 
{ - let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?; + ) -> PageReconstructResult { + for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) { + let nblocks = try_no_ondemand_download!(self.get_slru_segment_size( + SlruKind::Clog, + segno, + probe_lsn + )); for blknum in (0..nblocks).rev() { - let clog_page = - self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?; + let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn( + SlruKind::Clog, + segno, + blknum, + probe_lsn + )); if clog_page.len() == BLCKSZ as usize + 8 { let mut timestamp_bytes = [0u8; 8]; @@ -313,61 +377,75 @@ impl Timeline { if timestamp >= search_timestamp { *found_larger = true; - return Ok(true); + return PageReconstructResult::Success(true); } else { *found_smaller = true; } } } } - Ok(false) + PageReconstructResult::Success(false) } /// Get a list of SLRU segments - pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + pub fn list_slru_segments( + &self, + kind: SlruKind, + lsn: Lsn, + ) -> PageReconstructResult> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; - - Ok(dir.segments) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + match SlruSegmentDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.segments), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + pub fn get_relmap_file( + &self, + spcnode: Oid, + dbnode: Oid, + lsn: Lsn, + ) -> PageReconstructResult { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn)?; - let dir = DbDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn)); - Ok(dir.dbdirs) + match DbDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.dbdirs), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult { let key = twophase_file_key(xid); - let buf = self.get(key, lsn)?; - Ok(buf) + let buf = try_no_ondemand_download!(self.get(key, lsn)); + PageReconstructResult::Success(buf) } - pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let dir = TwoPhaseDirectory::des(&buf)?; + let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn)); - Ok(dir.xids) + match TwoPhaseDirectory::des(&buf).context("deserialization failure") { + Ok(dir) => PageReconstructResult::Success(dir.xids), + Err(e) => PageReconstructResult::from(e), + } } - pub fn get_control_file(&self, lsn: Lsn) -> Result { + pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult { self.get(CONTROLFILE_KEY, lsn) } - pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult { self.get(CHECKPOINT_KEY, lsn) } @@ -376,16 +454,26 @@ 
impl Timeline { /// /// Only relation blocks are counted currently. That excludes metadata, /// SLRUs, twophase files etc. - pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + pub async fn get_current_logical_size_non_incremental( + &self, + lsn: Lsn, + cancel: CancellationToken, + ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn)? { + for rel in + crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn)) + .await? + { + if cancel.is_cancelled() { + return Err(CalculateLogicalSizeError::Cancelled); + } let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -398,7 +486,7 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub fn collect_keyspace(&self, lsn: Lsn) -> Result { + pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -406,8 +494,8 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn)?; - let dbdir = DbDirectory::des(&buf)?; + let buf = self.get_download(DBDIR_KEY, lsn).await?; + let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); @@ -415,15 +503,15 @@ impl Timeline { result.add_key(relmap_file_key(spcnode, dbnode)); result.add_key(rel_dir_to_key(spcnode, dbnode)); - let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn)? - .iter() - .cloned() - .collect(); + let mut rels: Vec = + with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn)) + .await? 
+ .into_iter() + .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn)?; + let mut buf = self.get_download(relsize_key, lsn).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -439,13 +527,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn)?; - let dir = SlruSegmentDirectory::des(&buf)?; + let buf = self.get_download(slrudir_key, lsn).await?; + let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn)?; + let mut buf = self.get_download(segsize_key, lsn).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -457,8 +545,8 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn)?; - let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?; + let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); for xid in xids { @@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> { /// /// This inserts the directory metadata entries that are assumed to /// always exist. - pub fn init_empty(&mut self) -> Result<()> { + pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; @@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + ) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) } @@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { + ) -> anyhow::Result<()> { self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); Ok(()) } /// Store a relmapper file (pg_filenode.map) in the repository - pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); - if r == None || r == Some(false) { + if r.is_none() || r == Some(false) { // The dbdir entry didn't exist, or it contained a // 'false'. 
The 'insert' call already updated it with // 'true', now write the updated 'dbdirs' map back. let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } - if r == None { + if r.is_none() { // Create RelDirectory let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), @@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { - bail!("twophase file for xid {} already exists", xid); + anyhow::bail!("twophase file for xid {} already exists", xid); } self.put( TWOPHASEDIR_KEY, @@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } - pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?; + let total_blocks = self + .tline + .get_db_size(spcnode, dbnode, req_lsn, true) + .no_ondemand_download()?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY)?; + let buf = self.get(DBDIR_KEY).no_ondemand_download()?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> { /// Create a relation fork. /// /// 'nblocks' is the initial size. - pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key)?)? + RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)? 
}; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - bail!("rel {} already exists", rel); + anyhow::bail!("rel {rel} already exists"); } self.put( rel_dir_key, @@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> { } /// Truncate relation - pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true)? { + if self + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> { /// Extend relation /// If new size is smaller, do nothing. - pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
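The `DatadirModification` methods in this region route every metadata read through `self.get(...)`, which consults the batch of pending updates before falling back to committed data (the helper further down in this file asks "Have we already updated the same key?"). A simplified, std-only sketch of that read-your-own-writes buffering, not the real type:

use std::collections::HashMap;

// Simplified stand-in for DatadirModification: writes are batched in
// pending_updates and reads consult that batch before hitting storage.
struct Modification<'a> {
    committed: &'a HashMap<u64, Vec<u8>>, // stand-in for reads through the Timeline
    pending_updates: HashMap<u64, Vec<u8>>,
}

impl<'a> Modification<'a> {
    fn put(&mut self, key: u64, value: Vec<u8>) {
        self.pending_updates.insert(key, value);
    }

    // Read-your-own-writes: a key updated in this batch is served from the
    // batch, otherwise fall back to the already committed state.
    fn get(&self, key: u64) -> Option<&[u8]> {
        self.pending_updates
            .get(&key)
            .or_else(|| self.committed.get(&key))
            .map(|v| v.as_slice())
    }
}

fn main() {
    let committed = HashMap::from([(1u64, vec![1u8])]);
    let mut m = Modification {
        committed: &committed,
        pending_updates: HashMap::new(),
    };
    m.put(1, vec![42]);
    assert_eq!(m.get(1), Some(&[42u8][..])); // sees its own pending write
    assert_eq!(m.get(2), None);
}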
- pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { - ensure!(rel.relnode != 0, "invalid relnode"); + pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key)?.get_u32_le(); + let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - bail!("slru segment {:?}/{} already exists", kind, segno); + anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } self.put( dir_key, @@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); @@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key)?; + let buf = self.get(dir_key).no_ondemand_download()?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { // TODO Ok(()) } /// This method is used for marking truncated SLRU files - pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY)?; + let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer(); // Flush relation and SLRU data blocks, keep metadata. 
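`DatadirModification::flush`, whose hunk continues just below, writes out only relation and SLRU data blocks and keeps metadata buffered, using `retain` so the surviving entries stay in the pending map. A self-contained sketch of that selective-drain pattern, with a dummy key predicate standing in for `is_rel_block_key` / `is_slru_block_key`:

use std::collections::HashMap;

// Hypothetical predicate standing in for is_rel_block_key / is_slru_block_key.
fn is_data_block_key(key: &str) -> bool {
    key.starts_with("block/")
}

fn flush_data_blocks(
    pending: &mut HashMap<String, Vec<u8>>,
    write: &mut impl FnMut(&str, &[u8]) -> Result<(), String>,
) -> Result<(), String> {
    let mut result: Result<(), String> = Ok(());
    // Write out data blocks, keep everything else (metadata) buffered,
    // mirroring the `pending_updates.retain(...)` call in flush().
    pending.retain(|key, value| {
        if result.is_ok() && is_data_block_key(key) {
            result = write(key, value);
            false // drop flushed entries from the pending map
        } else {
            true // keep metadata (and everything after the first error)
        }
    });
    result
}

fn main() {
    let mut pending = HashMap::from([
        ("block/1".to_string(), vec![1u8]),
        ("meta/relsize".to_string(), vec![2u8]),
    ]);
    let mut written = Vec::new();
    flush_data_blocks(&mut pending, &mut |k, _| {
        written.push(k.to_string());
        Ok(())
    })
    .unwrap();
    assert_eq!(written, vec!["block/1".to_string()]);
    assert!(pending.contains_key("meta/relsize"));
    assert!(!pending.contains_key("block/1"));
}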
- let mut result: Result<()> = Ok(()); + let mut result: anyhow::Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { result = writer.put(key, self.lsn, value); @@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - fn get(&self, key: Key) -> Result { + fn get(&self, key: Key) -> PageReconstructResult { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> { // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { // Currently, we never need to read back a WAL record that we // inserted in the same "transaction". All the metadata updates // work directly with Images, and we never need to read actual // data pages. We could handle this if we had to, by calling // the walredo manager, but let's keep it simple for now. - bail!("unexpected pending WAL record"); + PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record")) } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); @@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range { field2: 0, field3: 0, field4: 0, - field5: if overflowed { 1 } else { 0 }, + field5: u8::from(overflowed), field6: next_xid, } } @@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. -pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { +pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( RelTag { @@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { }, key.field6, ), - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool { && key.field6 != 0xffffffff } -pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { +pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { let kind = match key.field2 { 0x00 => SlruKind::Clog, 0x01 => SlruKind::MultiXactMembers, 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), }; let segno = key.field4; let blknum = key.field6; (kind, segno, blknum) } - _ => bail!("unexpected value kind 0x{:02x}", key.field1), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } @@ -1413,7 +1508,7 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, -) -> Result> { +) -> anyhow::Result> { let tline = tenant .create_empty_timeline(timeline_id, Lsn(8), pg_version)? .initialize()?; diff --git a/pageserver/src/storage_sync2/delete.rs b/pageserver/src/storage_sync2/delete.rs deleted file mode 100644 index f22dbdc2d8..0000000000 --- a/pageserver/src/storage_sync2/delete.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! 
Helper functions to delete files from remote storage with a RemoteStorage -use anyhow::Context; -use std::path::Path; -use tracing::debug; - -use remote_storage::GenericRemoteStorage; - -pub(super) async fn delete_layer( - storage: &GenericRemoteStorage, - local_layer_path: &Path, -) -> anyhow::Result<()> { - fail::fail_point!("before-delete-layer", |_| { - anyhow::bail!("failpoint before-delete-layer") - }); - debug!( - "Deleting layer from remote storage: {:?}", - local_layer_path.display() - ); - - let storage_path = storage - .remote_object_id(local_layer_path) - .with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_layer_path.display() - ) - })?; - - // XXX: If the deletion fails because the object already didn't exist, - // it would be good to just issue a warning but consider it success. - // https://github.com/neondatabase/neon/issues/2934 - storage.delete(&storage_path).await.with_context(|| { - format!( - "Failed to delete remote layer from storage at '{:?}'", - storage_path - ) - }) -} diff --git a/pageserver/src/storage_sync2/download.rs b/pageserver/src/storage_sync2/download.rs deleted file mode 100644 index 12b858fb57..0000000000 --- a/pageserver/src/storage_sync2/download.rs +++ /dev/null @@ -1,257 +0,0 @@ -//! Helper functions to download files from remote storage with a RemoteStorage -use std::collections::HashSet; -use std::path::Path; - -use anyhow::{bail, Context}; -use futures::stream::{FuturesUnordered, StreamExt}; -use tokio::fs; -use tokio::io::AsyncWriteExt; -use tracing::debug; - -use crate::config::PageServerConf; -use crate::storage_sync::index::LayerFileMetadata; -use remote_storage::{DownloadError, GenericRemoteStorage}; -use utils::crashsafe::path_with_suffix_extension; -use utils::id::{TenantId, TimelineId}; - -use super::index::IndexPart; -use super::RelativePath; - -async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { - fs::File::open(path).await?.sync_all().await -} - -/// -/// If 'metadata' is given, we will validate that the downloaded file's size matches that -/// in the metadata. (In the future, we might do more cross-checks, like CRC validation) -/// -/// Returns the size of the downloaded file. -pub async fn download_layer_file<'a>( - conf: &'static PageServerConf, - storage: &'a GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, - path: &'a RelativePath, - layer_metadata: &'a LayerFileMetadata, -) -> anyhow::Result { - let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); - - let local_path = path.to_local_path(&timeline_path); - - let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - local_path.display() - ) - })?; - - // Perform a rename inspired by durable_rename from file_utils.c. - // The sequence: - // write(tmp) - // fsync(tmp) - // rename(tmp, new) - // fsync(new) - // fsync(parent) - // For more context about durable_rename check this email from postgres mailing list: - // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com - // If pageserver crashes the temp file will be deleted on startup and re-downloaded. - let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); - - // TODO: this doesn't use the cached fd for some reason? 
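The deleted `download_layer_file` spells out the `durable_rename`-style sequence it follows: write the temp file, fsync it, rename it into place, fsync the new name, then fsync the parent directory. A std-only, synchronous sketch of that sequence (the real code uses tokio::fs and adds failpoints and size validation; opening a directory to fsync it is a Unix-ism):

use std::fs::{self, File};
use std::io;
use std::path::Path;

fn durable_write(final_path: &Path, contents: &[u8]) -> io::Result<()> {
    let tmp_path = final_path.with_extension("temp_download");

    // write(tmp) + fsync(tmp)
    fs::write(&tmp_path, contents)?;
    File::open(&tmp_path)?.sync_all()?;

    // rename(tmp, new) + fsync(new)
    fs::rename(&tmp_path, final_path)?;
    File::open(final_path)?.sync_all()?;

    // fsync(parent) so the rename itself survives a crash (works on Unix)
    if let Some(parent) = final_path.parent() {
        File::open(parent)?.sync_all()?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    let dir = std::env::temp_dir().join("durable_rename_demo");
    fs::create_dir_all(&dir)?;
    durable_write(&dir.join("layer"), b"layer bytes")?;
    Ok(())
}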
- let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { - format!( - "Failed to create a destination file for layer '{}'", - temp_file_path.display() - ) - })?; - let mut download = storage - .download(&layer_storage_path) - .await - .with_context(|| { - format!( - "Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'" - ) - })?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!( - "Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display() - ) - })?; - - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. - destination_file.flush().await.with_context(|| { - format!( - "failed to flush source file at {}", - temp_file_path.display() - ) - })?; - - match layer_metadata.file_size() { - Some(expected) if expected != bytes_amount => { - anyhow::bail!( - "According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", - temp_file_path.display() - ); - } - Some(_) | None => { - // matches, or upgrading from an earlier IndexPart version - } - } - - // not using sync_data because it can lose file size update - destination_file.sync_all().await.with_context(|| { - format!( - "failed to fsync source file at {}", - temp_file_path.display() - ) - })?; - drop(destination_file); - - fail::fail_point!("remote-storage-download-pre-rename", |_| { - bail!("remote-storage-download-pre-rename failpoint triggered") - }); - - fs::rename(&temp_file_path, &local_path).await?; - - fsync_path(&local_path) - .await - .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?; - - tracing::info!("download complete: {}", local_path.display()); - - Ok(bytes_amount) -} - -const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; - -pub fn is_temp_download_file(path: &Path) -> bool { - let extension = path.extension().map(|pname| { - pname - .to_str() - .expect("paths passed to this function must be valid Rust strings") - }); - match extension { - Some(TEMP_DOWNLOAD_EXTENSION) => true, - Some(_) => false, - None => false, - } -} - -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines<'a>( - storage: &'a GenericRemoteStorage, - conf: &'static PageServerConf, - tenant_id: TenantId, -) -> anyhow::Result> { - let tenant_path = conf.timelines_path(&tenant_id); - let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| { - format!( - "Failed to get tenant storage path for local path '{}'", - tenant_path.display() - ) - })?; - - let timelines = storage - .list_prefixes(Some(&tenant_storage_path)) - .await - .with_context(|| { - format!( - "Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download" - ) - })?; - - if timelines.is_empty() { - anyhow::bail!("no timelines found on 
the remote storage") - } - - let mut timeline_ids = HashSet::new(); - let mut part_downloads = FuturesUnordered::new(); - - for timeline_remote_storage_key in timelines { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") - })?; - - let timeline_id: TimelineId = object_name.parse().with_context(|| { - format!("failed to parse object name into timeline id '{object_name}'") - })?; - - // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID - // yet, launch a download task for it. - if !timeline_ids.contains(&timeline_id) { - timeline_ids.insert(timeline_id); - let storage_clone = storage.clone(); - part_downloads.push(async move { - ( - timeline_id, - download_index_part(conf, &storage_clone, tenant_id, timeline_id).await, - ) - }); - } - } - - // Wait for all the download tasks to complete. - let mut timeline_parts = Vec::new(); - while let Some((timeline_id, part_upload_result)) = part_downloads.next().await { - let index_part = part_upload_result - .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?; - - debug!("Successfully fetched index part for timeline {timeline_id}"); - timeline_parts.push((timeline_id, index_part)); - } - Ok(timeline_parts) -} - -pub async fn download_index_part( - conf: &'static PageServerConf, - storage: &GenericRemoteStorage, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> Result { - let index_part_path = conf - .metadata_path(timeline_id, tenant_id) - .with_file_name(IndexPart::FILE_NAME); - let part_storage_path = storage - .remote_object_id(&index_part_path) - .with_context(|| { - format!( - "Failed to get the index part storage path for local path '{}'", - index_part_path.display() - ) - }) - .map_err(DownloadError::BadInput)?; - - let mut index_part_download = storage.download(&part_storage_path).await?; - - let mut index_part_bytes = Vec::new(); - tokio::io::copy( - &mut index_part_download.download_stream, - &mut index_part_bytes, - ) - .await - .with_context(|| { - format!( - "Failed to download an index part into file '{}'", - index_part_path.display() - ) - }) - .map_err(DownloadError::Other)?; - - let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) - .with_context(|| { - format!( - "Failed to deserialize index part file into file '{}'", - index_part_path.display() - ) - }) - .map_err(DownloadError::Other)?; - - Ok(index_part) -} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 86d1266f09..a1b3ad26b0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -25,7 +25,6 @@ //! the current task has been requested to shut down. You can use that with //! Tokio select!(). //! -//! //! TODO: This would be a good place to also handle panics in a somewhat sane way. //! Depending on what task panics, we might want to kill the whole server, or //! only a single tenant or timeline. 
@@ -36,6 +35,7 @@ #![allow(clippy::declare_interior_mutable_const)] use std::collections::HashMap; +use std::fmt; use std::future::Future; use std::panic::AssertUnwindSafe; use std::sync::atomic::{AtomicU64, Ordering}; @@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex}; use futures::FutureExt; use tokio::runtime::Runtime; -use tokio::sync::watch; use tokio::task::JoinHandle; use tokio::task_local; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, warn}; @@ -71,7 +71,7 @@ use crate::shutdown_pageserver; // // WAL receiver runtime: // - used to handle WAL receiver connections. -// - and to receiver updates from etcd +// - and to receiver updates from storage_broker // // Background runtime // - layer flushing @@ -135,22 +135,28 @@ pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { .expect("Failed to create background op runtime") }); +#[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); +impl fmt::Display for PageserverTaskId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + /// Each task that we track is associated with a "task ID". It's just an /// increasing number that we assign. Note that it is different from tokio::task::Id. -static NEXT_TASK_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); +static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1); /// Global registry of tasks static TASKS: Lazy>>> = Lazy::new(|| Mutex::new(HashMap::new())); task_local! { - // There is a Tokio watch channel for each task, which can be used to signal the - // task that it needs to shut down. This task local variable holds the receiving - // end of the channel. The sender is kept in the global registry, so that anyone - // can send the signal to request task shutdown. - static SHUTDOWN_RX: watch::Receiver; + // This is a cancellation token which will be cancelled when a task needs to shut down. The + // root token is kept in the global registry, so that anyone can send the signal to request + // task shutdown. + static SHUTDOWN_TOKEN: CancellationToken; // Each task holds reference to its own PageServerTask here. static CURRENT_TASK: Arc; @@ -178,7 +184,7 @@ pub enum TaskKind { PageRequestHandler, // Manages the WAL receiver connection for one timeline. It subscribes to - // events from etcd, decides which safekeeper to connect to. It spawns a + // events from storage_broker, decides which safekeeper to connect to. It spawns a // separate WalReceiverConnection task to handle each connection. WalReceiverManager, @@ -200,11 +206,20 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, + // Task that downloads a file from remote storage + RemoteDownloadTask, + // task that handles the initial downloading of all tenants InitialLoad, // task that handles attaching a tenant Attach, + + // task that handhes metrics collection + MetricsCollection, + + // task that drives downloading layers + DownloadAllRemoteLayers, } #[derive(Default)] @@ -226,8 +241,8 @@ struct PageServerTask { name: String, - // To request task shutdown, send 'true' to the channel to notify the task. - shutdown_tx: watch::Sender, + // To request task shutdown, just cancel this token. 
+ cancel: CancellationToken, mutable: Mutex, } @@ -247,13 +262,13 @@ pub fn spawn( where F: Future> + Send + 'static, { - let (shutdown_tx, shutdown_rx) = watch::channel(false); + let cancel = CancellationToken::new(); let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed); let task = Arc::new(PageServerTask { task_id: PageserverTaskId(task_id), kind, name: name.to_string(), - shutdown_tx, + cancel: cancel.clone(), mutable: Mutex::new(MutableTaskState { tenant_id, timeline_id, @@ -271,7 +286,7 @@ where task_name, task_id, task_cloned, - shutdown_rx, + cancel, shutdown_process_on_error, future, )); @@ -288,7 +303,7 @@ async fn task_wrapper( task_name: String, task_id: u64, task: Arc, - shutdown_rx: watch::Receiver, + shutdown_token: CancellationToken, shutdown_process_on_error: bool, future: F, ) where @@ -296,9 +311,9 @@ async fn task_wrapper( { debug!("Starting task '{}'", task_name); - let result = SHUTDOWN_RX + let result = SHUTDOWN_TOKEN .scope( - shutdown_rx, + shutdown_token, CURRENT_TASK.scope(task, { // We use AssertUnwindSafe here so that the payload function // doesn't need to be UnwindSafe. We don't do anything after the @@ -408,7 +423,7 @@ pub async fn shutdown_tasks( && (tenant_id.is_none() || task_mut.tenant_id == tenant_id) && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { - let _ = task.shutdown_tx.send_replace(true); + task.cancel.cancel(); victim_tasks.push(Arc::clone(task)); } } @@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option { CURRENT_TASK.try_with(|ct| ct.kind).ok() } +pub fn current_task_id() -> Option { + CURRENT_TASK.try_with(|ct| ct.task_id).ok() +} + /// A Future that can be used to check if the current task has been requested to /// shut down. pub async fn shutdown_watcher() { - let mut shutdown_rx = SHUTDOWN_RX - .try_with(|rx| rx.clone()) + let token = SHUTDOWN_TOKEN + .try_with(|t| t.clone()) .expect("shutdown_requested() called in an unexpected task or thread"); - while !*shutdown_rx.borrow() { - if shutdown_rx.changed().await.is_err() { - break; - } - } + token.cancelled().await; +} + +/// Clone the current task's cancellation token, which can be moved across tasks. +/// +/// When the task which is currently executing is shutdown, the cancellation token will be +/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or +/// `tokio::task::JoinSet::spawn`. +pub fn shutdown_token() -> CancellationToken { + SHUTDOWN_TOKEN + .try_with(|t| t.clone()) + .expect("shutdown_token() called in an unexpected task or thread") } /// Has the current task been requested to shut down? 
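The task_mgr rewrite replaces the per-task `watch` channel with a `tokio_util::sync::CancellationToken`: a task waits for shutdown by awaiting `token.cancelled()`, typically inside `select!`, and `shutdown_tasks` just calls `cancel()`. A small sketch of that consumption pattern; the token here is created locally, whereas in the pageserver it would come from `shutdown_token()`:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn worker(token: CancellationToken) {
    loop {
        tokio::select! {
            // Resolves once someone calls token.cancel(), like shutdown_watcher() does.
            _ = token.cancelled() => {
                println!("shutdown requested, exiting");
                return;
            }
            _ = tokio::time::sleep(Duration::from_millis(100)) => {
                println!("did a unit of work");
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    let handle = tokio::spawn(worker(token.clone()));

    tokio::time::sleep(Duration::from_millis(250)).await;
    token.cancel(); // equivalent of task.cancel.cancel() in shutdown_tasks()
    handle.await.unwrap();
}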
pub fn is_shutdown_requested() -> bool { - if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) { - *shutdown_rx.borrow() + if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { + cancel.is_cancelled() } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index be478600be..c4ee795b07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -45,23 +45,25 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; +use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; use crate::repository::GcResult; -use crate::storage_sync::create_remote_timeline_client; -use crate::storage_sync::index::IndexPart; -use crate::storage_sync::list_remote_timelines; -use crate::storage_sync::RemoteTimelineClient; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; -use crate::tenant_config::TenantConfOpt; +use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::storage_layer::DeltaLayer; +use crate::tenant::storage_layer::ImageLayer; +use crate::tenant::storage_layer::Layer; + use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; use crate::walredo::WalRedoManager; -use crate::{CheckpointConfig, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; pub use pageserver_api::models::TenantState; use toml_edit; @@ -75,26 +77,26 @@ mod blob_io; pub mod block_io; pub mod bst_layer_map; pub mod coverage; -mod delta_layer; mod disk_btree; pub(crate) mod ephemeral_file; -pub mod filename; -mod image_layer; -mod inmemory_layer; pub mod latest_layer_map; pub mod layer_map; pub mod metadata; mod par_fsync; +mod remote_timeline_client; pub mod storage_layer; +pub mod config; +pub mod mgr; +pub mod tasks; +pub mod upload_queue; + mod timeline; pub mod size; -use storage_layer::Layer; - -pub use timeline::Timeline; +pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -129,11 +131,11 @@ pub struct Tenant { timelines: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding - // `timelines` mutex during all GC iteration (especially with enforced checkpoint) + // `timelines` mutex during all GC iteration // may block for a long time `get_timeline`, `get_timelines_state`,... and other operations // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... - gc_cs: Mutex<()>, + gc_cs: tokio::sync::Mutex<()>, walredo_mgr: Arc, // provides access to timeline data sitting in the remote storage @@ -253,7 +255,7 @@ impl UninitializedTimeline<'_> { .context("Failed to import basebackup") })?; - // Flush loop needs to be spawned in order for checkpoint to be able to flush. + // Flush loop needs to be spawned in order to be able to flush. 
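One hunk above switches `gc_cs` from `std::sync::Mutex<()>` to `tokio::sync::Mutex<()>`; the async mutex can be held across `.await` points for the whole GC iteration without blocking a runtime thread. A minimal illustration of that usage, not the real GC code:

use std::sync::Arc;
use tokio::sync::Mutex;

// Critical section guarded for the whole async GC-style operation,
// including the awaits inside it.
async fn gc_iteration(gc_cs: Arc<Mutex<()>>) {
    let _guard = gc_cs.lock().await; // async acquire, no thread blocking
    tokio::time::sleep(std::time::Duration::from_millis(10)).await; // ok to await while held
    println!("gc iteration done");
}

#[tokio::main]
async fn main() {
    let gc_cs = Arc::new(Mutex::new(()));
    let a = tokio::spawn(gc_iteration(gc_cs.clone()));
    let b = tokio::spawn(gc_iteration(gc_cs.clone()));
    let _ = tokio::join!(a, b);
}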
// We want to run proper checkpoint before we mark timeline as available to outside world // Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock raw_timeline.maybe_spawn_flush_loop(); @@ -263,9 +265,9 @@ impl UninitializedTimeline<'_> { }); raw_timeline - .checkpoint(CheckpointConfig::Flush) + .freeze_and_flush() .await - .context("Failed to checkpoint after basebackup import")?; + .context("Failed to flush after basebackup import")?; let timeline = self.initialize()?; @@ -340,7 +342,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(&uninit_mark_file)).with_context(|| { + ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; @@ -375,7 +377,7 @@ impl Drop for TimelineUninitMark { // We should not blindly overwrite local metadata with remote one. // For example, consider the following case: -// Checkpoint comes, we update local metadata and start upload task but after that +// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that // pageserver crashes. During startup we'll load new metadata, and then reset it // to the state of remote one. But current layermap will have layers from the old // metadata which is inconsistent. @@ -484,7 +486,7 @@ impl Tenant { let timeline = UninitializedTimeline { owning_tenant: self, timeline_id, - raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())), + raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())), }; // Do not start walreceiver here. We do need loaded layer map for reconcile_with_remote // But we shouldnt start walreceiver before we have all the data locally, because working walreceiver @@ -514,7 +516,7 @@ impl Tenant { ) })?; broken_timeline.set_state(TimelineState::Broken); - timelines_accessor.insert(timeline_id, Arc::new(broken_timeline)); + timelines_accessor.insert(timeline_id, broken_timeline); Err(e) } } @@ -574,7 +576,7 @@ impl Tenant { pub fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, - remote_storage: &GenericRemoteStorage, + remote_storage: GenericRemoteStorage, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. 
// See https://github.com/neondatabase/neon/issues/1555 @@ -587,7 +589,7 @@ impl Tenant { tenant_conf, wal_redo_manager, tenant_id, - Some(remote_storage.clone()), + Some(remote_storage), )); // Do all the hard work in the background @@ -649,8 +651,12 @@ impl Tenant { .as_ref() .ok_or_else(|| anyhow::anyhow!("cannot attach without remote storage"))?; - let remote_timelines = - list_remote_timelines(remote_storage, self.conf, self.tenant_id).await?; + let remote_timelines = remote_timeline_client::list_remote_timelines( + remote_storage, + self.conf, + self.tenant_id, + ) + .await?; info!("found {} timelines", remote_timelines.len()); @@ -704,6 +710,22 @@ impl Tenant { Ok(()) } + /// get size of all remote timelines + /// + /// This function relies on the index_part instead of listing the remote storage + /// + pub async fn get_remote_size(&self) -> anyhow::Result { + let mut size = 0; + + for timeline in self.list_timelines().iter() { + if let Some(remote_client) = &timeline.remote_client { + size += remote_client.get_remote_physical_size(); + } + } + + Ok(size) + } + #[instrument(skip(self, index_part, remote_metadata, remote_storage), fields(timeline_id=%timeline_id))] async fn load_remote_timeline( &self, @@ -718,7 +740,7 @@ impl Tenant { .context("Failed to create new timeline directory")?; let remote_client = - create_remote_timeline_client(remote_storage, self.conf, self.tenant_id, timeline_id)?; + RemoteTimelineClient::new(remote_storage, self.conf, self.tenant_id, timeline_id)?; let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() { let timelines = self.timelines.lock().unwrap(); @@ -785,7 +807,7 @@ impl Tenant { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, Err(e) => { - error!("load tenant config failed: {}", e); + error!("load tenant config failed: {:?}", e); return Tenant::create_broken_tenant(conf, tenant_id); } }; @@ -980,7 +1002,7 @@ impl Tenant { .remote_storage .as_ref() .map(|remote_storage| { - create_remote_timeline_client( + RemoteTimelineClient::new( remote_storage.clone(), self.conf, self.tenant_id, @@ -1146,7 +1168,8 @@ impl Tenant { ancestor_timeline.wait_lsn(*lsn).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)? + self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + .await? } None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; @@ -1158,17 +1181,20 @@ impl Tenant { /// this function is periodically called by gc task. /// also it can be explicitly requested through page server api 'do_gc' command. /// - /// 'target_timeline_id' specifies the timeline to GC, or None for all. - /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval). - /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC - /// to make tests more deterministic. - /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed? + /// `target_timeline_id` specifies the timeline to GC, or None for all. + /// + /// The `horizon` an `pitr` parameters determine how much WAL history needs to be retained. + /// Also known as the retention period, or the GC cutoff point. `horizon` specifies + /// the amount of history, as LSN difference from current latest LSN on each timeline. + /// `pitr` specifies the same as a time difference from the current time. 
The effective + /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever + /// requires more history to be retained. + // pub async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1183,7 +1209,7 @@ impl Tenant { let _timer = STORAGE_TIME .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc) + self.gc_iteration_internal(target_timeline_id, horizon, pitr) .await } } @@ -1226,24 +1252,21 @@ impl Tenant { /// /// Used at graceful shutdown. /// - pub async fn checkpoint(&self) -> anyhow::Result<()> { + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the - // checkpoint runs. - let timelines_to_checkpoint = { + // flushing. We don't want to block everything else while the + // flushing is performed. + let timelines_to_flush = { let timelines = self.timelines.lock().unwrap(); timelines .iter() - .map(|(id, timeline)| (*id, Arc::clone(timeline))) + .map(|(_id, timeline)| Arc::clone(timeline)) .collect::>() }; - for (id, timeline) in &timelines_to_checkpoint { - timeline - .checkpoint(CheckpointConfig::Flush) - .instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id)) - .await?; + for timeline in &timelines_to_flush { + timeline.freeze_and_flush().await?; } Ok(()) @@ -1278,26 +1301,62 @@ impl Tenant { timeline }; - info!("waiting for layer_removal_cs.lock()"); - // No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change. - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); + // Now that the Timeline is in Stopping state, request all the related tasks to + // shut down. + // + // NB: If you call delete_timeline multiple times concurrently, they will + // all go through the motions here. Make sure the code here is idempotent, + // and don't error out if some of the shutdown tasks have already been + // completed! - // NB: storage_sync upload tasks that reference these layers have been cancelled - // by the caller. + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + Some(timeline_id), + ) + .await; + debug!("wal receiver shutdown confirmed"); - let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); - // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up - // with some layers missing. - std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?; - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - drop(layer_removal_guard); + { + // Grab the layer_removal_cs lock, and actually perform the deletion. 
+ // + // This lock prevents multiple concurrent delete_timeline calls from + // stepping on each other's toes, while deleting the files. It also + // prevents GC or compaction from running at the same time. + // + // Note that there are still other race conditions between + // GC, compaction and timeline deletion. GC task doesn't + // register itself properly with the timeline it's + // operating on. See + // https://github.com/neondatabase/neon/issues/2671 + // + // No timeout here, GC & Compaction should be responsive to the + // `TimelineState::Stopping` change. + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); + + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. + + let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id); + // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up + // with some layers missing. + std::fs::remove_dir_all(&local_timeline_directory).with_context(|| { + format!( + "Failed to remove local timeline directory '{}'", + local_timeline_directory.display() + ) + })?; + + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + } // Remove the timeline from the map. let mut timelines = self.timelines.lock().unwrap(); @@ -1375,7 +1434,7 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - crate::tenant_tasks::start_background_loops(self.tenant_id); + tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { timeline.set_state(TimelineState::Active); @@ -1599,7 +1658,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { anyhow::ensure!( ancestor.is_some(), @@ -1635,7 +1694,7 @@ impl Tenant { conf, tenant_conf: Arc::new(RwLock::new(tenant_conf)), timelines: Mutex::new(HashMap::new()), - gc_cs: Mutex::new(()), + gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, state, @@ -1694,7 +1753,7 @@ impl Tenant { let _enter = info_span!("saving tenantconf").entered(); info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly + // TODO this will prepend comments endlessly ? let mut conf_content = r#"# This file contains a specific per-tenant's config. # It is read in case of pageserver restart. @@ -1707,7 +1766,10 @@ impl Tenant { let mut target_config_file = VirtualFile::open_with_options( target_config_path, - OpenOptions::new().write(true).create_new(first_save), + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(first_save), )?; target_config_file @@ -1779,12 +1841,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, - checkpoint_before_gc: bool, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?; + let gc_timelines = self + .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1806,18 +1869,6 @@ impl Tenant { // made. 
break; } - - // If requested, force flush all in-memory layers to disk first, - // so that they too can be garbage collected. That's - // used in tests, so we want as deterministic results as possible. - if checkpoint_before_gc { - timeline.checkpoint(CheckpointConfig::Forced).await?; - info!( - "timeline {} checkpoint_before_gc done", - timeline.timeline_id - ); - } - let result = timeline.gc().await?; totals += result; } @@ -1831,7 +1882,7 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info(&self) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1842,54 +1893,60 @@ impl Tenant { let target_timeline_id = None; self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .await } - fn refresh_gc_info_internal( + async fn refresh_gc_info_internal( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. - let gc_cs = self.gc_cs.lock().unwrap(); - - let timelines = self.timelines.lock().unwrap(); + let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeSet<(TimelineId, Lsn)> = BTreeSet::new(); - let timeline_ids = { - if let Some(target_timeline_id) = target_timeline_id.as_ref() { - if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") - } - }; + let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let timelines = self.timelines.lock().unwrap(); + let mut all_branchpoints = BTreeSet::new(); + let timeline_ids = { + if let Some(target_timeline_id) = target_timeline_id.as_ref() { + if timelines.get(target_timeline_id).is_none() { + bail!("gc target timeline does not exist") + } + }; - timelines - .iter() - .map(|(timeline_id, timeline_entry)| { - if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { - // If target_timeline is specified, we only need to know branchpoints of its children - if let Some(timeline_id) = target_timeline_id { - if ancestor_timeline_id == &timeline_id { + timelines + .iter() + .map(|(timeline_id, timeline_entry)| { + if let Some(ancestor_timeline_id) = + &timeline_entry.get_ancestor_timeline_id() + { + // If target_timeline is specified, we only need to know branchpoints of its children + if let Some(timeline_id) = target_timeline_id { + if ancestor_timeline_id == &timeline_id { + all_branchpoints.insert(( + *ancestor_timeline_id, + timeline_entry.get_ancestor_lsn(), + )); + } + } + // Collect branchpoints for all timelines + else { all_branchpoints.insert(( *ancestor_timeline_id, timeline_entry.get_ancestor_lsn(), )); } } - // Collect branchpoints for all timelines - else { - all_branchpoints - .insert((*ancestor_timeline_id, timeline_entry.get_ancestor_lsn())); - } - } - *timeline_id - }) - .collect::>() + *timeline_id + }) + .collect::>() + }; + (all_branchpoints, timeline_ids) }; - drop(timelines); // Ok, we now know all the branch points. // Update the GC information for each timeline. 
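To make the retention rule from the `gc_iteration` doc comment above concrete: the horizon cutoff is an LSN delta behind the latest LSN, the pitr cutoff comes from a wall-clock window, and the smaller (more conservative) of the two wins. Below is a minimal, self-contained sketch of that selection; the `Lsn` newtype and the `lsn_at_time_cutoff` helper are hypothetical stand-ins, not the pageserver's actual types or APIs. The branch points collected by `refresh_gc_info_internal` above are passed to `update_gc_info` separately, so data still needed by child timelines is retained regardless of this cutoff.

use std::time::{Duration, SystemTime};

/// Simplified stand-in for the pageserver's Lsn type (illustrative only).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Lsn(u64);

/// Hypothetical helper: the latest LSN that is older than the given wall-clock cutoff.
/// The real pageserver derives this from its timestamp-to-LSN mapping; here it is a stub.
fn lsn_at_time_cutoff(_cutoff_time: SystemTime) -> Lsn {
    Lsn(0x3000)
}

/// Pick the effective GC cutoff: whichever of the horizon-based and pitr-based
/// cutoffs retains more history, i.e. the smaller LSN.
fn effective_gc_cutoff(latest_lsn: Lsn, horizon: u64, pitr: Duration) -> Lsn {
    // Horizon cutoff: keep at least `horizon` bytes of WAL behind the latest LSN.
    let horizon_cutoff = Lsn(latest_lsn.0.saturating_sub(horizon));
    // PITR cutoff: keep everything newer than `now - pitr`.
    let pitr_cutoff = lsn_at_time_cutoff(SystemTime::now() - pitr);
    std::cmp::min(horizon_cutoff, pitr_cutoff)
}

fn main() {
    let cutoff = effective_gc_cutoff(Lsn(0x8000), 0x1000, Duration::from_secs(3600));
    println!("layers wholly below {:?} become GC candidates", cutoff);
}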
@@ -1915,7 +1972,7 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr)?; + timeline.update_gc_info(branchpoints, cutoff, pitr).await?; gc_timelines.push(timeline); } @@ -1925,7 +1982,7 @@ impl Tenant { } /// Branch an existing timeline - fn branch_timeline( + async fn branch_timeline( &self, src: TimelineId, dst: TimelineId, @@ -1934,10 +1991,11 @@ impl Tenant { // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn // about timelines, so otherwise a race condition is possible, where we create new timeline and GC // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().unwrap(); - let timelines = self.timelines.lock().unwrap(); - let timeline_uninit_mark = self.create_timeline_uninit_mark(dst, &timelines)?; - drop(timelines); + let _gc_cs = self.gc_cs.lock().await; + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst, &timelines)? + }; // In order for the branch creation task to not wait for GC/compaction, // we need to make sure that the starting LSN of the child branch is not out of scope midway by @@ -2106,8 +2164,13 @@ impl Tenant { }); unfinished_timeline - .checkpoint(CheckpointConfig::Flush).await - .with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?; + .freeze_and_flush() + .await + .with_context(|| { + format!( + "Failed to flush after pgdatadir import for timeline {tenant_id}/{timeline_id}" + ) + })?; let timeline = { let mut timelines = self.timelines.lock().unwrap(); @@ -2136,7 +2199,7 @@ impl Tenant { let tenant_id = self.tenant_id; let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = create_remote_timeline_client( + let remote_client = RemoteTimelineClient::new( remote_storage.clone(), self.conf, tenant_id, @@ -2166,7 +2229,7 @@ impl Tenant { Ok(UninitializedTimeline { owning_tenant: self, timeline_id: new_timeline_id, - raw_timeline: Some((Arc::new(new_timeline), uninit_mark)), + raw_timeline: Some((new_timeline, uninit_mark)), }) } Err(e) => { @@ -2184,7 +2247,7 @@ impl Tenant { new_metadata: TimelineMetadata, ancestor: Option>, remote_client: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { let timeline_data = self .create_timeline_data( new_timeline_id, @@ -2267,12 +2330,12 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. 
let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await } } fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> { - fs::remove_dir_all(&timeline_dir) + fs::remove_dir_all(timeline_dir) .or_else(|e| { if e.kind() == std::io::ErrorKind::NotFound { // we can leave the uninit mark without a timeline dir, @@ -2288,7 +2351,7 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a timeline_dir.display() ) })?; - fs::remove_file(&uninit_mark).with_context(|| { + fs::remove_file(uninit_mark).with_context(|| { format!( "Failed to remove timeline uninit mark file {}", uninit_mark.display() @@ -2388,7 +2451,7 @@ fn try_create_target_tenant_dir( anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); }); - fs::rename(&temporary_tenant_dir, target_tenant_directory).with_context(|| { + fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( "failed to move tenant {} temporary directory {} into the permanent one {}", tenant_id, @@ -2442,9 +2505,9 @@ fn run_initdb( ); let initdb_output = Command::new(&initdb_bin_path) - .args(&["-D", &initdb_target_dir.to_string_lossy()]) - .args(&["-U", &conf.superuser]) - .args(&["-E", "utf8"]) + .args(["-D", &initdb_target_dir.to_string_lossy()]) + .args(["-U", &conf.superuser]) + .args(["-E", "utf8"]) .arg("--no-instructions") // This is only used for a temporary installation that is deleted shortly after, // so no need to fsync it @@ -2487,12 +2550,8 @@ pub fn dump_layerfile_from_path(path: &Path, verbose: bool) -> anyhow::Result<() file.read_exact_at(&mut header_buf, 0)?; match u16::from_be_bytes(header_buf) { - crate::IMAGE_FILE_MAGIC => { - image_layer::ImageLayer::new_for_path(path, file)?.dump(verbose)? - } - crate::DELTA_FILE_MAGIC => { - delta_layer::DeltaLayer::new_for_path(path, file)?.dump(verbose)? - } + crate::IMAGE_FILE_MAGIC => ImageLayer::new_for_path(path, file)?.dump(verbose)?, + crate::DELTA_FILE_MAGIC => DeltaLayer::new_for_path(path, file)?.dump(verbose)?, magic => bail!("unrecognized magic identifier: {:?}", magic), } @@ -2529,7 +2588,7 @@ pub mod harness { }; use super::*; - use crate::tenant_config::{TenantConf, TenantConfOpt}; + use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; use utils::id::{TenantId, TimelineId}; @@ -2606,9 +2665,11 @@ pub mod harness { // Disable automatic GC and compaction to make the unit tests more deterministic. // The tests perform them manually if needed. 
- let mut tenant_conf = TenantConf::dummy_conf(); - tenant_conf.gc_period = Duration::ZERO; - tenant_conf.compaction_period = Duration::ZERO; + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; let tenant_id = TenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; @@ -2672,7 +2733,7 @@ pub mod harness { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, ) -> Result { @@ -2727,9 +2788,18 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); Ok(()) } @@ -2794,7 +2864,9 @@ mod tests { //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -2804,15 +2876,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?, "foobar at 0x20" ); @@ -2842,7 +2914,7 @@ mod tests { writer.finish_write(lsn); lsn += 0x10; } - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; { let writer = tline.writer(); writer.put( @@ -2859,7 +2931,7 @@ mod tests { )?; writer.finish_write(lsn); } - tline.checkpoint(CheckpointConfig::Forced).await + tline.freeze_and_flush().await } #[tokio::test] @@ -2874,15 +2946,18 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // FIXME: this doesn't actually remove any layer currently, given how the flushing // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. 
tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); @@ -2907,7 +2982,10 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? .initialize()?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) { + match tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .await + { Ok(_) => panic!("branching should have failed"), Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); @@ -2934,7 +3012,7 @@ mod tests { let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?; + repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?; let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); match tline.get(*TEST_KEY, Lsn(0x25)) { @@ -2955,15 +3033,20 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); + assert!(newtline + .get(*TEST_KEY, Lsn(0x25)) + .no_ondemand_download() + .is_ok()); Ok(()) } @@ -2977,7 +3060,9 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); @@ -2986,12 +3071,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50))?, + newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3008,7 +3093,6 @@ mod tests { .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
.initialize()?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } let tenant = harness.load().await; @@ -3031,16 +3115,16 @@ mod tests { .initialize()?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; - tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?; + tenant + .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); make_some_layers(newtline.as_ref(), Lsn(0x60)).await?; - tline.checkpoint(CheckpointConfig::Forced).await?; } // check that both of them are initially unloaded @@ -3112,7 +3196,7 @@ mod tests { writer.finish_write(Lsn(0x10)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3120,7 +3204,7 @@ mod tests { writer.finish_write(Lsn(0x20)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3128,7 +3212,7 @@ mod tests { writer.finish_write(Lsn(0x30)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; let writer = tline.writer(); @@ -3136,21 +3220,36 @@ mod tests { writer.finish_write(Lsn(0x40)); drop(writer); - tline.checkpoint(CheckpointConfig::Forced).await?; + tline.freeze_and_flush().await?; tline.compact().await?; - assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); - assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?, + TEST_IMG("foo at 0x10") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?, + TEST_IMG("foo at 0x20") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?, + TEST_IMG("foo at 0x30") + ); + assert_eq!( + tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?, + TEST_IMG("foo at 0x40") + ); Ok(()) } // - // Insert 1000 key-value pairs with increasing keys, checkpoint, - // repeat 50 times. + // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. + // Repeat 50 times. 
// #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { @@ -3185,8 +3284,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3249,16 +3350,17 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3306,7 +3408,9 @@ mod tests { let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); @@ -3332,16 +3436,17 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn)?, + tline.get(test_key, lsn).no_ondemand_download()?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of checkpoint, compaction, and GC - println!("checkpointing {}", lsn); + // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); - tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?; - tline.checkpoint(CheckpointConfig::Forced).await?; + tline + .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .await?; + tline.freeze_and_flush().await?; tline.compact().await?; tline.gc().await?; } @@ -3371,7 +3476,9 @@ mod tests { #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); - tenant.branch_timeline(tline_id, new_tline_id, Some(lsn))?; + tenant + .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); @@ -3404,7 +3511,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn)?, + tline.get(test_key, *lsn).no_ondemand_download()?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index bbcdabe1cd..e3cc800447 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,7 +5,6 @@ use crate::page_cache; use crate::page_cache::{ReadBufResult, PAGE_SZ}; use bytes::Bytes; -use once_cell::sync::Lazy; use std::ops::{Deref, DerefMut}; use std::os::unix::fs::FileExt; use std::sync::atomic::AtomicU64; @@ -61,7 +60,7 @@ where /// /// ```no_run /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader}; -/// # let reader: FileBlockReader = todo!(); +/// # let reader: FileBlockReader = unimplemented!("stub"); /// let cursor = reader.block_cursor(); /// let buf = 
cursor.read_blk(1); /// // do stuff with 'buf' @@ -117,7 +116,7 @@ where } } -static NEXT_ID: Lazy = Lazy::new(|| AtomicU64::new(1)); +static NEXT_ID: AtomicU64 = AtomicU64::new(1); /// An adapter for reading a (virtual) file using the page cache. /// diff --git a/pageserver/src/tenant_config.rs b/pageserver/src/tenant/config.rs similarity index 89% rename from pageserver/src/tenant_config.rs rename to pageserver/src/tenant/config.rs index 10b8a589c3..8569c70217 100644 --- a/pageserver/src/tenant_config.rs +++ b/pageserver/src/tenant/config.rs @@ -185,14 +185,16 @@ impl TenantConfOpt { if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag { self.max_lsn_wal_lag = Some(max_lsn_wal_lag); } + if let Some(trace_read_requests) = other.trace_read_requests { + self.trace_read_requests = Some(trace_read_requests); + } } } -impl TenantConf { - pub fn default() -> TenantConf { +impl Default for TenantConf { + fn default() -> Self { use defaults::*; - - TenantConf { + Self { checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) .expect("cannot parse default checkpoint timeout"), @@ -217,29 +219,4 @@ impl TenantConf { trace_read_requests: false, } } - - pub fn dummy_conf() -> Self { - TenantConf { - checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: Duration::from_secs(600), - compaction_target_size: 4 * 1024 * 1024, - compaction_period: Duration::from_secs(10), - compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD, - gc_horizon: defaults::DEFAULT_GC_HORIZON, - gc_period: Duration::from_secs(10), - image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: Duration::from_secs(60 * 60), - walreceiver_connect_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .unwrap(), - lagging_wal_timeout: humantime::parse_duration( - defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT, - ) - .unwrap(), - max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .unwrap(), - trace_read_requests: false, - } - } } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 33255dbd82..88dff32b76 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { off += keys_len as u64; let values_off = off as usize; - let values_len = num_children as usize * VALUE_SZ as usize; + let values_len = num_children as usize * VALUE_SZ; //off += values_len as u64; let prefix = &buf[prefix_off..prefix_off + prefix_len as usize]; @@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> { while low < high { let mid = low + size / 2; - let key_off = mid as usize * self.suffix_len as usize; + let key_off = mid * self.suffix_len as usize; let suffix = &self.keys[key_off..key_off + self.suffix_len as usize]; // Does this match? 
keybuf[self.prefix_len as usize..].copy_from_slice(suffix); @@ -328,7 +328,7 @@ where while idx < node.num_children as usize { let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -368,7 +368,7 @@ where key_off -= suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx as usize); + let value = node.value(idx); #[allow(clippy::collapsible_if)] if node.level == 0 { // leaf @@ -629,7 +629,7 @@ impl BuildNode { self.keys.extend(&key[self.prefix.len()..]); self.values.extend(value.0); - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); self.size += self.suffix_len + VALUE_SZ; @@ -674,7 +674,7 @@ impl BuildNode { self.size -= prefix_len * self.num_children as usize; self.size += prefix_len; - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); true @@ -684,7 +684,7 @@ impl BuildNode { /// Serialize the node to on-disk format. /// fn pack(&self) -> Bytes { - assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize); + assert!(self.keys.len() == self.num_children as usize * self.suffix_len); assert!(self.values.len() == self.num_children as usize * VALUE_SZ); assert!(self.num_children > 0); @@ -940,7 +940,7 @@ mod tests { let t = -(f64::ln(u)); let key_int = (t * 1000000.0) as u128; - all_data.insert(key_int as u128, idx as u64); + all_data.insert(key_int, idx as u64); } // Build a tree from it diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0774fa42a6..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -91,7 +91,7 @@ impl EphemeralFile { break; } - off += n as usize; + off += n; } Ok(()) } diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 48dcf1906a..5b33ac2969 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -12,7 +12,7 @@ use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::inmemory_layer::InMemoryLayer; +use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; use anyhow::Result; use std::collections::VecDeque; @@ -25,8 +25,7 @@ use super::bst_layer_map::RetroactiveLayerMap; /// /// LayerMap tracks what layers exist on a timeline. /// -#[derive(Default)] -pub struct LayerMap { +pub struct LayerMap { // // 'open_layer' holds the current InMemoryLayer that is accepting new // records. If it is None, 'next_open_layer_at' will be set instead, indicating @@ -47,20 +46,35 @@ pub struct LayerMap { pub frozen_layers: VecDeque>, /// Index of the historic layers optimized for search - index: RetroactiveLayerMap>, + index: RetroactiveLayerMap>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. 
- l0_delta_layers: Vec>, + l0_delta_layers: Vec>, +} + +impl Default for LayerMap { + fn default() -> Self { + Self { + open_layer: None, + next_open_layer_at: None, + frozen_layers: VecDeque::default(), + l0_delta_layers: Vec::default(), + index: RetroactiveLayerMap::default(), + } + } } /// Return value of LayerMap::search -pub struct SearchResult { - pub layer: Arc, +pub struct SearchResult { + pub layer: Arc, pub lsn_floor: Lsn, } -impl LayerMap { +impl LayerMap +where + L: ?Sized + Layer, +{ /// /// Find the latest layer that covers the given 'key', with lsn < /// 'end_lsn'. @@ -72,39 +86,39 @@ impl LayerMap { /// contain the version, even if it's missing from the returned /// layer. /// - pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { match self.index.query(key.to_i128(), end_lsn.0 - 1) { - (None, None) => Ok(None), + (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; - Ok(Some(SearchResult { + Some(SearchResult { layer: image, lsn_floor, - })) + }) } (Some(delta), None) => { let lsn_floor = delta.get_lsn_range().start; - Ok(Some(SearchResult { + Some(SearchResult { layer: delta, lsn_floor, - })) + }) } (Some(delta), Some(image)) => { let img_lsn = image.get_lsn_range().start; let image_is_newer = image.get_lsn_range().end > delta.get_lsn_range().end; let image_exact_match = Lsn(img_lsn.0 + 1) == end_lsn; if image_is_newer || image_exact_match { - Ok(Some(SearchResult { + Some(SearchResult { layer: image, lsn_floor: img_lsn, - })) + }) } else { let lsn_floor = std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); - Ok(Some(SearchResult { + Some(SearchResult { layer: delta, lsn_floor, - })) + }) } } } @@ -113,7 +127,7 @@ impl LayerMap { /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { + pub fn insert_historic(&mut self, layer: Arc) { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); self.index.insert( @@ -140,7 +154,7 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. 
/// - pub fn remove_historic(&mut self, layer: Arc) { + pub fn remove_historic(&mut self, layer: Arc) { let kr = layer.get_key_range(); let lr = layer.get_lsn_range(); self.index.remove( @@ -182,7 +196,7 @@ impl LayerMap { let start = key.start.to_i128(); let end = key.end.to_i128(); - let layer_covers = |layer: Option>| match layer { + let layer_covers = |layer: Option>| match layer { Some(layer) => layer.get_lsn_range().start >= lsn.start, None => false, }; @@ -202,7 +216,7 @@ impl LayerMap { return Ok(true); } - pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { + pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { self.index.iter() } @@ -218,7 +232,7 @@ impl LayerMap { &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Result, Option>)>> { let version = match self.index.get_version(lsn.0 - 1) { Some(v) => v, None => return Ok(vec![]), @@ -228,7 +242,7 @@ impl LayerMap { let end = key_range.end.to_i128(); // Initialize loop variables - let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut coverage: Vec<(Range, Option>)> = vec![]; let mut current_key = start.clone(); let mut current_val = version.query(start).1; @@ -308,7 +322,7 @@ impl LayerMap { } /// Return all L0 delta layers - pub fn get_level0_deltas(&self) -> Result>> { + pub fn get_level0_deltas(&self) -> Result>> { Ok(self.l0_delta_layers.clone()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index f3a0a5171a..297cccbe30 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -255,8 +255,7 @@ pub fn save_metadata( // fsync the parent directory to ensure the directory entry is durable if first_save { let timeline_dir = File::open( - &path - .parent() + path.parent() .expect("Metadata should always have a parent dir"), )?; timeline_dir.sync_all()?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs new file mode 100644 index 0000000000..44849de735 --- /dev/null +++ b/pageserver/src/tenant/mgr.rs @@ -0,0 +1,494 @@ +//! This module acts as a switchboard to access different repositories managed by this +//! page server. + +use std::collections::{hash_map, HashMap}; +use std::ffi::OsStr; +use std::path::Path; +use std::sync::Arc; +use tokio::fs; + +use anyhow::Context; +use once_cell::sync::Lazy; +use tokio::sync::RwLock; +use tracing::*; + +use remote_storage::GenericRemoteStorage; +use utils::crashsafe; + +use crate::config::PageServerConf; +use crate::task_mgr::{self, TaskKind}; +use crate::tenant::config::TenantConfOpt; +use crate::tenant::{Tenant, TenantState}; +use crate::IGNORED_TENANT_FILE_NAME; + +use utils::fs_ext::PathExt; +use utils::id::{TenantId, TimelineId}; + +static TENANTS: Lazy>>> = + Lazy::new(|| RwLock::new(HashMap::new())); + +/// Initialize repositories with locally available timelines. +/// Timelines that are only partially available locally (remote storage has more data than this pageserver) +/// are scheduled for download and added to the tenant once download is completed. 
+#[instrument(skip(conf, remote_storage))] +pub async fn init_tenant_mgr( + conf: &'static PageServerConf, + remote_storage: Option, +) -> anyhow::Result<()> { + // Scan local filesystem for attached tenants + let mut number_of_tenants = 0; + let tenants_dir = conf.tenants_path(); + + let mut dir_entries = fs::read_dir(&tenants_dir) + .await + .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + + loop { + match dir_entries.next_entry().await { + Ok(None) => break, + Ok(Some(dir_entry)) => { + let tenant_dir_path = dir_entry.path(); + if crate::is_temporary(&tenant_dir_path) { + info!( + "Found temporary tenant directory, removing: {}", + tenant_dir_path.display() + ); + if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await { + error!( + "Failed to remove temporary directory '{}': {:?}", + tenant_dir_path.display(), + e + ); + } + } else { + // This case happens if we crash during attach before creating the attach marker file + let is_empty = tenant_dir_path.is_empty_dir().with_context(|| { + format!("Failed to check whether {tenant_dir_path:?} is an empty dir") + })?; + if is_empty { + info!("removing empty tenant directory {tenant_dir_path:?}"); + if let Err(e) = fs::remove_dir(&tenant_dir_path).await { + error!( + "Failed to remove empty tenant directory '{}': {e:#}", + tenant_dir_path.display() + ) + } + continue; + } + + let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); + if tenant_ignore_mark_file.exists() { + info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); + continue; + } + + match schedule_local_tenant_processing( + conf, + &tenant_dir_path, + remote_storage.clone(), + ) { + Ok(tenant) => { + TENANTS.write().await.insert(tenant.tenant_id(), tenant); + number_of_tenants += 1; + } + Err(e) => { + error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); + } + } + } + } + Err(e) => { + // On error, print it, but continue with the other tenants. If we error out + // here, the pageserver startup fails altogether, causing outage for *all* + // tenants. That seems worse. 
+                error!(
+                    "Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
+                );
+            }
+        }
+    }
+
+    info!("Processed {number_of_tenants} local tenants at startup");
+    Ok(())
+}
+
+pub fn schedule_local_tenant_processing(
+    conf: &'static PageServerConf,
+    tenant_path: &Path,
+    remote_storage: Option,
+) -> anyhow::Result> {
+    anyhow::ensure!(
+        tenant_path.is_dir(),
+        "Cannot load tenant from path {tenant_path:?}, it either does not exist or is not a directory"
+    );
+    anyhow::ensure!(
+        !crate::is_temporary(tenant_path),
+        "Cannot load tenant from temporary path {tenant_path:?}"
+    );
+    anyhow::ensure!(
+        !tenant_path.is_empty_dir().with_context(|| {
+            format!("Failed to check whether {tenant_path:?} is an empty dir")
+        })?,
+        "Cannot load tenant from empty directory {tenant_path:?}"
+    );
+
+    let tenant_id = tenant_path
+        .file_name()
+        .and_then(OsStr::to_str)
+        .unwrap_or_default()
+        .parse::()
+        .with_context(|| {
+            format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
+        })?;
+
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
+    anyhow::ensure!(
+        !conf.tenant_ignore_mark_file_path(tenant_id).exists(),
+        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
+    );
+
+    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
+        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
+        if let Some(remote_storage) = remote_storage {
+            Tenant::spawn_attach(conf, tenant_id, remote_storage)
+        } else {
+            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
+            Tenant::create_broken_tenant(conf, tenant_id)
+        }
+    } else {
+        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
+        // Start loading the tenant into memory. It will initially be in Loading state.
+        Tenant::spawn_load(conf, tenant_id, remote_storage)
+    };
+    Ok(tenant)
+}
+
+///
+/// Shut down all tenants. This runs as part of pageserver shutdown.
+///
+pub async fn shutdown_all_tenants() {
+    let tenants_to_shut_down = {
+        let mut m = TENANTS.write().await;
+        let mut tenants_to_shut_down = Vec::with_capacity(m.len());
+        for (_, tenant) in m.drain() {
+            if tenant.is_active() {
+                // updates tenant state, forbidding new GC and compaction iterations from starting
+                tenant.set_stopping();
+                tenants_to_shut_down.push(tenant)
+            }
+        }
+        drop(m);
+        tenants_to_shut_down
+    };
+
+    // Shut down all existing walreceiver connections and stop accepting the new ones.
+    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
+
+    // Ok, no background tasks running anymore. Flush any remaining data in
+    // memory to disk.
+    //
+    // We assume that any incoming connections that might request pages from
+    // the tenant have already been terminated by the caller, so there
+    // should be no more activity in any of the repositories.
+    //
+    // On error, log it but continue with the shutdown for other tenants.
+    for tenant in tenants_to_shut_down {
+        let tenant_id = tenant.tenant_id();
+        debug!("shutdown tenant {tenant_id}");
+
+        if let Err(err) = tenant.freeze_and_flush().await {
+            error!("Could not flush tenant {tenant_id} during shutdown: {err:?}");
+        }
+    }
+}
+
+pub async fn create_tenant(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    tenant_id: TenantId,
+    remote_storage: Option,
+) -> anyhow::Result>> {
+    match TENANTS.write().await.entry(tenant_id) {
+        hash_map::Entry::Occupied(_) => {
+            debug!("tenant {tenant_id} already exists");
+            Ok(None)
+        }
+        hash_map::Entry::Vacant(v) => {
+            // Hold the write_tenants() lock, since all of this is local IO.
+            // If this section ever becomes contentious, introduce a new `TenantState::Creating`.
+            let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
+            let created_tenant =
+                schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
+            let created_tenant_id = created_tenant.tenant_id();
+            anyhow::ensure!(
+                tenant_id == created_tenant_id,
+                "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {created_tenant_id})",
+            );
+            v.insert(Arc::clone(&created_tenant));
+            Ok(Some(created_tenant))
+        }
+    }
+}
+
+pub async fn update_tenant_config(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    tenant_id: TenantId,
+) -> anyhow::Result<()> {
+    info!("configuring tenant {tenant_id}");
+    get_tenant(tenant_id, true)
+        .await?
+        .update_tenant_config(tenant_conf);
+    Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
+    Ok(())
+}
+
+/// Gets the tenant from the in-memory data, erroring if it's absent or does not fit the query.
+/// `active_only = true` allows querying only tenants that are ready for operations, erroring on other kinds of tenants.
+pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> {
+    let m = TENANTS.read().await;
+    let tenant = m
+        .get(&tenant_id)
+        .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
+    if active_only && !tenant.is_active() {
+        anyhow::bail!(
+            "Tenant {tenant_id} is not active.
Current state: {:?}",
+            tenant.current_state()
+        )
+    } else {
+        Ok(Arc::clone(tenant))
+    }
+}
+
+pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
+    match get_tenant(tenant_id, true).await {
+        Ok(tenant) => {
+            tenant.delete_timeline(timeline_id).await?;
+        }
+        Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
+    }
+
+    Ok(())
+}
+
+pub async fn detach_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+) -> anyhow::Result<()> {
+    remove_tenant_from_memory(tenant_id, async {
+        let local_tenant_directory = conf.tenant_path(&tenant_id);
+        fs::remove_dir_all(&local_tenant_directory)
+            .await
+            .with_context(|| {
+                format!("Failed to remove local tenant directory {local_tenant_directory:?}")
+            })?;
+        Ok(())
+    })
+    .await
+}
+
+pub async fn load_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    remote_storage: Option,
+) -> anyhow::Result<()> {
+    run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
+        let tenant_path = conf.tenant_path(&tenant_id);
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
+        if tenant_ignore_mark.exists() {
+            std::fs::remove_file(&tenant_ignore_mark)
+                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
+        }
+
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
+            .with_context(|| {
+                format!("Failed to schedule tenant processing in path {tenant_path:?}")
+            })?;
+
+        vacant_entry.insert(new_tenant);
+        Ok(())
+    }).await
+}
+
+pub async fn ignore_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+) -> anyhow::Result<()> {
+    remove_tenant_from_memory(tenant_id, async {
+        let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
+        fs::File::create(&ignore_mark_file)
+            .await
+            .context("Failed to create ignore mark file")
+            .and_then(|_| {
+                crashsafe::fsync_file_and_parent(&ignore_mark_file)
+                    .context("Failed to fsync ignore mark file")
+            })
+            .with_context(|| format!("Failed to create ignore mark for tenant {tenant_id}"))?;
+        Ok(())
+    })
+    .await
+}
+
+///
+/// Get list of tenants, for the mgmt API
+///
+pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
+    TENANTS
+        .read()
+        .await
+        .iter()
+        .map(|(id, tenant)| (*id, tenant.current_state()))
+        .collect()
+}
+
+/// Execute Attach mgmt API command.
+///
+/// Downloading all the tenant data is performed in the background; this merely
+/// spawns the background task and returns quickly.
+pub async fn attach_tenant(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    remote_storage: GenericRemoteStorage,
+) -> anyhow::Result<()> {
+    run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
+        let tenant_path = conf.tenant_path(&tenant_id);
+        anyhow::ensure!(
+            !tenant_path.exists(),
+            "Cannot attach tenant {tenant_id}, local tenant directory already exists"
+        );
+
+        let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
+        vacant_entry.insert(tenant);
+
+        Ok(())
+    })
+    .await
+}
+
+async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result
+where
+    F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result,
+{
+    match TENANTS.write().await.entry(tenant_id) {
+        hash_map::Entry::Occupied(e) => {
+            anyhow::bail!(
+                "tenant {tenant_id} already exists, state: {:?}",
+                e.get().current_state()
+            )
+        }
+        hash_map::Entry::Vacant(v) => run(v),
+    }
+}
+
+/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already; bails otherwise.
+/// Allows removing other tenant resources manually, via `tenant_cleanup`.
+/// If the cleanup fails, the tenant will stay in memory in [`TenantState::Broken`] state, and another removal
+/// operation would be needed to remove it.
+async fn remove_tenant_from_memory(
+    tenant_id: TenantId,
+    tenant_cleanup: F,
+) -> anyhow::Result
+where
+    F: std::future::Future>,
+{
+    // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
+    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
+    // Tenant-wide cleanup operations may take some time (removing the entire tenant directory), so we want to
+    // avoid holding the lock for the entire process.
+    {
+        let tenants_accessor = TENANTS.write().await;
+        match tenants_accessor.get(&tenant_id) {
+            Some(tenant) => match tenant.current_state() {
+                TenantState::Attaching
+                | TenantState::Loading
+                | TenantState::Broken
+                | TenantState::Active => tenant.set_stopping(),
+                TenantState::Stopping => {
+                    anyhow::bail!("Tenant {tenant_id} is stopping already")
+                }
+            },
+            None => anyhow::bail!("Tenant not found for id {tenant_id}"),
+        }
+    }
+
+    // Shut down all tenant and timeline tasks (gc, compaction, page service).
+    // No new tasks will be started for this tenant because it's in `Stopping` state.
+    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
+ task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; + + match tenant_cleanup + .await + .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}")) + { + Ok(hook_value) => { + let mut tenants_accessor = TENANTS.write().await; + if tenants_accessor.remove(&tenant_id).is_none() { + warn!("Tenant {tenant_id} got removed from memory before operation finished"); + } + Ok(hook_value) + } + Err(e) => { + let tenants_accessor = TENANTS.read().await; + match tenants_accessor.get(&tenant_id) { + Some(tenant) => tenant.set_broken(), + None => warn!("Tenant {tenant_id} got removed from memory"), + } + Err(e) + } + } +} + +#[cfg(feature = "testing")] +use { + crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, + utils::http::error::ApiError, +}; + +#[cfg(feature = "testing")] +pub async fn immediate_gc( + tenant_id: TenantId, + timeline_id: TimelineId, + gc_req: TimelineGcRequest, +) -> Result>, ApiError> { + let guard = TENANTS.read().await; + + let tenant = guard + .get(&tenant_id) + .map(Arc::clone) + .with_context(|| format!("Tenant {tenant_id} not found")) + .map_err(ApiError::NotFound)?; + + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); + // Use tenant's pitr setting + let pitr = tenant.get_pitr_interval(); + + // Run in task_mgr to avoid race with detach operation + let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::GarbageCollector, + Some(tenant_id), + Some(timeline_id), + &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), + false, + async move { + fail::fail_point!("immediate_gc_task_pre"); + let result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. + match task_done.send(result) { + Ok(_) => (), + Err(result) => error!("failed to send gc result: {result:?}"), + } + Ok(()) + } + ); + + // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task + drop(guard); + + Ok(wait_task_done) +} diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/tenant/remote_timeline_client.rs similarity index 74% rename from pageserver/src/storage_sync2.rs rename to pageserver/src/tenant/remote_timeline_client.rs index 0b17c3fc42..45988ff47a 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -32,7 +32,8 @@ //! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]: //! //! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file. -//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file. +//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads //! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files. //! //! Internally, these functions create [`UploadOp`]s and put them in a queue. @@ -57,7 +58,7 @@ //! To have a consistent remote structure, it's important that uploads and //! deletions are performed in the right order. 
For example, the index file //! contains a list of layer files, so it must not be uploaded until all the -//! layer files that are in its list have been succesfully uploaded. +//! layer files that are in its list have been successfully uploaded. //! //! The contract between client and its user is that the user is responsible of //! scheduling operations in an order that keeps the remote consistent as @@ -139,7 +140,7 @@ //! Note that if we crash during file deletion between the index update //! that removes the file from the list of files, and deleting the remote file, //! the file is leaked in the remote storage. Similarly, if a new file is created -//! and uploaded, but the pageserver dies permantently before updating the +//! and uploaded, but the pageserver dies permanently before updating the //! remote index file, the new file is leaked in remote storage. We accept and //! tolerate that for now. //! Note further that we cannot easily fix this by scheduling deletes for every @@ -147,31 +148,43 @@ //! following two cases: //! - (1) We had the file locally, deleted it locally, scheduled a remote delete, //! but crashed before it finished remotely. -//! - (2) We never had the file locally because we were still in tenant attach -//! when we crashed. (Similar case for on-demand download in the future.) +//! - (2) We never had the file locally because we haven't on-demand downloaded +//! it yet. //! -//! # Downloads (= Tenant Attach) +//! # Downloads //! //! In addition to the upload queue, [`RemoteTimelineClient`] has functions for -//! downloading files from the remote storage. Downloads are performed immediately, -//! independently of the uploads. +//! downloading files from the remote storage. Downloads are performed immediately +//! against the `RemoteStorage`, independently of the upload queue. //! //! When we attach a tenant, we perform the following steps: //! - create `Tenant` object in `TenantState::Attaching` state -//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s -//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart` -//! - eagerly download all the remote layers using the client's download APIs -//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state. +//! - List timelines that are present in remote storage, and for each: +//! - download their remote [`IndexPart`]s +//! - create `Timeline` struct and a `RemoteTimelineClient` +//! - initialize the client's upload queue with its `IndexPart` +//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart` +//! but not present locally +//! - schedule uploads for layers that are only present locally. +//! - if the remote `IndexPart`'s metadata was newer than the metadata in +//! the local filesystem, write the remote metadata to the local filesystem +//! - After the above is done for each timeline, open the tenant for business by +//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state. +//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops. //! -//! Most of the above happens in [`Timeline::reconcile_with_remote`]. +//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers. //! We keep track of the fact that a client is in `Attaching` state in a marker -//! file on the local disk. -//! However, the distinction is moot for storage sync since we call -//! 
`reconcile_with_remote` for tenants both with and without the marker file. -//! -//! In the future, downloading will be done on-demand and `reconcile_with_remote` -//! will only be responsible for re-scheduling upload ops after a crash of an -//! `Active` tenant. +//! file on the local disk. This is critical because, when we restart the pageserver, +//! we do not want to do the `List timelines` step for each tenant that has already +//! been successfully attached (for performance & cost reasons). +//! Instead, for a tenant without the attach marker file, we assume that the +//! local state is in sync or ahead of the remote state. This includes the list +//! of all of the tenant's timelines, which is particularly critical to be up-to-date: +//! if there's a timeline on the remote that the pageserver doesn't know about, +//! the GC will not consider its branch point, leading to data loss. +//! So, for a tenant with the attach marker file, we know that we do not yet have +//! persisted all the remote timeline's metadata files locally. To exclude the +//! risk above, we re-run the procedure for such tenants //! //! # Operating Without Remote Storage //! @@ -194,39 +207,51 @@ mod upload; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; -use std::collections::{HashMap, VecDeque}; -use std::fmt::Debug; -use std::ops::DerefMut; -use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; +use std::ops::DerefMut; use tokio::runtime::Runtime; -use tracing::{error, info, warn}; +use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; - use utils::lsn::Lsn; -use self::index::IndexPart; - -use crate::metrics::MeasureRemoteOp; use crate::metrics::RemoteOpFileKind; use crate::metrics::RemoteOpKind; -use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS; +use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics}; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; use crate::{ config::PageServerConf, - storage_sync::index::{LayerFileMetadata, RelativePath}, task_mgr, task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata, + tenant::upload_queue::{ + UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, + }, {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; +use self::index::IndexPart; + +use super::storage_layer::LayerFileName; + +// Occasional network issues and such can cause remote operations to fail, and +// that's expected. If a download fails, we log it at info-level, and retry. +// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN +// level instead, as repeated failures can mean a more serious problem. If it +// fails more than FAILED_DOWNLOAD_RETRIES times, we give up +const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +const FAILED_DOWNLOAD_RETRIES: u32 = 10; + +// Similarly log failed uploads and deletions at WARN level, after this many +// retries. Uploads and deletions are retried forever, though. +const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; + /// A client for accessing a timeline's data in remote storage. 
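The three constants above encode a simple escalation policy: a few failures are expected (rate limits, transient network problems), so early attempts are logged at INFO, and only persistent failures are promoted to WARN; downloads additionally give up after FAILED_DOWNLOAD_RETRIES. A sketch of just the level selection:

    use tracing::{info, warn};

    const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

    fn log_retry(op: &str, attempt: u32, err: &anyhow::Error) {
        if attempt < FAILED_UPLOAD_WARN_THRESHOLD {
            // Expected and transient; keep the noise down.
            info!("failed to perform remote task {op}, will retry (attempt {attempt}): {err:#}");
        } else {
            // Repeated failures are more likely a real problem.
            warn!("failed to perform remote task {op}, will retry (attempt {attempt}): {err:?}");
        }
    }

The same thresholds drive both `perform_upload_task` and `download_retry` further down in this diff.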
/// /// This takes care of managing the number of connections, and balancing them @@ -256,207 +281,42 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, + metrics: Arc, + storage_impl: GenericRemoteStorage, } -// clippy warns that Uninitialized is much smaller than Initialized, which wastes -// memory for Uninitialized variants. Doesn't matter in practice, there are not -// that many upload queues in a running pageserver, and most of them are initialized -// anyway. -#[allow(clippy::large_enum_variant)] -enum UploadQueue { - Uninitialized, - Initialized(UploadQueueInitialized), - Stopped(UploadQueueStopped), -} - -impl UploadQueue { - fn as_str(&self) -> &'static str { - match self { - UploadQueue::Uninitialized => "Uninitialized", - UploadQueue::Initialized(_) => "Initialized", - UploadQueue::Stopped(_) => "Stopped", - } - } -} - -/// This keeps track of queued and in-progress tasks. -struct UploadQueueInitialized { - /// Counter to assign task IDs - task_counter: u64, - - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - latest_files: HashMap, - - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - latest_metadata: TimelineMetadata, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. `Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - last_uploaded_consistent_lsn: Lsn, - - // Breakdown of different kinds of tasks currently in-progress - num_inprogress_layer_uploads: usize, - num_inprogress_metadata_uploads: usize, - num_inprogress_deletions: usize, - - /// Tasks that are currently in-progress. In-progress means that a tokio Task - /// has been launched for it. An in-progress task can be busy uploading, but it can - /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can - /// be waiting for retry in `exponential_backoff`. - inprogress_tasks: HashMap>, - - /// Queued operations that have not been launched yet. They might depend on previous - /// tasks to finish. For example, metadata upload cannot be performed before all - /// preceding layer file uploads have completed. - queued_operations: VecDeque, -} - -struct UploadQueueStopped { - last_uploaded_consistent_lsn: Lsn, -} - -impl UploadQueue { - fn initialize_empty_remote( - &mut self, - metadata: &TimelineMetadata, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - info!("initializing upload queue for empty remote"); - - let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. - latest_files: Default::default(), - latest_metadata: metadata.clone(), - // We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent - // safekeepers from garbage-collecting anything. 
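The queue state machine being deleted here is not going away; it moves to `tenant::upload_queue` (see the new import block above). The property worth noting is that every scheduling entry point has to go through an accessor that refuses to hand out the queue unless it is in the `Initialized` state. A trimmed-down illustration of that guard (not the real `UploadQueue`):

    enum QueueState {
        Uninitialized,
        Initialized { queued: Vec<String> },
        Stopped,
    }

    impl QueueState {
        /// Mirrors `UploadQueue::initialized_mut`: callers get an error instead of
        /// silently operating on a queue that was never initialized or already stopped.
        fn initialized_mut(&mut self) -> anyhow::Result<&mut Vec<String>> {
            match self {
                QueueState::Initialized { queued } => Ok(queued),
                QueueState::Uninitialized => anyhow::bail!("queue is in state Uninitialized"),
                QueueState::Stopped => anyhow::bail!("queue is in state Stopped"),
            }
        }
    }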
- last_uploaded_consistent_lsn: Lsn(0), - // what follows are boring default initializations - task_counter: Default::default(), - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: Default::default(), - queued_operations: Default::default(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialize_with_current_remote_index_part( - &mut self, - index_part: &IndexPart, - ) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized => (), - UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { - anyhow::bail!("already initialized, state {}", self.as_str()) - } - } - - let mut files = HashMap::new(); - for path in &index_part.timeline_layers { - let layer_metadata = index_part - .layer_metadata - .get(path) - .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - files.insert(path.clone(), layer_metadata); - } - - let index_part_metadata = index_part.parse_metadata()?; - info!( - "initializing upload queue with remote index_part.disk_consistent_lsn: {}", - index_part_metadata.disk_consistent_lsn() - ); - - let state = UploadQueueInitialized { - latest_files: files, - latest_metadata: index_part_metadata.clone(), - last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), - // what follows are boring default initializations - task_counter: 0, - num_inprogress_layer_uploads: 0, - num_inprogress_metadata_uploads: 0, - num_inprogress_deletions: 0, - inprogress_tasks: Default::default(), - queued_operations: Default::default(), - }; - - *self = UploadQueue::Initialized(state); - Ok(self.initialized_mut().expect("we just set it")) - } - - fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { - match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", self.as_str()) - } - UploadQueue::Initialized(x) => Ok(x), - } - } -} - -/// An in-progress upload or delete task. -#[derive(Debug)] -struct UploadTask { - /// Unique ID of this task. Used as the key in `inprogress_tasks` above. - task_id: u64, - retries: AtomicU32, - - op: UploadOp, -} - -#[derive(Debug)] -enum UploadOp { - /// Upload a layer file - UploadLayer(PathBuf, LayerFileMetadata), - - /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), - - /// Delete a file. - Delete(RemoteOpFileKind, PathBuf), - - /// Barrier. When the barrier operation is reached, - Barrier(tokio::sync::watch::Sender<()>), -} - -impl std::fmt::Display for UploadOp { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - UploadOp::UploadLayer(path, metadata) => write!( - f, - "UploadLayer({}, size={:?})", - path.display(), - metadata.file_size() - ), - UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), - UploadOp::Delete(_, path) => write!(f, "Delete({})", path.display()), - UploadOp::Barrier(_) => write!(f, "Barrier"), - } - } -} - impl RemoteTimelineClient { + /// + /// Create a remote storage client for given timeline + /// + /// Note: the caller must initialize the upload queue before any uploads can be scheduled, + /// by calling init_upload_queue. 
+ /// + pub fn new( + remote_storage: GenericRemoteStorage, + conf: &'static PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + Ok(RemoteTimelineClient { + conf, + runtime: &BACKGROUND_RUNTIME, + tenant_id, + timeline_id, + storage_impl: remote_storage, + upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)), + }) + } + /// Initialize the upload queue for a remote storage that already received /// an index file upload, i.e., it's not empty. /// The given `index_part` must be the one on the remote. pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); Ok(()) } @@ -468,6 +328,7 @@ impl RemoteTimelineClient { ) -> anyhow::Result<()> { let mut upload_queue = self.upload_queue.lock().unwrap(); upload_queue.initialize_empty_remote(local_metadata)?; + self.update_remote_physical_size_gauge(None); Ok(()) } @@ -479,6 +340,24 @@ impl RemoteTimelineClient { } } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { + let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { + current_remote_index_part + .layer_metadata + .values() + // If we don't have the file size for the layer, don't account for it in the metric. + .map(|ilmd| ilmd.file_size.unwrap_or(0)) + .sum() + } else { + 0 + }; + self.metrics.remote_physical_size_gauge().set(size); + } + + pub fn get_remote_physical_size(&self) -> u64 { + self.metrics.remote_physical_size_gauge().get() + } + // // Download operations. // @@ -499,6 +378,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, + Arc::clone(&self.metrics), ) .await } @@ -510,7 +390,7 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - path: &RelativePath, + layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result { let downloaded_size = download::download_layer_file( @@ -518,7 +398,7 @@ impl RemoteTimelineClient { &self.storage_impl, self.tenant_id, self.timeline_id, - path, + layer_file_name, layer_metadata, ) .measure_remote_op( @@ -526,6 +406,7 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Download, + Arc::clone(&self.metrics), ) .await?; @@ -536,13 +417,23 @@ impl RemoteTimelineClient { let new_metadata = LayerFileMetadata::new(downloaded_size); let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - if let Some(upgraded) = upload_queue.latest_files.get_mut(path) { - upgraded.merge(&new_metadata); + if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) { + if upgraded.merge(&new_metadata) { + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; + } + // If we don't do an index file upload inbetween here and restart, + // the value will go back down after pageserver restart, since we will + // have lost this data point. + // But, we upload index part fairly frequently, and restart pageserver rarely. + // So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner. 
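`update_remote_physical_size_gauge` above is just a sum over the optional per-layer sizes, counting unknown sizes (entries written by older index versions) as zero; the eager `add(downloaded_size)` in `download_layer_file` keeps the gauge roughly accurate between index uploads. The aggregation, pulled out on its own:

    /// Sum the sizes recorded in the index; layers whose size is unknown
    /// (uploaded by an older pageserver version) contribute zero.
    fn remote_physical_size(file_sizes: impl Iterator<Item = Option<u64>>) -> u64 {
        file_sizes.map(|size| size.unwrap_or(0)).sum()
    }

    // remote_physical_size([Some(25_600_000), None, Some(1024)].into_iter()) == 25_601_024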
+ self.metrics + .remote_physical_size_gauge() + .add(downloaded_size); } else { // The file should exist, since we just downloaded it. warn!( "downloaded file {:?} not found in local copy of the index file", - path + layer_file_name ); } } @@ -554,14 +445,20 @@ impl RemoteTimelineClient { // /// - /// Launch an index-file upload operation in the background. + /// Launch an index-file upload operation in the background, with + /// updated metadata. /// /// The upload will be added to the queue immediately, but it /// won't be performed until all previosuly scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z - /// exist in remote storage, they really do. - pub fn schedule_index_upload( + /// exist in remote storage, they really do. To wait for the upload + /// to complete, use `wait_completion`. + /// + /// If there were any changes to the list of files, i.e. if any + /// layer file uploads were scheduled, since the last index file + /// upload, those will be included too. + pub fn schedule_index_upload_for_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -572,26 +469,60 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.latest_metadata = metadata.clone(); + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + + Ok(()) + } + + /// + /// Launch an index-file upload operation in the background, if necessary. + /// + /// Use this function to schedule the update of the index file after + /// scheduling file uploads or deletions. If no file uploads or deletions + /// have been scheduled since the last index file upload, this does + /// nothing. + /// + /// Like schedule_index_upload_for_metadata_update(), this merely adds + /// the upload to the upload queue and returns quickly. 
+ pub fn schedule_index_upload_for_file_changes(self: &Arc) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; + self.schedule_index_upload(upload_queue, metadata_bytes); + } + + Ok(()) + } + + /// Launch an index-file upload operation in the background (internal function) + fn schedule_index_upload( + self: &Arc, + upload_queue: &mut UploadQueueInitialized, + metadata_bytes: Vec, + ) { + info!( + "scheduling metadata upload with {} files ({} changed)", + upload_queue.latest_files.len(), + upload_queue.latest_files_changes_since_metadata_upload_scheduled, + ); + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, - upload_queue.latest_metadata.to_bytes()?, + metadata_bytes, ); let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - - info!( - "scheduled metadata upload with {} files", - upload_queue.latest_files.len() - ); + upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - - Ok(()) } /// @@ -599,7 +530,7 @@ impl RemoteTimelineClient { /// pub fn schedule_layer_file_upload( self: &Arc, - path: &Path, + layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); @@ -612,20 +543,19 @@ impl RemoteTimelineClient { "file size not initialized in metadata" ); - let relative_path = RelativePath::from_local_path( - &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - path, - )?; - upload_queue .latest_files - .insert(relative_path, layer_metadata.clone()); + .insert(layer_file_name.clone(), layer_metadata.clone()); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone()); + let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone()); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - info!("scheduled layer file upload {}", path.display()); + info!( + "scheduled layer file upload {}", + layer_file_name.file_name() + ); // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); @@ -635,25 +565,21 @@ impl RemoteTimelineClient { /// /// Launch a delete operation in the background. /// - /// The deletion won't actually be performed, until all preceding - /// upload operations have completed succesfully. - pub fn schedule_layer_file_deletion(self: &Arc, paths: &[PathBuf]) -> anyhow::Result<()> { + /// Note: This schedules an index file upload before the deletions. The + /// deletion won't actually be performed, until any previously scheduled + /// upload operations, and the index file upload, have completed + /// succesfully. + /// + pub fn schedule_layer_file_deletion( + self: &Arc, + names: &[LayerFileName], + ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - // Convert the paths into RelativePaths, and gather other information we need. 
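Taken together, the scheduling entry points are meant to be used fire-and-forget: queue the layer work, queue one index upload that captures all accumulated file changes, and only block if a caller actually needs the remote state to be durable. A usage sketch, assuming the relevant types are in scope and the client's upload queue has already been initialized; the wrapper function and the `wait_completion` signature are assumptions for illustration, not code from this diff:

    use std::sync::Arc;

    /// Hypothetical caller, e.g. the tail end of a checkpoint or compaction.
    async fn publish_layer_changes(
        client: &Arc<RemoteTimelineClient>,
        new_layer: LayerFileName,
        new_layer_size: u64,
        replaced: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        // Queue the new layer file; this also bumps the "files changed" counter.
        client.schedule_layer_file_upload(&new_layer, &LayerFileMetadata::new(new_layer_size))?;
        // Queue deletions; internally this first queues an index upload so the
        // remote index never references a file that has already been deleted.
        client.schedule_layer_file_deletion(&replaced)?;
        // Flush any remaining file changes into one more index upload, if needed.
        client.schedule_index_upload_for_file_changes()?;
        // Optionally wait for everything above to reach remote storage (signature assumed).
        client.wait_completion().await?;
        Ok(())
    }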
- let mut relative_paths = Vec::with_capacity(paths.len()); - for path in paths { - relative_paths.push(RelativePath::from_local_path( - &self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - path, - )?); - } - // Deleting layers doesn't affect the values stored in TimelineMetadata, // so we don't need update it. Just serialize it. let metadata_bytes = upload_queue.latest_metadata.to_bytes()?; - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); // Update the remote index file, removing the to-be-deleted files from the index, // before deleting the actual files. @@ -663,25 +589,21 @@ impl RemoteTimelineClient { // from latest_files, but not yet scheduled for deletion. Use a closure // to syntactically forbid ? or bail! calls here. let no_bail_here = || { - for relative_path in relative_paths { - upload_queue.latest_files.remove(&relative_path); + for name in names { + upload_queue.latest_files.remove(name); + upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; } - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata_bytes, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - self.update_upload_queue_unfinished_metric(1, &op); - upload_queue.queued_operations.push_back(op); + if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { + self.schedule_index_upload(upload_queue, metadata_bytes); + } // schedule the actual deletions - for path in paths { - let op = UploadOp::Delete(RemoteOpFileKind::Layer, PathBuf::from(path)); + for name in names { + let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone()); self.update_upload_queue_unfinished_metric(1, &op); upload_queue.queued_operations.push_back(op); - info!("scheduled layer file deletion {}", path.display()); + info!("scheduled layer file deletion {}", name.file_name()); } // Launch the tasks immediately, if possible @@ -753,7 +675,7 @@ impl RemoteTimelineClient { // We can launch this task. Remove it from the queue first. 
let next_op = upload_queue.queued_operations.pop_front().unwrap(); - info!("starting op: {}", next_op); + debug!("starting op: {}", next_op); // Update the counters match next_op { @@ -837,18 +759,28 @@ impl RemoteTimelineClient { } let upload_result: anyhow::Result<()> = match &task.op { - UploadOp::UploadLayer(ref path, ref layer_metadata) => { - upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata) - .measure_remote_op( - self.tenant_id, - self.timeline_id, - RemoteOpFileKind::Layer, - RemoteOpKind::Upload, - ) - .await + UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => { + let path = &self + .conf + .timeline_path(&self.timeline_id, &self.tenant_id) + .join(layer_file_name.file_name()); + upload::upload_timeline_layer( + self.conf, + &self.storage_impl, + path, + layer_metadata, + ) + .measure_remote_op( + self.tenant_id, + self.timeline_id, + RemoteOpFileKind::Layer, + RemoteOpKind::Upload, + Arc::clone(&self.metrics), + ) + .await } UploadOp::UploadMetadata(ref index_part, _lsn) => { - upload::upload_index_part( + let res = upload::upload_index_part( self.conf, &self.storage_impl, self.tenant_id, @@ -860,16 +792,26 @@ impl RemoteTimelineClient { self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, + Arc::clone(&self.metrics), ) - .await + .await; + if res.is_ok() { + self.update_remote_physical_size_gauge(Some(index_part)); + } + res } - UploadOp::Delete(metric_file_kind, ref path) => { - delete::delete_layer(&self.storage_impl, path) + UploadOp::Delete(metric_file_kind, ref layer_file_name) => { + let path = &self + .conf + .timeline_path(&self.timeline_id, &self.tenant_id) + .join(layer_file_name.file_name()); + delete::delete_layer(self.conf, &self.storage_impl, path) .measure_remote_op( self.tenant_id, self.timeline_id, *metric_file_kind, RemoteOpKind::Delete, + Arc::clone(&self.metrics), ) .await } @@ -888,10 +830,22 @@ impl RemoteTimelineClient { Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); - error!( - "failed to perform remote task {}, will retry (attempt {}): {:?}", - task.op, retries, e - ); + // Uploads can fail due to rate limits (IAM, S3), spurious network problems, + // or other external reasons. Such issues are relatively regular, so log them + // at info level at first, and only WARN if the operation fails repeatedly. + // + // (See similar logic for downloads in `download::download_retry`) + if retries < FAILED_UPLOAD_WARN_THRESHOLD { + info!( + "failed to perform remote task {}, will retry (attempt {}): {:#}", + task.op, retries, e + ); + } else { + warn!( + "failed to perform remote task {}, will retry (attempt {}): {:?}", + task.op, retries, e + ); + } // sleep until it's time to retry, or we're cancelled tokio::select! { @@ -913,7 +867,7 @@ impl RemoteTimelineClient { task.op, retries ); } else { - info!("remote task {} completed successfully", task.op); + debug!("remote task {} completed successfully", task.op); } // The task has completed succesfully. Remove it from the in-progress list. 
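The `tokio::select!` whose body falls outside this hunk races the retry backoff against task shutdown, so that a detach or pageserver shutdown does not have to sit out a long backoff. A generic sketch of that shape, with a `watch` channel standing in for whatever shutdown signal `task_mgr` provides:

    use std::time::Duration;
    use tokio::sync::watch;

    /// Returns true if we were cancelled while waiting, false if the backoff elapsed.
    async fn sleep_or_cancel(backoff: Duration, cancel: &mut watch::Receiver<bool>) -> bool {
        tokio::select! {
            _ = tokio::time::sleep(backoff) => false,
            _ = cancel.changed() => true,
        }
    }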
@@ -960,14 +914,8 @@ impl RemoteTimelineClient { return; } }; - REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - file_kind.as_str(), - op_kind.as_str(), - ]) - .unwrap() + self.metrics + .unfinished_tasks(&file_kind, &op_kind) .add(delta) } @@ -1032,34 +980,12 @@ impl RemoteTimelineClient { } } -/// -/// Create a remote storage client for given timeline -/// -/// Note: the caller must initialize the upload queue before any uploads can be scheduled, -/// by calling init_upload_queue. -/// -pub fn create_remote_timeline_client( - remote_storage: GenericRemoteStorage, - conf: &'static PageServerConf, - tenant_id: TenantId, - timeline_id: TimelineId, -) -> anyhow::Result { - Ok(RemoteTimelineClient { - conf, - runtime: &BACKGROUND_RUNTIME, - tenant_id, - timeline_id, - storage_impl: remote_storage, - upload_queue: Mutex::new(UploadQueue::Uninitialized), - }) -} - #[cfg(test)] mod tests { use super::*; use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; - use std::collections::HashSet; + use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { @@ -1083,15 +1009,11 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let xx = PathBuf::from(""); - let mut avec: Vec = a - .iter() - .map(|x| x.to_local_path(&xx).to_string_lossy().into()) - .collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); avec.sort(); - let mut bvec = b.to_owned(); + let mut bvec = b.to_vec(); bvec.sort_unstable(); assert_eq!(avec, bvec); @@ -1159,8 +1081,7 @@ mod tests { println!("workdir: {}", harness.conf.workdir.display()); - let storage_impl = - GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?; + let storage_impl = GenericRemoteStorage::from_config(&storage_config)?; let client = Arc::new(RemoteTimelineClient { conf: harness.conf, runtime, @@ -1168,6 +1089,10 @@ mod tests { timeline_id: TIMELINE_ID, storage_impl, upload_queue: Mutex::new(UploadQueue::Uninitialized), + metrics: Arc::new(RemoteTimelineClientMetrics::new( + &harness.tenant_id, + &TIMELINE_ID, + )), }); let remote_timeline_dir = @@ -1184,11 +1109,11 @@ mod tests { std::fs::write(timeline_path.join("bar"), &content_bar)?; client.schedule_layer_file_upload( - &timeline_path.join("foo"), + &LayerFileName::Test("foo".to_owned()), &LayerFileMetadata::new(content_foo.len() as u64), )?; client.schedule_layer_file_upload( - &timeline_path.join("bar"), + &LayerFileName::Test("bar".to_owned()), &LayerFileMetadata::new(content_bar.len() as u64), )?; @@ -1199,15 +1124,19 @@ mod tests { assert!(upload_queue.queued_operations.is_empty()); assert!(upload_queue.inprogress_tasks.len() == 2); assert!(upload_queue.num_inprogress_layer_uploads == 2); + + // also check that `latest_file_changes` was updated + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2); } // Schedule upload of index. 
Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); - client.schedule_index_upload(&metadata)?; + client.schedule_index_upload_for_metadata_update(&metadata)?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); assert!(upload_queue.queued_operations.len() == 1); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } // Wait for the uploads to finish @@ -1230,10 +1159,10 @@ mod tests { let content_baz = dummy_contents("baz"); std::fs::write(timeline_path.join("baz"), &content_baz)?; client.schedule_layer_file_upload( - &timeline_path.join("baz"), + &LayerFileName::Test("baz".to_owned()), &LayerFileMetadata::new(content_baz.len() as u64), )?; - client.schedule_layer_file_deletion(&[timeline_path.join("foo")])?; + client.schedule_layer_file_deletion(&[LayerFileName::Test("foo".to_owned())])?; { let mut guard = client.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut().unwrap(); @@ -1243,6 +1172,7 @@ mod tests { assert!(upload_queue.inprogress_tasks.len() == 1); assert!(upload_queue.num_inprogress_layer_uploads == 1); assert!(upload_queue.num_inprogress_deletions == 0); + assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0); } assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir); diff --git a/pageserver/src/tenant/remote_timeline_client/delete.rs b/pageserver/src/tenant/remote_timeline_client/delete.rs new file mode 100644 index 0000000000..9f6732fbff --- /dev/null +++ b/pageserver/src/tenant/remote_timeline_client/delete.rs @@ -0,0 +1,28 @@ +//! Helper functions to delete files from remote storage with a RemoteStorage +use anyhow::Context; +use std::path::Path; +use tracing::debug; + +use remote_storage::GenericRemoteStorage; + +use crate::config::PageServerConf; + +pub(super) async fn delete_layer<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + local_layer_path: &'a Path, +) -> anyhow::Result<()> { + fail::fail_point!("before-delete-layer", |_| { + anyhow::bail!("failpoint before-delete-layer") + }); + debug!("Deleting layer from remote storage: {local_layer_path:?}",); + + let path_to_delete = conf.remote_path(local_layer_path)?; + + // XXX: If the deletion fails because the object already didn't exist, + // it would be good to just issue a warning but consider it success. + // https://github.com/neondatabase/neon/issues/2934 + storage.delete(&path_to_delete).await.with_context(|| { + format!("Failed to delete remote layer from storage at {path_to_delete:?}") + }) +} diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs new file mode 100644 index 0000000000..422728d1f3 --- /dev/null +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -0,0 +1,331 @@ +//! Helper functions to download files from remote storage with a RemoteStorage +//! +//! The functions in this module retry failed operations automatically, according +//! to the FAILED_DOWNLOAD_RETRIES constant. 
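Failpoints such as `before-delete-layer` above (and `remote-storage-download-pre-rename` and `storage-sync-list-remote-timelines` further down in download.rs) are how tests inject failures at precise points in the upload and delete pipeline. With the `fail` crate's failpoints feature enabled, a test can flip one on and off roughly like this:

    // Make every fail_point!("before-delete-layer", ...) call take its error branch.
    fail::cfg("before-delete-layer", "return").unwrap();

    // ... exercise the code path that performs deletions, assert on the outcome ...

    // Restore normal behaviour.
    fail::cfg("before-delete-layer", "off").unwrap();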
+ +use std::collections::HashSet; +use std::future::Future; +use std::path::Path; + +use anyhow::{anyhow, Context}; +use futures::stream::{FuturesUnordered, StreamExt}; +use tokio::fs; +use tokio::io::AsyncWriteExt; +use tracing::{debug, error, info, info_span, warn, Instrument}; + +use crate::config::PageServerConf; +use crate::tenant::storage_layer::LayerFileName; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; +use remote_storage::{DownloadError, GenericRemoteStorage}; +use utils::crashsafe::path_with_suffix_extension; +use utils::id::{TenantId, TimelineId}; + +use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata}; +use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; + +async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { + fs::File::open(path).await?.sync_all().await +} + +/// +/// If 'metadata' is given, we will validate that the downloaded file's size matches that +/// in the metadata. (In the future, we might do more cross-checks, like CRC validation) +/// +/// Returns the size of the downloaded file. +pub async fn download_layer_file<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + tenant_id: TenantId, + timeline_id: TimelineId, + layer_file_name: &'a LayerFileName, + layer_metadata: &'a LayerFileMetadata, +) -> Result { + let timeline_path = conf.timeline_path(&timeline_id, &tenant_id); + + let local_path = timeline_path.join(layer_file_name.file_name()); + + let remote_path = conf + .remote_path(&local_path) + .map_err(DownloadError::Other)?; + + // Perform a rename inspired by durable_rename from file_utils.c. + // The sequence: + // write(tmp) + // fsync(tmp) + // rename(tmp, new) + // fsync(new) + // fsync(parent) + // For more context about durable_rename check this email from postgres mailing list: + // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com + // If pageserver crashes the temp file will be deleted on startup and re-downloaded. + let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + + let (mut destination_file, bytes_amount) = download_retry( + || async { + // TODO: this doesn't use the cached fd for some reason? + let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { + format!( + "Failed to create a destination file for layer '{}'", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + let mut download = storage.download(&remote_path).await.with_context(|| { + format!( + "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + ) + }) + .map_err(DownloadError::Other)?; + let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, + &format!("download {remote_path:?}"), + ).await?; + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. 
There shouldt be any because + // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. + destination_file + .flush() + .await + .with_context(|| { + format!( + "failed to flush source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + + match layer_metadata.file_size() { + Some(expected) if expected != bytes_amount => { + return Err(DownloadError::Other(anyhow!( + "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'", + temp_file_path.display() + ))); + } + Some(_) | None => { + // matches, or upgrading from an earlier IndexPart version + } + } + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| { + format!( + "failed to fsync source file at {}", + temp_file_path.display() + ) + }) + .map_err(DownloadError::Other)?; + drop(destination_file); + + fail::fail_point!("remote-storage-download-pre-rename", |_| { + Err(DownloadError::Other(anyhow!( + "remote-storage-download-pre-rename failpoint triggered" + ))) + }); + + fs::rename(&temp_file_path, &local_path) + .await + .with_context(|| { + format!( + "Could not rename download layer file to {}", + local_path.display(), + ) + }) + .map_err(DownloadError::Other)?; + + fsync_path(&local_path) + .await + .with_context(|| format!("Could not fsync layer file {}", local_path.display(),)) + .map_err(DownloadError::Other)?; + + tracing::info!("download complete: {}", local_path.display()); + + Ok(bytes_amount) +} + +const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; + +pub fn is_temp_download_file(path: &Path) -> bool { + let extension = path.extension().map(|pname| { + pname + .to_str() + .expect("paths passed to this function must be valid Rust strings") + }); + match extension { + Some(TEMP_DOWNLOAD_EXTENSION) => true, + Some(_) => false, + None => false, + } +} + +/// List timelines of given tenant in remote storage +pub async fn list_remote_timelines<'a>( + storage: &'a GenericRemoteStorage, + conf: &'static PageServerConf, + tenant_id: TenantId, +) -> anyhow::Result> { + let tenant_path = conf.timelines_path(&tenant_id); + let tenant_storage_path = conf.remote_path(&tenant_path)?; + + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let timelines = download_retry( + || storage.list_prefixes(Some(&tenant_storage_path)), + &format!("list prefixes for {tenant_path:?}"), + ) + .await?; + + if timelines.is_empty() { + anyhow::bail!("no timelines found on the remote storage") + } + + let mut timeline_ids = HashSet::new(); + let mut part_downloads = FuturesUnordered::new(); + + for timeline_remote_storage_key in timelines { + let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}") + })?; + + let timeline_id: TimelineId = object_name.parse().with_context(|| { + format!("failed to parse object name into timeline id '{object_name}'") + })?; + + // list_prefixes returns all files with the prefix. If we haven't seen this timeline ID + // yet, launch a download task for it. 
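`list_remote_timelines` fans out one `index_part.json` download per distinct timeline ID and collects the results in completion order; the `HashSet` is needed because `list_prefixes` returns every object under the tenant prefix, not one entry per timeline. The fan-out itself is the standard `FuturesUnordered` pattern, shown here in isolation:

    use futures::stream::{FuturesUnordered, StreamExt};

    /// Run one async fetch per id concurrently, collecting results as they finish.
    async fn fetch_all(ids: Vec<u32>) -> Vec<(u32, String)> {
        let mut tasks = FuturesUnordered::new();
        for id in ids {
            tasks.push(async move { (id, format!("payload-{id}")) });
        }
        let mut results = Vec::new();
        while let Some(item) = tasks.next().await {
            results.push(item);
        }
        results
    }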
+ if !timeline_ids.contains(&timeline_id) { + timeline_ids.insert(timeline_id); + let storage_clone = storage.clone(); + part_downloads.push(async move { + ( + timeline_id, + download_index_part(conf, &storage_clone, tenant_id, timeline_id) + .instrument(info_span!("download_index_part", timeline=%timeline_id)) + .await, + ) + }); + } + } + + // Wait for all the download tasks to complete. + let mut timeline_parts = Vec::new(); + while let Some((timeline_id, part_upload_result)) = part_downloads.next().await { + let index_part = part_upload_result + .with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?; + + debug!("Successfully fetched index part for timeline {timeline_id}"); + timeline_parts.push((timeline_id, index_part)); + } + Ok(timeline_parts) +} + +pub async fn download_index_part( + conf: &'static PageServerConf, + storage: &GenericRemoteStorage, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result { + let index_part_path = conf + .metadata_path(timeline_id, tenant_id) + .with_file_name(IndexPart::FILE_NAME); + let part_storage_path = conf + .remote_path(&index_part_path) + .map_err(DownloadError::BadInput)?; + + let index_part_bytes = download_retry( + || async { + let mut index_part_download = storage.download(&part_storage_path).await?; + + let mut index_part_bytes = Vec::new(); + tokio::io::copy( + &mut index_part_download.download_stream, + &mut index_part_bytes, + ) + .await + .with_context(|| { + format!("Failed to download an index part into file {index_part_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok(index_part_bytes) + }, + &format!("download {part_storage_path:?}"), + ) + .await?; + + let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes) + .with_context(|| { + format!("Failed to deserialize index part file into file {index_part_path:?}") + }) + .map_err(DownloadError::Other)?; + + let index_part = index_part.remove_unclean_layer_file_names(); + + Ok(index_part) +} + +/// +/// Helper function to handle retries for a download operation. +/// +/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, +/// with backoff. +/// +/// (See similar logic for uploads in `perform_upload_task`) +async fn download_retry(mut op: O, description: &str) -> Result +where + O: FnMut() -> F, + F: Future>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { + info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { + warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(DownloadError::Other(ref err)) => { + // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. 
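`download_retry` itself only classifies errors and loops; the actual sleep comes from the crate-level `exponential_backoff(attempts, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS)` call that closes the loop just below. That helper is defined elsewhere in the crate and is not shown in this diff; a typical capped-exponential implementation of that signature might look like this (sketch only, the real function may add jitter or differ in detail):

    use std::time::Duration;

    /// Sleep 0s on the first attempt, then base * 2^(n-1) seconds, capped at `max_seconds`.
    async fn exponential_backoff_sketch(attempt: u32, base_seconds: f64, max_seconds: f64) {
        if attempt == 0 {
            return;
        }
        let seconds = (base_seconds * 2f64.powi(attempt as i32 - 1)).min(max_seconds);
        tokio::time::sleep(Duration::from_secs_f64(seconds)).await;
    }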
+ error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} diff --git a/pageserver/src/storage_sync2/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs similarity index 50% rename from pageserver/src/storage_sync2/index.rs rename to pageserver/src/tenant/remote_timeline_client/index.rs index b1f43dcb93..c199b7e10b 100644 --- a/pageserver/src/storage_sync2/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -2,46 +2,16 @@ //! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about //! remote timeline layers and its metadata. -use std::{ - collections::{HashMap, HashSet}, - path::{Path, PathBuf}, -}; +use std::collections::{HashMap, HashSet}; -use anyhow::{Context, Ok}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use tracing::warn; -use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName}; use utils::lsn::Lsn; -/// A part of the filesystem path, that needs a root to become a path again. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] -#[serde(transparent)] -pub struct RelativePath(String); - -impl RelativePath { - /// Attempts to strip off the base from path, producing a relative path or an error. - pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result { - let relative = path.strip_prefix(timeline_path).with_context(|| { - format!( - "path '{}' is not relative to base '{}'", - path.display(), - timeline_path.display() - ) - })?; - Ok(Self::from_filename(relative)) - } - - pub fn from_filename(path: &Path) -> RelativePath { - RelativePath(path.to_string_lossy().to_string()) - } - - pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf { - timeline_path.join(&self.0) - } -} - /// Metadata gathered for each of the layer files. /// /// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which @@ -78,9 +48,17 @@ impl LayerFileMetadata { /// Metadata has holes due to version upgrades. This method is called to upgrade self with the /// other value. /// - /// This is called on the possibly outdated version. - pub fn merge(&mut self, other: &Self) { - self.file_size = other.file_size.or(self.file_size); + /// This is called on the possibly outdated version. Returns true if any changes + /// were made. + pub fn merge(&mut self, other: &Self) -> bool { + let mut changed = false; + + if self.file_size != other.file_size { + self.file_size = other.file_size.or(self.file_size); + changed = true; + } + + changed } } @@ -92,26 +70,25 @@ impl LayerFileMetadata { /// remember to add a test case for the changed version. #[serde_as] #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexPart { +pub struct IndexPartImpl +where + L: std::hash::Hash + PartialEq + Eq, +{ /// Debugging aid describing the version of this type. #[serde(default)] version: usize, - /// Each of the layers present on remote storage. + /// Layer names, which are stored on the remote storage. /// /// Additional metadata can might exist in `layer_metadata`. - pub timeline_layers: HashSet, + pub timeline_layers: HashSet, - /// FIXME: unused field. 
This should be removed, but that changes the on-disk format, - /// so we need to make sure we're backwards- (and maybe forwards-) compatible - missing_layers: HashSet, - - /// Per layer file metadata, which can be present for a present or missing layer file. + /// Per layer file name metadata, which can be present for a present or missing layer file. /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - #[serde(default)] - pub layer_metadata: HashMap, + #[serde(default = "HashMap::default")] + pub layer_metadata: HashMap, // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. // It's duplicated here for convenience. @@ -120,6 +97,101 @@ pub struct IndexPart { metadata_bytes: Vec, } +// TODO seems like another part of the remote storage file format +// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 +pub type IndexPart = IndexPartImpl; + +pub type IndexPartUnclean = IndexPartImpl; + +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub enum UncleanLayerFileName { + Clean(LayerFileName), + BackupFile(String), +} + +impl<'de> serde::Deserialize<'de> for UncleanLayerFileName { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_string(UncleanLayerFileNameVisitor) + } +} + +struct UncleanLayerFileNameVisitor; + +impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor { + type Value = UncleanLayerFileName; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + formatter, + "a string that is a valid LayerFileName or '.old' backup file name" + ) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + let maybe_clean: Result = v.parse(); + match maybe_clean { + Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)), + Err(e) => { + if v.ends_with(".old") || v == "metadata_backup" { + Ok(UncleanLayerFileName::BackupFile(v.to_owned())) + } else { + Err(E::custom(e)) + } + } + } + } +} + +impl UncleanLayerFileName { + fn into_clean(self) -> Option { + match self { + UncleanLayerFileName::Clean(clean) => Some(clean), + UncleanLayerFileName::BackupFile(_) => None, + } + } +} + +impl IndexPartUnclean { + pub fn remove_unclean_layer_file_names(self) -> IndexPart { + let IndexPartUnclean { + version, + timeline_layers, + layer_metadata, + disk_consistent_lsn, + metadata_bytes, + } = self; + + IndexPart { + version, + timeline_layers: timeline_layers + .into_iter() + .filter_map(|unclean_file_name| match unclean_file_name { + UncleanLayerFileName::Clean(clean_name) => Some(clean_name), + UncleanLayerFileName::BackupFile(backup_file_name) => { + // For details see https://github.com/neondatabase/neon/issues/3024 + warn!( + "got backup file on the remote storage, ignoring it {backup_file_name}" + ); + None + } + }) + .collect(), + layer_metadata: layer_metadata + .into_iter() + .filter_map(|(l, m)| l.into_clean().map(|l| (l, m))) + .collect(), + disk_consistent_lsn, + metadata_bytes, + } + } +} + impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. 
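The `IndexPartUnclean` / `IndexPart` split above exists so that deserialization never fails on legacy entries: the file is first parsed leniently, and only then are `.old` / `metadata_backup` names dropped (with a warning) to obtain a clean `IndexPart`. The flow that `download_index_part` follows boils down to (helper name here is just for illustration):

    fn parse_index_part(bytes: &[u8]) -> anyhow::Result<IndexPart> {
        // Lenient pass: backup file names become UncleanLayerFileName::BackupFile
        // instead of failing the whole index_part.json.
        let unclean: IndexPartUnclean = serde_json::from_slice(bytes)?;
        // Cleanup pass: drop the backup entries, keep only real layer file names.
        Ok(unclean.remove_unclean_layer_file_names())
    }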
@@ -129,23 +201,22 @@ impl IndexPart { pub const FILE_NAME: &'static str = "index_part.json"; pub fn new( - layers_and_metadata: HashMap, + layers_and_metadata: HashMap, disk_consistent_lsn: Lsn, metadata_bytes: Vec, ) -> Self { - let mut timeline_layers = HashSet::new(); - let mut layer_metadata = HashMap::new(); + let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len()); + let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len()); - separate_paths_and_metadata( - &layers_and_metadata, - &mut timeline_layers, - &mut layer_metadata, - ); + for (remote_name, metadata) in &layers_and_metadata { + timeline_layers.insert(remote_name.to_owned()); + let metadata = IndexLayerMetadata::from(metadata); + layer_metadata.insert(remote_name.to_owned(), metadata); + } Self { version: Self::LATEST_VERSION, timeline_layers, - missing_layers: HashSet::new(), layer_metadata, disk_consistent_lsn, metadata_bytes, @@ -160,7 +231,7 @@ impl IndexPart { /// Serialized form of [`LayerFileMetadata`]. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] pub struct IndexLayerMetadata { - file_size: Option, + pub(super) file_size: Option, } impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { @@ -171,18 +242,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata { } } -fn separate_paths_and_metadata( - input: &HashMap, - output: &mut HashSet, - layer_metadata: &mut HashMap, -) { - for (path, metadata) in input { - let metadata = IndexLayerMetadata::from(metadata); - layer_metadata.insert(path.clone(), metadata); - output.insert(path.clone()); - } -} - #[cfg(test)] mod tests { use super::*; @@ -191,21 +250,20 @@ mod tests { fn v0_indexpart_is_parsed() { let example = r#"{ "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["not_a_real_layer_but_adding_coverage"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] }"#; let expected = IndexPart { version: 0, - timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: 
HashMap::default(), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example).unwrap(); + let part: IndexPartUnclean = serde_json::from_str(example).unwrap(); + let part = part.remove_unclean_layer_file_names(); assert_eq!(part, expected); } @@ -214,10 +272,9 @@ mod tests { let example = r#"{ "version":1, "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], - "missing_layers":["not_a_real_layer_but_adding_coverage"], "layer_metadata":{ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, - "not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } }, "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -226,13 +283,12 @@ mod tests { let expected = IndexPart { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, - timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(), - missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(), + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), layer_metadata: HashMap::from([ - (RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { file_size: Some(25600000), }), - (RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata { + (LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: Some(9007199254741001), @@ -242,7 +298,46 @@ mod tests { metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), }; - let part = serde_json::from_str::(example).unwrap(); + let part = serde_json::from_str::(example) + .unwrap() + .remove_unclean_layer_file_names(); + assert_eq!(part, expected); + } + + #[test] + fn v1_indexpart_is_parsed_with_optional_missing_layers() { + let example = r#"{ + "version":1, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["This shouldn't fail deserialization"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + 
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 1, + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: Some(25600000), + }), + (LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: Some(9007199254741001), + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + }; + + let part = serde_json::from_str::(example).unwrap(); + let part = part.remove_unclean_layer_file_names(); assert_eq!(part, expected); } } diff --git a/pageserver/src/storage_sync2/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs similarity index 72% rename from pageserver/src/storage_sync2/upload.rs rename to pageserver/src/tenant/remote_timeline_client/upload.rs index b03a0f6ce7..5082fa1634 100644 --- a/pageserver/src/storage_sync2/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -5,12 +5,12 @@ use fail::fail_point; use std::path::Path; use tokio::fs; -use super::index::IndexPart; -use crate::config::PageServerConf; -use crate::storage_sync::LayerFileMetadata; +use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart}; use remote_storage::GenericRemoteStorage; use utils::id::{TenantId, TimelineId}; +use super::index::LayerFileMetadata; + /// Serializes and uploads the given index part data to the remote storage. pub(super) async fn upload_index_part<'a>( conf: &'static PageServerConf, @@ -30,12 +30,9 @@ pub(super) async fn upload_index_part<'a>( let index_part_path = conf .metadata_path(timeline_id, tenant_id) .with_file_name(IndexPart::FILE_NAME); + let storage_path = conf.remote_path(&index_part_path)?; storage - .upload_storage_object( - Box::new(index_part_bytes), - index_part_size, - &index_part_path, - ) + .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path) .await .with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'")) } @@ -44,36 +41,26 @@ pub(super) async fn upload_index_part<'a>( /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. /// /// On an error, bumps the retries count and reschedules the entire task. 
-pub(super) async fn upload_timeline_layer( - storage: &GenericRemoteStorage, - source_path: &Path, - known_metadata: &LayerFileMetadata, +pub(super) async fn upload_timeline_layer<'a>( + conf: &'static PageServerConf, + storage: &'a GenericRemoteStorage, + source_path: &'a Path, + known_metadata: &'a LayerFileMetadata, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { bail!("failpoint before-upload-layer") }); - let storage_path = storage.remote_object_id(source_path).with_context(|| { - format!( - "Failed to get the layer storage path for local path '{}'", - source_path.display() - ) - })?; + let storage_path = conf.remote_path(source_path)?; - let source_file = fs::File::open(&source_path).await.with_context(|| { - format!( - "Failed to open a source file for layer '{}'", - source_path.display() - ) - })?; + let source_file = fs::File::open(&source_path) + .await + .with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?; let fs_size = source_file .metadata() .await .with_context(|| { - format!( - "Failed to get the source file metadata for layer '{}'", - source_path.display() - ) + format!("Failed to get the source file metadata for layer {source_path:?}") })? .len(); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 24d9b2a10e..aa11985cbe 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use anyhow::Context; +use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; + use super::Tenant; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -67,6 +70,7 @@ pub(super) async fn gather_inputs( let timelines = tenant .refresh_gc_info() + .await .context("Failed to refresh gc_info before gathering inputs")?; if timelines.is_empty() { @@ -93,8 +97,6 @@ pub(super) async fn gather_inputs( // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; - // this will probably conflict with on-demand downloaded layers, or at least force them all - // to be downloaded for timeline in timelines { let last_record_lsn = timeline.get_last_record_lsn(); @@ -212,11 +214,30 @@ pub(super) async fn gather_inputs( let mut have_any_error = false; while let Some(res) = joinset.join_next().await { - // each of these come with Result, JoinError> + // each of these come with Result, JoinError> // because of spawn + spawn_blocking - let res = res.and_then(|inner| inner); match res { - Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => { + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures, nor should be"); + } + Err(join_error) => { + // cannot really do anything, as this panic is likely a bug + error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); + have_any_error = true; + } + Ok(Err(recv_result_error)) => { + // cannot really do anything, as this panic is likely a bug + error!("failed to receive logical size query result: {recv_result_error:#}"); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { + warn!( + timeline_id=%timeline.timeline_id, + "failed to calculate logical size at {lsn}: {error:#}" + ); + have_any_error = true; + } + Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); 
logical_size_cache.insert((timeline.timeline_id, lsn), size); @@ -228,21 +249,6 @@ pub(super) async fn gather_inputs( command: Command::Update(size), }); } - Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => { - warn!( - timeline_id=%timeline.timeline_id, - "failed to calculate logical size at {lsn}: {error:#}" - ); - have_any_error = true; - } - Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures, nor should be"); - } - Err(join_error) => { - // cannot really do anything, as this panic is likely a bug - error!("logical size query panicked: {join_error:#}"); - have_any_error = true; - } } } @@ -351,7 +357,7 @@ enum LsnKind { struct TimelineAtLsnSizeResult( Arc, utils::lsn::Lsn, - anyhow::Result, + Result, ); #[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))] @@ -359,17 +365,15 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, -) -> Result { - let permit = tokio::sync::Semaphore::acquire_owned(limit) +) -> Result { + let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); - tokio::task::spawn_blocking(move || { - let _permit = permit; - let size_res = timeline.calculate_logical_size(lsn); - TimelineAtLsnSizeResult(timeline, lsn, size_res) - }) - .await + let size_res = timeline + .spawn_ondemand_logical_size_calculation(lsn) + .await?; + Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } #[test] diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 104e8e2ca5..0b957bd810 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,6 +1,10 @@ -//! //! Common traits and structs for layers -//! + +mod delta_layer; +mod filename; +mod image_layer; +mod inmemory_layer; +mod remote_layer; use crate::repository::{Key, Value}; use crate::walrecord::NeonWalRecord; @@ -8,12 +12,19 @@ use anyhow::Result; use bytes::Bytes; use std::ops::Range; use std::path::PathBuf; +use std::sync::Arc; use utils::{ id::{TenantId, TimelineId}, lsn::Lsn, }; +pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; +pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf}; +pub use image_layer::{ImageLayer, ImageLayerWriter}; +pub use inmemory_layer::InMemoryLayer; +pub use remote_layer::RemoteLayer; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -69,26 +80,9 @@ pub enum ValueReconstructResult { Missing, } -/// A Layer contains all data in a "rectangle" consisting of a range of keys and -/// range of LSNs. -/// -/// There are two kinds of layers, in-memory and on-disk layers. In-memory -/// layers are used to ingest incoming WAL, and provide fast access to the -/// recent page versions. On-disk layers are stored as files on disk, and are -/// immutable. This trait presents the common functionality of in-memory and -/// on-disk layers. -/// -/// Furthermore, there are two kinds of on-disk layers: delta and image layers. -/// A delta layer contains all modifications within a range of LSNs and keys. -/// An image layer is a snapshot of all the data in a key-range, at a single -/// LSN -/// +/// Supertrait of the [`Layer`] trait that captures the bare minimum interface +/// required by [`LayerMap`]. 
pub trait Layer: Send + Sync { - fn get_tenant_id(&self) -> TenantId; - - /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> TimelineId; - /// Range of keys that this layer covers fn get_key_range(&self) -> Range; @@ -100,13 +94,11 @@ pub trait Layer: Send + Sync { /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 fn get_lsn_range(&self) -> Range; - /// Filename used to store this layer on disk. (Even in-memory layers - /// implement this, to print a handy unique identifier for the layer for - /// log messages, even though they're never not on disk.) - fn filename(&self) -> PathBuf; - - /// If a layer has a corresponding file on a local filesystem, return its absolute path. - fn local_path(&self) -> Option; + /// Does this layer only contain some data for the key-range (incremental), + /// or does it contain a version of every page? This is important to know + /// for garbage collecting old layers: an incremental layer depends on + /// the previous non-incremental layer. + fn is_incremental(&self) -> bool; /// /// Return data needed to reconstruct given page at LSN. @@ -127,35 +119,88 @@ pub trait Layer: Send + Sync { reconstruct_data: &mut ValueReconstructState, ) -> Result; - /// Does this layer only contain some data for the key-range (incremental), - /// or does it contain a version of every page? This is important to know - /// for garbage collecting old layers: an incremental layer depends on - /// the previous non-incremental layer. - fn is_incremental(&self) -> bool; + /// A short ID string that uniquely identifies the given layer within a [`LayerMap`]. + fn short_id(&self) -> String; - /// Returns true for layers that are represented in memory. - fn is_in_memory(&self) -> bool; + /// Dump summary of the contents of the layer to stdout + fn dump(&self, verbose: bool) -> Result<()>; +} + +/// Returned by [`Layer::iter`] +pub type LayerIter<'i> = Box> + 'i>; + +/// Returned by [`Layer::key_iter`] +pub type LayerKeyIter<'i> = Box + 'i>; + +/// A Layer contains all data in a "rectangle" consisting of a range of keys and +/// range of LSNs. +/// +/// There are two kinds of layers, in-memory and on-disk layers. In-memory +/// layers are used to ingest incoming WAL, and provide fast access to the +/// recent page versions. On-disk layers are stored as files on disk, and are +/// immutable. This trait presents the common functionality of in-memory and +/// on-disk layers. +/// +/// Furthermore, there are two kinds of on-disk layers: delta and image layers. +/// A delta layer contains all modifications within a range of LSNs and keys. +/// An image layer is a snapshot of all the data in a key-range, at a single +/// LSN +/// +pub trait PersistentLayer: Layer { + fn get_tenant_id(&self) -> TenantId; + + /// Identify the timeline this layer belongs to + fn get_timeline_id(&self) -> TimelineId; + + /// File name used for this layer, both in the pageserver's local filesystem + /// state as well as in the remote storage. + fn filename(&self) -> LayerFileName; + + // Path to the layer file in the local filesystem. + // `None` for `RemoteLayer`. + fn local_path(&self) -> Option; /// Iterate through all keys and values stored in the layer - fn iter(&self) -> Box> + '_>; + fn iter(&self) -> Result>; /// Iterate through all keys stored in the layer. 
Returns key, lsn and value size /// It is used only for compaction and so is currently implemented only for DeltaLayer - fn key_iter(&self) -> Box + '_> { + fn key_iter(&self) -> Result> { panic!("Not implemented") } /// Permanently remove this layer from disk. fn delete(&self) -> Result<()>; - /// Dump summary of the contents of the layer to stdout - fn dump(&self, verbose: bool) -> Result<()>; + fn downcast_remote_layer(self: Arc) -> Option> { + None + } + + fn is_remote_layer(&self) -> bool { + false + } + + /// Returns None if the layer file size is not known. + /// + /// Should not change over the lifetime of the layer object because + /// current_physical_size is computed as the som of this value. + fn file_size(&self) -> Option; +} + +pub fn downcast_remote_layer( + layer: &Arc, +) -> Option> { + if layer.is_remote_layer() { + Arc::clone(layer).downcast_remote_layer() + } else { + None + } } impl std::fmt::Debug for dyn Layer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Layer") - .field("filename", &self.filename()) + .field("short_id", &self.short_id()) .finish() } } diff --git a/pageserver/src/tenant/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs similarity index 95% rename from pageserver/src/tenant/delta_layer.rs rename to pageserver/src/tenant/storage_layer/delta_layer.rs index dcd6956640..302ba2dc78 100644 --- a/pageserver/src/tenant/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,15 +29,16 @@ use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{DeltaFileName, PathOrConf}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ + PersistentLayer, ValueReconstructResult, ValueReconstructState, +}; use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::{BufWriter, Write}; use std::io::{Seek, SeekFrom}; use std::ops::Range; @@ -52,6 +53,8 @@ use utils::{ lsn::Lsn, }; +use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf}; + /// /// Header stored in the beginning of the file /// @@ -178,6 +181,8 @@ pub struct DeltaLayer { pub key_range: Range, pub lsn_range: Range, + pub file_size: u64, + inner: RwLock, } @@ -194,14 +199,6 @@ pub struct DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -209,13 +206,86 @@ impl Layer for DeltaLayer { fn get_lsn_range(&self) -> Range { self.lsn_range.clone() } - - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) + fn is_incremental(&self) -> bool { + true } - fn local_path(&self) -> Option { - Some(self.path()) + fn short_id(&self) -> String { + self.filename().file_name() + } + /// debugging function to print out the contents of the layer + fn dump(&self, verbose: bool) -> Result<()> { + println!( + "----- delta layer 
for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenant_id, + self.timeline_id, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + if !verbose { + return Ok(()); + } + + let inner = self.load()?; + + println!( + "index_start_blk: {}, root {}", + inner.index_start_blk, inner.index_root_blk + ); + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + file, + ); + + tree_reader.dump()?; + + let mut cursor = file.block_cursor(); + + // A subroutine to dump a single blob + let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { + let buf = cursor.read_blob(blob_ref.pos())?; + let val = Value::des(&buf)?; + let desc = match val { + Value::Image(img) => { + format!(" img {} bytes", img.len()) + } + Value::WalRecord(rec) => { + let wal_desc = walrecord::describe_wal_record(&rec)?; + format!( + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + ) + } + }; + Ok(desc) + }; + + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |delta_key, val| { + let blob_ref = BlobRef(val); + let key = DeltaKey::extract_key_from_buf(delta_key); + let lsn = DeltaKey::extract_lsn_from_buf(delta_key); + + let desc = match dump_blob(blob_ref) { + Ok(desc) => desc, + Err(err) => format!("ERROR: {}", err), + }; + println!(" key {} at {}: {}", key, lsn, desc); + true + }, + )?; + + Ok(()) } fn get_value_reconstruct_data( @@ -302,29 +372,38 @@ impl Layer for DeltaLayer { Ok(ValueReconstructResult::Complete) } } +} - fn iter<'a>(&'a self) -> Box> + 'a> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; - - match DeltaValueIter::new(inner) { - Ok(iter) => Box::new(iter), - Err(err) => Box::new(std::iter::once(Err(err))), - } +impl PersistentLayer for DeltaLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenant_id } - fn key_iter<'a>(&'a self) -> Box + 'a> { - let inner = match self.load() { - Ok(inner) => inner, - Err(e) => panic!("Failed to load a delta layer: {e:?}"), - }; + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } - match DeltaKeyIter::new(inner) { + fn filename(&self) -> LayerFileName { + self.layer_name().into() + } + + fn local_path(&self) -> Option { + Some(self.path()) + } + + fn iter(&self) -> Result> { + let inner = self.load().context("load delta layer")?; + Ok(match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), - Err(e) => panic!("Layer index is corrupted: {e:?}"), - } + Err(err) => Box::new(std::iter::once(Err(err))), + }) + } + + fn key_iter(&self) -> Result> { + let inner = self.load()?; + Ok(Box::new( + DeltaKeyIter::new(inner).context("Layer index is corrupted")?, + )) } fn delete(&self) -> Result<()> { @@ -333,87 +412,8 @@ impl Layer for DeltaLayer { Ok(()) } - fn is_incremental(&self) -> bool { - true - } - - fn is_in_memory(&self) -> bool { - false - } - - /// debugging function to print out the contents of the layer - fn dump(&self, verbose: bool) -> Result<()> { - println!( - "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenant_id, - self.timeline_id, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end - ); - - if !verbose { - return Ok(()); - } - - let inner = self.load()?; - - println!( - "index_start_blk: {}, root {}", - inner.index_start_blk, inner.index_root_blk - ); - - let file = inner.file.as_ref().unwrap(); - let 
tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, - file, - ); - - tree_reader.dump()?; - - let mut cursor = file.block_cursor(); - - // A subroutine to dump a single blob - let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; - let val = Value::des(&buf)?; - let desc = match val { - Value::Image(img) => { - format!(" img {} bytes", img.len()) - } - Value::WalRecord(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec)?; - format!( - " rec {} bytes will_init: {} {}", - buf.len(), - rec.will_init(), - wal_desc - ) - } - }; - Ok(desc) - }; - - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |delta_key, val| { - let blob_ref = BlobRef(val); - let key = DeltaKey::extract_key_from_buf(delta_key); - let lsn = DeltaKey::extract_lsn_from_buf(delta_key); - - let desc = match dump_blob(blob_ref) { - Ok(desc) => desc, - Err(err) => format!("ERROR: {}", err), - }; - println!(" key {} at {}: {}", key, lsn, desc); - true - }, - )?; - - Ok(()) + fn file_size(&self) -> Option { + Some(self.file_size) } } @@ -511,8 +511,8 @@ impl DeltaLayer { } } PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); if actual_filename != expected_filename { println!( @@ -539,6 +539,7 @@ impl DeltaLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &DeltaFileName, + file_size: u64, ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), @@ -546,6 +547,7 @@ impl DeltaLayer { tenant_id, key_range: filename.key_range.clone(), lsn_range: filename.lsn_range.clone(), + file_size, inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -558,21 +560,23 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. - pub fn new_for_path(path: &Path, file: F) -> Result - where - F: FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn_range: summary.lsn_range, + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, @@ -729,6 +733,10 @@ impl DeltaLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get file metadata to determine size")?; + // Note: Because we opened the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. 
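Editorial aside, not part of the patch: with `new_for_path` now taking a concrete `std::fs::File` and deriving `file_size` from the file's metadata, a debugging caller (the 'pageserver_binutils' use case mentioned in the doc comment above) could look roughly like the sketch below. The wrapper `dump_delta_layer` is a hypothetical name; only `DeltaLayer::new_for_path`, `dump`, and the `crate::tenant::storage_layer` re-export path come from this patch, and the sketch assumes crate-internal access.

    use std::fs::File;
    use std::path::Path;

    use crate::tenant::storage_layer::DeltaLayer; // re-exported by storage_layer in this patch

    // Hypothetical helper: open an on-disk delta layer file and dump its contents.
    fn dump_delta_layer(path: &Path) -> anyhow::Result<()> {
        let file = File::open(path)?;                      // plain std File; the FileExt generic is gone
        let layer = DeltaLayer::new_for_path(path, file)?; // file_size is derived from file.metadata()
        layer.dump(true)                                   // verbose dump of the index and blob contents
    }
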
@@ -738,6 +746,7 @@ impl DeltaLayerWriterInner { timeline_id: self.timeline_id, key_range: self.key_start..key_end, lsn_range: self.lsn_range.clone(), + file_size: metadata.len(), inner: RwLock::new(DeltaLayerInner { loaded: false, file: None, diff --git a/pageserver/src/tenant/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs similarity index 63% rename from pageserver/src/tenant/filename.rs rename to pageserver/src/tenant/storage_layer/filename.rs index 0ebf2d479b..6ecf9227c7 100644 --- a/pageserver/src/tenant/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -7,11 +7,12 @@ use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::path::PathBuf; +use std::str::FromStr; use utils::lsn::Lsn; // Note: Timeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { pub key_range: Range, pub lsn_range: Range, @@ -101,7 +102,7 @@ impl fmt::Display for DeltaFileName { } } -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct ImageFileName { pub key_range: Range, pub lsn: Lsn, @@ -172,6 +173,103 @@ impl fmt::Display for ImageFileName { ) } } +#[derive(Debug, PartialEq, Eq, Hash, Clone)] +pub enum LayerFileName { + Image(ImageFileName), + Delta(DeltaFileName), + #[cfg(test)] + Test(String), +} + +impl LayerFileName { + pub fn file_name(&self) -> String { + match self { + LayerFileName::Image(fname) => format!("{fname}"), + LayerFileName::Delta(fname) => format!("{fname}"), + #[cfg(test)] + LayerFileName::Test(fname) => fname.to_string(), + } + } + #[cfg(test)] + pub(crate) fn new_test(name: &str) -> LayerFileName { + LayerFileName::Test(name.to_owned()) + } +} + +impl From for LayerFileName { + fn from(fname: ImageFileName) -> Self { + LayerFileName::Image(fname) + } +} +impl From for LayerFileName { + fn from(fname: DeltaFileName) -> Self { + LayerFileName::Delta(fname) + } +} + +// include a `/` in the name as an additional layer of robustness +// because `/` chars are not allowed in UNIX paths +#[cfg(test)] +const LAYER_FILE_NAME_TEST_PREFIX: &str = "LAYER_FILE_NAME::test/"; + +impl FromStr for LayerFileName { + type Err = String; + + fn from_str(value: &str) -> Result { + #[cfg(test)] + if let Some(value) = value.strip_prefix(LAYER_FILE_NAME_TEST_PREFIX) { + return Ok(LayerFileName::Test(value.to_owned())); + } + let delta = DeltaFileName::parse_str(value); + let image = ImageFileName::parse_str(value); + let ok = match (delta, image) { + (None, None) => { + return Err(format!( + "neither delta nor image layer file name: {value:?}" + )) + } + (Some(delta), None) => LayerFileName::Delta(delta), + (None, Some(image)) => LayerFileName::Image(image), + (Some(_), Some(_)) => unreachable!(), + }; + Ok(ok) + } +} + +impl serde::Serialize for LayerFileName { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self { + LayerFileName::Image(fname) => serializer.serialize_str(&format!("{}", fname)), + LayerFileName::Delta(fname) => serializer.serialize_str(&format!("{}", fname)), + #[cfg(test)] + LayerFileName::Test(t) => { + serializer.serialize_str(&format!("{LAYER_FILE_NAME_TEST_PREFIX}{t}")) + } + } + } +} + +struct LayerFileNameVisitor; + +impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { + type Value = LayerFileName; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!( + formatter, + "a string that is a valid image or delta layer file 
name" + ) + } + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + v.parse().map_err(|e| E::custom(e)) + } +} /// Helper enum to hold a PageServerConf, or a path /// diff --git a/pageserver/src/tenant/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs similarity index 93% rename from pageserver/src/tenant/image_layer.rs rename to pageserver/src/tenant/storage_layer/image_layer.rs index 8409d34bc9..9a26fce73b 100644 --- a/pageserver/src/tenant/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -21,12 +21,13 @@ //! actual page images are stored in the "values" part. use crate::config::PageServerConf; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, Value, KEY_SIZE}; +use crate::repository::{Key, KEY_SIZE}; use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter}; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; -use crate::tenant::filename::{ImageFileName, PathOrConf}; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ + PersistentLayer, ValueReconstructResult, ValueReconstructState, +}; use crate::virtual_file::VirtualFile; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; @@ -34,10 +35,11 @@ use bytes::Bytes; use hex; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::fs; +use std::fs::{self, File}; use std::io::Write; use std::io::{Seek, SeekFrom}; use std::ops::Range; +use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; @@ -48,6 +50,9 @@ use utils::{ lsn::Lsn, }; +use super::filename::{ImageFileName, LayerFileName, PathOrConf}; +use super::{Layer, LayerIter}; + /// /// Header stored in the beginning of the file /// @@ -100,6 +105,7 @@ pub struct ImageLayer { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub key_range: Range, + pub file_size: u64, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -120,22 +126,6 @@ pub struct ImageLayerInner { } impl Layer for ImageLayer { - fn filename(&self) -> PathBuf { - PathBuf::from(self.layer_name().to_string()) - } - - fn local_path(&self) -> Option { - Some(self.path()) - } - - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } - fn get_key_range(&self) -> Range { self.key_range.clone() } @@ -144,58 +134,12 @@ impl Layer for ImageLayer { // End-bound is exclusive self.lsn..(self.lsn + 1) } - - /// Look up given page in the file - fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result { - assert!(self.key_range.contains(&key)); - assert!(lsn_range.start >= self.lsn); - assert!(lsn_range.end >= self.lsn); - - let inner = self.load()?; - - let file = inner.file.as_ref().unwrap(); - let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader.get(&keybuf)? 
{ - let blob = file.block_cursor().read_blob(offset).with_context(|| { - format!( - "failed to read value from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } - } - - fn iter(&self) -> Box>> { - todo!(); - } - - fn delete(&self) -> Result<()> { - // delete underlying file - fs::remove_file(self.path())?; - Ok(()) - } - fn is_incremental(&self) -> bool { false } - fn is_in_memory(&self) -> bool { - false + fn short_id(&self) -> String { + self.filename().file_name() } /// debugging function to print out the contents of the layer @@ -223,6 +167,72 @@ impl Layer for ImageLayer { Ok(()) } + + /// Look up given page in the file + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + assert!(self.key_range.contains(&key)); + assert!(lsn_range.start >= self.lsn); + assert!(lsn_range.end >= self.lsn); + + let inner = self.load()?; + + let file = inner.file.as_ref().unwrap(); + let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + if let Some(offset) = tree_reader.get(&keybuf)? { + let blob = file.block_cursor().read_blob(offset).with_context(|| { + format!( + "failed to read value from data file {} at offset {}", + self.path().display(), + offset + ) + })?; + let value = Bytes::from(blob); + + reconstruct_state.img = Some((self.lsn, value)); + Ok(ValueReconstructResult::Complete) + } else { + Ok(ValueReconstructResult::Missing) + } + } +} + +impl PersistentLayer for ImageLayer { + fn filename(&self) -> LayerFileName { + self.layer_name().into() + } + + fn local_path(&self) -> Option { + Some(self.path()) + } + + fn get_tenant_id(&self) -> TenantId { + self.tenant_id + } + + fn get_timeline_id(&self) -> TimelineId { + self.timeline_id + } + fn iter(&self) -> Result> { + unimplemented!(); + } + + fn delete(&self) -> Result<()> { + // delete underlying file + fs::remove_file(self.path())?; + Ok(()) + } + + fn file_size(&self) -> Option { + Some(self.file_size) + } } impl ImageLayer { @@ -314,8 +324,8 @@ impl ImageLayer { } } PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); if actual_filename != expected_filename { println!( @@ -339,6 +349,7 @@ impl ImageLayer { timeline_id: TimelineId, tenant_id: TenantId, filename: &ImageFileName, + file_size: u64, ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), @@ -346,6 +357,7 @@ impl ImageLayer { tenant_id, key_range: filename.key_range.clone(), lsn: filename.lsn, + file_size, inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -358,21 +370,21 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. 
- pub fn new_for_path(path: &Path, file: F) -> Result - where - F: std::os::unix::prelude::FileExt, - { + pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); file.read_exact_at(&mut summary_buf, 0)?; let summary = Summary::des_prefix(&summary_buf)?; - + let metadata = file + .metadata() + .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timeline_id: summary.timeline_id, tenant_id: summary.tenant_id, key_range: summary.key_range, lsn: summary.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { file: None, loaded: false, @@ -518,6 +530,10 @@ impl ImageLayerWriterInner { file.seek(SeekFrom::Start(0))?; Summary::ser_into(&summary, &mut file)?; + let metadata = file + .metadata() + .context("get metadata to determine file size")?; + // Note: Because we open the file in write-only mode, we cannot // reuse the same VirtualFile for reading later. That's why we don't // set inner.file here. The first read will have to re-open it. @@ -527,6 +543,7 @@ impl ImageLayerWriterInner { tenant_id: self.tenant_id, key_range: self.key_range.clone(), lsn: self.lsn, + file_size: metadata.len(), inner: RwLock::new(ImageLayerInner { loaded: false, file: None, @@ -551,7 +568,7 @@ impl ImageLayerWriterInner { lsn: self.lsn, }, ); - std::fs::rename(self.path, &final_path)?; + std::fs::rename(self.path, final_path)?; trace!("created image layer {}", layer.path().display()); diff --git a/pageserver/src/tenant/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs similarity index 90% rename from pageserver/src/tenant/inmemory_layer.rs rename to pageserver/src/tenant/storage_layer/inmemory_layer.rs index 9aa33a72ca..93356a9d8c 100644 --- a/pageserver/src/tenant/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -8,11 +8,10 @@ use crate::config::PageServerConf; use crate::repository::{Key, Value}; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockReader; -use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; use crate::walrecord; -use anyhow::{bail, ensure, Result}; +use anyhow::{ensure, Result}; use std::cell::RefCell; use std::collections::HashMap; use tracing::*; @@ -26,9 +25,10 @@ use utils::{ // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use std::path::PathBuf; use std::sync::RwLock; +use super::{DeltaLayer, DeltaLayerWriter, Layer}; + thread_local! { /// A buffer for serializing object during [`InMemoryLayer::put_value`]. /// This buffer is reused for each serialization to avoid additional malloc calls. @@ -75,33 +75,13 @@ impl InMemoryLayerInner { } } -impl Layer for InMemoryLayer { - // An in-memory layer can be spilled to disk into ephemeral file, - // This function is used only for debugging, so we don't need to be very precise. - // Construct a filename as if it was a delta layer. 
- fn filename(&self) -> PathBuf { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); - - PathBuf::from(format!( - "inmem-{:016X}-{:016X}", - self.start_lsn.0, end_lsn.0 - )) - } - - fn local_path(&self) -> Option { - None - } - - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { +impl InMemoryLayer { + pub fn get_timeline_id(&self) -> TimelineId { self.timeline_id } +} +impl Layer for InMemoryLayer { fn get_key_range(&self) -> Range { Key::MIN..Key::MAX } @@ -117,72 +97,16 @@ impl Layer for InMemoryLayer { self.start_lsn..end_lsn } - /// Look up given value in the layer. - fn get_value_reconstruct_data( - &self, - key: Key, - lsn_range: Range, - reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result { - ensure!(lsn_range.start >= self.start_lsn); - let mut need_image = true; - - let inner = self.inner.read().unwrap(); - - let mut reader = inner.file.block_cursor(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&key) { - let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos)?; - let value = Value::des(&buf)?; - match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - return Ok(ValueReconstructResult::Complete); - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - } - - // release lock on 'inner' - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } - } - - fn iter(&self) -> Box>> { - todo!(); - } - - /// Nothing to do here. When you drop the last reference to the layer, it will - /// be deallocated. - fn delete(&self) -> Result<()> { - bail!("can't delete an InMemoryLayer") - } - fn is_incremental(&self) -> bool { // in-memory layer is always considered incremental. true } - fn is_in_memory(&self) -> bool { - true + fn short_id(&self) -> String { + let inner = self.inner.read().unwrap(); + + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } /// debugging function to print out the contents of the layer @@ -235,6 +159,55 @@ impl Layer for InMemoryLayer { Ok(()) } + + /// Look up given value in the layer. + fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + ensure!(lsn_range.start >= self.start_lsn); + let mut need_image = true; + + let inner = self.inner.read().unwrap(); + + let mut reader = inner.file.block_cursor(); + + // Scan the page versions backwards, starting from `lsn`. 
+ if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + let buf = reader.read_blob(*pos)?; + let value = Value::des(&buf)?; + match value { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + return Ok(ValueReconstructResult::Complete); + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } + } + } + } + + // release lock on 'inner' + + // If an older page image is needed to reconstruct the page, let the + // caller know. + if need_image { + Ok(ValueReconstructResult::Continue) + } else { + Ok(ValueReconstructResult::Complete) + } + } } impl InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs new file mode 100644 index 0000000000..33474bb4a2 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -0,0 +1,210 @@ +//! A RemoteLayer is an in-memory placeholder for a layer file that exists +//! in remote storage. +//! +use crate::config::PageServerConf; +use crate::repository::Key; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; +use anyhow::{bail, Result}; +use std::ops::Range; +use std::path::PathBuf; +use std::sync::Arc; + +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; +use super::image_layer::ImageLayer; +use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer}; + +#[derive(Debug)] +pub struct RemoteLayer { + tenantid: TenantId, + timelineid: TimelineId, + key_range: Range, + lsn_range: Range, + + pub file_name: LayerFileName, + + pub layer_metadata: LayerFileMetadata, + + is_delta: bool, + + is_incremental: bool, + + pub(crate) ongoing_download: Arc, +} + +impl Layer for RemoteLayer { + fn get_key_range(&self) -> Range { + self.key_range.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_state: &mut ValueReconstructState, + ) -> Result { + bail!( + "layer {} needs to be downloaded", + self.filename().file_name() + ); + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + /// debugging function to print out the contents of the layer + fn dump(&self, _verbose: bool) -> Result<()> { + println!( + "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end + ); + + Ok(()) + } + + fn short_id(&self) -> String { + self.filename().file_name() + } +} + +impl PersistentLayer for RemoteLayer { + fn get_tenant_id(&self) -> TenantId { + self.tenantid + } + + fn get_timeline_id(&self) -> TimelineId { + self.timelineid + } + + fn filename(&self) -> LayerFileName { + if self.is_delta { + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + .into() + } else { + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + .into() + } + } + + fn local_path(&self) -> Option { + None + } + + fn iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + 
+ fn key_iter(&self) -> Result> { + bail!("cannot iterate a remote layer"); + } + + fn delete(&self) -> Result<()> { + Ok(()) + } + + fn downcast_remote_layer<'a>(self: Arc) -> Option> { + Some(self) + } + + fn is_remote_layer(&self) -> bool { + true + } + + fn file_size(&self) -> Option { + self.layer_metadata.file_size() + } +} + +impl RemoteLayer { + pub fn new_img( + tenantid: TenantId, + timelineid: TimelineId, + fname: &ImageFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn..(fname.lsn + 1), + is_delta: false, + is_incremental: false, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + pub fn new_delta( + tenantid: TenantId, + timelineid: TimelineId, + fname: &DeltaFileName, + layer_metadata: &LayerFileMetadata, + ) -> RemoteLayer { + RemoteLayer { + tenantid, + timelineid, + key_range: fname.key_range.clone(), + lsn_range: fname.lsn_range.clone(), + is_delta: true, + is_incremental: true, + file_name: fname.to_owned().into(), + layer_metadata: layer_metadata.clone(), + ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + } + } + + /// Create a Layer struct representing this layer, after it has been downloaded. + pub fn create_downloaded_layer( + &self, + conf: &'static PageServerConf, + file_size: u64, + ) -> Arc { + if self.is_delta { + let fname = DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + }; + Arc::new(DeltaLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } else { + let fname = ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + }; + Arc::new(ImageLayer::new( + conf, + self.timelineid, + self.tenantid, + &fname, + file_size, + )) + } + } +} diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant/tasks.rs similarity index 98% rename from pageserver/src/tenant_tasks.rs rename to pageserver/src/tenant/tasks.rs index d17f0eed43..8397d26e5d 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -8,8 +8,8 @@ use std::time::Duration; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; -use crate::tenant_mgr; use tracing::*; use utils::id::TenantId; @@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); @@ -155,7 +155,7 @@ async fn wait_for_active_tenant( wait: Duration, ) -> ControlFlow<(), Arc> { let tenant = loop { - match tenant_mgr::get_tenant(tenant_id, false) { + match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => break tenant, Err(e) => { error!("Failed to get a tenant {tenant_id}: {e:#}"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a17ac34eb..651c8116f5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3,43 +3,46 @@ use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; +use futures::stream::FuturesUnordered; +use 
futures::StreamExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pageserver_api::models::TimelineState; -use tokio::sync::watch; -use tokio::task::spawn_blocking; +use pageserver_api::models::{ + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState, +}; +use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; +use tokio_util::sync::CancellationToken; use tracing::*; use std::cmp::{max, min, Ordering}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; +use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; +use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::storage_sync::index::{IndexPart, RelativePath}; -use crate::storage_sync::RemoteTimelineClient; +use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; +use crate::tenant::storage_layer::{ + DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, + RemoteLayer, +}; use crate::tenant::{ - delta_layer::{DeltaLayer, DeltaLayerWriter}, ephemeral_file::is_ephemeral_file, - filename::{DeltaFileName, ImageFileName}, - image_layer::{ImageLayer, ImageLayerWriter}, - inmemory_layer::InMemoryLayer, layer_map::{LayerMap, SearchResult}, metadata::{save_metadata, TimelineMetadata}, par_fsync, - storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}, + storage_layer::{PersistentLayer, ValueReconstructResult, ValueReconstructState}, }; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; -use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::tenant_config::TenantConfOpt; +use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; +use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; @@ -51,16 +54,19 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; +use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; +use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; -use crate::CheckpointConfig; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; -use crate::{page_cache, storage_sync::index::LayerFileMetadata}; + +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::storage_layer::{DeltaLayer, ImageLayer, Layer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] enum FlushLoopState { @@ -73,12 +79,14 @@ pub struct Timeline { conf: &'static PageServerConf, tenant_conf: Arc>, + myself: Weak, + pub tenant_id: TenantId, pub timeline_id: TimelineId, pub pg_version: u32, - pub layers: RwLock, + pub layers: RwLock>, last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. @@ -88,10 +96,7 @@ pub struct Timeline { walredo_mgr: Arc, /// Remote storage client. 
- /// - /// If Some, use it to upload all newly created layers to the remote storage, - /// and keep remote metadata file in sync. In the future, also use it to download - /// layer files on-demand. + /// See [`storage_sync`] module comment for details. pub remote_client: Option>, // What page versions do we hold in the repository? If we get a @@ -155,7 +160,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: RwLock, + pub gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -173,7 +178,6 @@ pub struct Timeline { /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, - initial_size_computation_started: AtomicBool, /// Information about the last processed message by the WAL receiver, /// or None if WAL receiver has not received anything for this timeline @@ -183,6 +187,8 @@ pub struct Timeline { /// Relation size cache pub rel_size_cache: RwLock>, + download_all_remote_layers_task_info: RwLock>, + state: watch::Sender, } @@ -199,6 +205,8 @@ struct LogicalSize { /// /// NOTE: initial size is not a constant and will change between restarts. initial_logical_size: OnceCell, + /// Semaphore to track ongoing calculation of `initial_logical_size`. + initial_size_computation: Arc, /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines. initial_part_end: Option, /// All other size changes after startup, combined together. @@ -249,6 +257,8 @@ impl LogicalSize { fn empty_initial() -> Self { Self { initial_logical_size: OnceCell::with_value(0), + // initial_logical_size already computed, so, don't admit any calculations + initial_size_computation: Arc::new(Semaphore::new(0)), initial_part_end: None, size_added_after_initial: AtomicI64::new(0), } @@ -257,6 +267,7 @@ impl LogicalSize { fn deferred_initial(compute_to: Lsn) -> Self { Self { initial_logical_size: OnceCell::new(), + initial_size_computation: Arc::new(Semaphore::new(1)), initial_part_end: Some(compute_to), size_added_after_initial: AtomicI64::new(0), } @@ -299,12 +310,68 @@ impl LogicalSize { } } +/// Returned by [`Timeline::layer_size_sum`] +pub enum LayerSizeSum { + /// The result is accurate. + Accurate(u64), + // We don't know the layer file size of one or more layers. + // They contribute to the sum with a value of 0. + // Hence, the sum is a lower bound for the actualy layer file size sum. + ApproximateLowerBound(u64), +} + +impl LayerSizeSum { + pub fn approximate_is_ok(self) -> u64 { + match self { + LayerSizeSum::Accurate(v) => v, + LayerSizeSum::ApproximateLowerBound(v) => v, + } + } +} + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, pub last_received_msg_ts: u128, } +/// Like `?`, but for [`PageReconstructResult`]. +/// Use it to bubble up the `NeedsDownload` and `Error` to the caller. +/// +/// Once `std::ops::Try` is stabilized, we should use it instead of this macro. +#[macro_export] +macro_rules! 
try_no_ondemand_download {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            PageReconstructResult::Success(value) => value,
+            PageReconstructResult::NeedsDownload(timeline, layer) => {
+                return PageReconstructResult::NeedsDownload(timeline, layer);
+            }
+            PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
+        }
+    }};
+}
+
+/// Replacement for `?` in functions that return [`PageReconstructResult`].
+///
+/// Given an `expr: Result`, use `try_page_reconstruct_result!(expr)`
+/// instead of `(expr)?`.
+/// If `expr` is `Ok(v)`, the macro evaluates to `v`.
+/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
+///
+/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
+#[macro_export]
+macro_rules! try_page_reconstruct_result {
+    ($result:expr) => {{
+        let result = $result;
+        match result {
+            Ok(v) => v,
+            Err(e) => return PageReconstructResult::from(e),
+        }
+    }};
+}
+
 ///
 /// Information about how much history needs to be retained, needed by
 /// Garbage Collection.
@@ -334,6 +401,77 @@ pub struct GcInfo {
     pub pitr_cutoff: Lsn,
 }
 
+pub enum PageReconstructResult {
+    Success(T),
+    /// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
+    /// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
+    /// retry the operation that returned this error.
+    NeedsDownload(Weak, Weak),
+    Error(PageReconstructError),
+}
+
+/// An error happened in a get() operation.
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
+    #[error(transparent)]
+    Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
+
+    #[error(transparent)]
+    WalRedo(#[from] crate::walredo::WalRedoError),
+}
+
+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+/// This impl makes it so you can substitute return type
+/// `Result` with `PageReconstructError` in functions
+/// and existing `?` will generally continue to work.
+/// The reason why is the blanket `From` conversion below: any error type
+/// that satisfies the `Into` bound is converted and wrapped into `Self::Error`.
impl From for PageReconstructResult
+where
+    E: Into,
+{
+    fn from(e: E) -> Self {
+        Self::Error(e.into())
+    }
+}
+
+impl PageReconstructResult {
+    /// Treat the need for on-demand download as an error.
+    ///
+    /// **Avoid this function in new code** if you can help it,
+    /// as on-demand download will become the norm in the future,
+    /// especially once we implement layer file eviction.
+    ///
+    /// If you are in an async function, use [`with_ondemand_download`]
+    /// to do the download right here.
+    ///
+    /// If you are in a sync function, change its return type from
+    /// `Result` to `PageReconstructResult` and bubble up
+    /// the non-success cases of `PageReconstructResult` to the caller.
+    /// This gives them a chance to do the download and retry.
+    /// Consider using [`try_no_ondemand_download`] for convenience.
+    ///
+    /// For more background, read the comment on [`with_ondemand_download`].
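// Illustrative sketch, not part of this patch: how a synchronous helper might
// follow the bubbling-up approach described above, using the two new macros.
// `read_page_header` and `parse_header` are hypothetical names for this example
// only; `Bytes` and `anyhow` are assumed to be in scope as elsewhere in this file.
fn parse_header(page: &Bytes) -> anyhow::Result<u32> {
    anyhow::ensure!(page.len() >= 4, "page too short");
    Ok(u32::from_le_bytes(page[0..4].try_into()?))
}

fn read_page_header(timeline: &Timeline, key: Key, lsn: Lsn) -> PageReconstructResult<u32> {
    // `Timeline::get` itself returns a `PageReconstructResult`; forward
    // `NeedsDownload` and `Error` to the caller unchanged.
    let page = try_no_ondemand_download!(timeline.get(key, lsn));
    // `parse_header` returns an `anyhow::Result`; an `Err` becomes
    // `PageReconstructResult::Error` via the `From` impl above.
    let header = try_page_reconstruct_result!(parse_header(&page));
    PageReconstructResult::Success(header)
}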
+ pub fn no_ondemand_download(self) -> anyhow::Result { + match self { + PageReconstructResult::Success(value) => Ok(value), + // TODO print more info about the timeline + PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"), + PageReconstructResult::Error(e) => { + Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) + } + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -361,8 +499,10 @@ impl Timeline { /// the Repository implementation may incorrectly return a value from an ancestor /// branch, for example, or waste a lot of cycles chasing the non-existing key. /// - pub fn get(&self, key: Key, lsn: Lsn) -> anyhow::Result { - anyhow::ensure!(lsn.is_valid(), "Invalid LSN"); + pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult { + if !lsn.is_valid() { + return PageReconstructResult::from(anyhow!("Invalid LSN")); + } // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image @@ -372,7 +512,7 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -387,13 +527,18 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; + try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state)); self.metrics .reconstruct_time_histo .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state)) } + // Like get(), but if a remote layer file is needed, it is downloaded as part of this call. + pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result { + with_ondemand_download(|| self.get(key, lsn)).await + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. pub fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last @@ -420,30 +565,27 @@ impl Timeline { } } - /// Get the physical size of the timeline at the latest LSN - pub fn get_physical_size(&self) -> u64 { - self.metrics.current_physical_size_gauge.get() + /// The sum of the file size of all historic layers in the layer map. + /// This method makes no distinction between local and remote layers. + /// Hence, the result **does not represent local filesystem usage**. + pub fn layer_size_sum(&self) -> LayerSizeSum { + let layer_map = self.layers.read().unwrap(); + let mut size = 0; + let mut no_size_cnt = 0; + for l in layer_map.iter_historic_layers() { + let (l_size, l_no_size) = l.file_size().map(|s| (s, 0)).unwrap_or((0, 1)); + size += l_size; + no_size_cnt += l_no_size; + } + if no_size_cnt == 0 { + LayerSizeSum::Accurate(size) + } else { + LayerSizeSum::ApproximateLowerBound(size) + } } - /// Get the physical size of the timeline at the latest LSN non incrementally - pub fn get_physical_size_non_incremental(&self) -> anyhow::Result { - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - // total size of layer files in the current timeline directory - let mut total_physical_size = 0; - - for direntry in fs::read_dir(timeline_path)? 
{ - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_string_lossy(); - - if ImageFileName::parse_str(&fname).is_some() - || DeltaFileName::parse_str(&fname).is_some() - { - total_physical_size += direntry.metadata()?.len(); - } - } - - Ok(total_physical_size) + pub fn get_resident_physical_size(&self) -> u64 { + self.metrics.resident_physical_size_gauge.get() } /// @@ -491,22 +633,10 @@ impl Timeline { } /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't - /// know anything about them here in the repository. #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] - pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> { - match cconf { - CheckpointConfig::Flush => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await - } - CheckpointConfig::Forced => { - self.freeze_inmem_layer(false); - self.flush_frozen_layers_and_wait().await?; - self.compact().await - } - } + pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { + self.freeze_inmem_layer(false); + self.flush_frozen_layers_and_wait().await } pub async fn compact(&self) -> anyhow::Result<()> { @@ -563,14 +693,18 @@ impl Timeline { // Define partitioning schema if needed - match self.repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - ) { + match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + ) + .await + { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?; + let layer_paths_to_upload = + self.create_image_layers(&partitioning, lsn, false).await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -581,6 +715,18 @@ impl Timeline { let timer = self.metrics.compact_time_histo.start_timer(); self.compact_level0(target_file_size).await?; timer.stop_and_record(); + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -606,18 +752,22 @@ impl Timeline { /// /// The size could be lagging behind the actual number, in case /// the initial size calculation has not been run (gets triggered on the first size access). 
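// Illustrative sketch, not part of this patch: with the signature change below,
// callers learn whether the returned size is exact or still an approximation
// while the initial calculation runs. `report_logical_size` is a hypothetical
// caller used for illustration only; `Arc` and `info!` are in scope in this file.
fn report_logical_size(timeline: &Arc<Timeline>) -> anyhow::Result<()> {
    let (logical_size, is_exact) = timeline.get_current_logical_size()?;
    if is_exact {
        info!("current logical size: {logical_size}");
    } else {
        // The initial size calculation has not finished yet; the value may
        // still lag behind the actual number.
        info!("current logical size (approximate): {logical_size}");
    }
    Ok(())
}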
- pub fn get_current_logical_size(self: &Arc) -> anyhow::Result { + /// + /// return size and boolean flag that shows if the size is exact + pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); + let mut is_exact = true; let size = current_size.size(); if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) = (current_size, self.current_logical_size.initial_part_end) { + is_exact = false; self.try_spawn_size_init_task(init_lsn); } - Ok(size) + Ok((size, is_exact)) } /// Check if more than 'checkpoint_distance' of WAL has been accumulated in @@ -741,76 +891,81 @@ impl Timeline { walredo_mgr: Arc, remote_client: Option, pg_version: u32, - ) -> Self { + ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(TimelineState::Suspended); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let mut result = Timeline { - conf, - tenant_conf, - timeline_id, - tenant_id, - pg_version, - layers: RwLock::new(LayerMap::default()), + Arc::new_cyclic(|myself| { + let mut result = Timeline { + conf, + tenant_conf, + myself: myself.clone(), + timeline_id, + tenant_id, + pg_version, + layers: RwLock::new(LayerMap::default()), - walredo_mgr, + walredo_mgr, - remote_client: remote_client.map(Arc::new), + remote_client: remote_client.map(Arc::new), - // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. - last_record_lsn: SeqWait::new(RecordLsn { - last: disk_consistent_lsn, - prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), - }), - disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), + // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. + last_record_lsn: SeqWait::new(RecordLsn { + last: disk_consistent_lsn, + prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)), + }), + disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), - last_freeze_ts: RwLock::new(Instant::now()), + last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), + last_freeze_ts: RwLock::new(Instant::now()), - ancestor_timeline: ancestor, - ancestor_lsn: metadata.ancestor_lsn(), + ancestor_timeline: ancestor, + ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new(&tenant_id, &timeline_id), + metrics: TimelineMetrics::new(&tenant_id, &timeline_id), - flush_loop_state: Mutex::new(FlushLoopState::NotStarted), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), - layer_flush_start_tx, - layer_flush_done_tx, + layer_flush_start_tx, + layer_flush_done_tx, - write_lock: Mutex::new(()), - layer_removal_cs: Default::default(), + write_lock: Mutex::new(()), + layer_removal_cs: Default::default(), - gc_info: RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo { + retain_lsns: Vec::new(), + horizon_cutoff: Lsn(0), + pitr_cutoff: Lsn(0), + }), - latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), - initdb_lsn: metadata.initdb_lsn(), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), + initdb_lsn: metadata.initdb_lsn(), - current_logical_size: if disk_consistent_lsn.is_valid() { - // we're creating timeline data with some layer files existing locally, - // need to recalculate timeline's logical size based on data in the layers. 
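// Illustrative sketch, not part of this patch: `Timeline::new` above switches to
// `Arc::new_cyclic` so that the new `myself: Weak<...>` field can point back at
// the timeline without creating a reference cycle. The same pattern in isolation,
// with a hypothetical `Node` type (`Arc` and `Weak` are already imported in this file):
struct Node {
    myself: Weak<Node>,
}

impl Node {
    fn new() -> Arc<Node> {
        // The closure receives the Weak handle before the Arc is fully built.
        Arc::new_cyclic(|weak| Node {
            myself: weak.clone(),
        })
    }

    // Methods taking `&self` can still hand out owned handles, e.g. to spawned
    // tasks or, as in this patch, inside `PageReconstructResult::NeedsDownload`.
    fn owned_handle(&self) -> Arc<Node> {
        self.myself
            .upgrade()
            .expect("an Arc is alive while &self exists")
    }
}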
- LogicalSize::deferred_initial(disk_consistent_lsn) - } else { - // we're creating timeline data without any layers existing locally, - // initial logical size is 0. - LogicalSize::empty_initial() - }, - initial_size_computation_started: AtomicBool::new(false), - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), - repartition_threshold: 0, + current_logical_size: if disk_consistent_lsn.is_valid() { + // we're creating timeline data with some layer files existing locally, + // need to recalculate timeline's logical size based on data in the layers. + LogicalSize::deferred_initial(disk_consistent_lsn) + } else { + // we're creating timeline data without any layers existing locally, + // initial logical size is 0. + LogicalSize::empty_initial() + }, + partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + repartition_threshold: 0, - last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), - state, - }; - result.repartition_threshold = result.get_checkpoint_distance() / 10; - result + last_received_wal: Mutex::new(None), + rel_size_cache: RwLock::new(HashMap::new()), + + download_all_remote_layers_task_info: RwLock::new(None), + + state, + }; + result.repartition_threshold = result.get_checkpoint_distance() / 10; + result + }) } pub(super) fn maybe_spawn_flush_loop(self: &Arc) { @@ -856,12 +1011,12 @@ impl Timeline { } pub(super) fn launch_wal_receiver(self: &Arc) { - if !is_etcd_client_initialized() { + if !is_broker_client_initialized() { if cfg!(test) { - info!("not launching WAL receiver because etcd client hasn't been initialized"); + info!("not launching WAL receiver because broker client hasn't been initialized"); return; } else { - panic!("etcd client not initialized"); + panic!("broker client not initialized"); } } @@ -882,7 +1037,6 @@ impl Timeline { drop(tenant_conf_guard); let self_clone = Arc::clone(self); spawn_connection_manager_task( - self.conf.broker_etcd_prefix.clone(), self_clone, walreceiver_connect_timeout, lagging_wal_timeout, @@ -925,11 +1079,18 @@ impl Timeline { continue; } - let layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); + let file_size = direntry_path.metadata()?.len(); - trace!("found layer {}", layer.filename().display()); - total_physical_size += layer.path().metadata()?.len(); + let layer = ImageLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &imgfilename, + file_size, + ); + + trace!("found layer {}", layer.path().display()); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { @@ -949,16 +1110,23 @@ impl Timeline { continue; } - let layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); + let file_size = direntry_path.metadata()?.len(); - trace!("found layer {}", layer.filename().display()); - total_physical_size += layer.path().metadata()?.len(); + let layer = DeltaLayer::new( + self.conf, + self.timeline_id, + self.tenant_id, + &deltafilename, + file_size, + ); + + trace!("found layer {}", layer.path().display()); + total_physical_size += file_size; layers.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these - } else if crate::storage_sync::is_temp_download_file(&direntry_path) { + } else if remote_timeline_client::is_temp_download_file(&direntry_path) { info!( "skipping temp download file, reconcile_with_remote will resume / clean 
up: {}", fname @@ -988,7 +1156,7 @@ impl Timeline { num_layers, disk_consistent_lsn, total_physical_size ); self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .set(total_physical_size); timer.stop_and_record(); @@ -996,162 +1164,156 @@ impl Timeline { Ok(()) } - async fn download_missing( + async fn create_remote_layers( &self, index_part: &IndexPart, - remote_client: &RemoteTimelineClient, - mut local_filenames: HashSet, + local_layers: HashMap>, up_to_date_disk_consistent_lsn: Lsn, - ) -> anyhow::Result> { - let mut remote_filenames: HashSet = HashSet::new(); - for fname in index_part.timeline_layers.iter() { - remote_filenames.insert(fname.to_local_path(&PathBuf::from(""))); - } - - // Are there any local files that exist, with a size that doesn't match - // with the size stored in the remote index file? - // If so, rename_to_backup those files so that we re-download them later. - local_filenames.retain(|path| { - let layer_metadata = index_part - .layer_metadata - .get(&RelativePath::from_filename(path)) - .map(LayerFileMetadata::from) - .unwrap_or(LayerFileMetadata::MISSING); - - if let Some(remote_size) = layer_metadata.file_size() { - let local_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id).join(&path); - match local_path.metadata() { - Ok(metadata) => { - let local_size = metadata.len(); - - if local_size != remote_size { - warn!("removing local file \"{}\" because it has unexpected length {}; length in remote index is {}", - path.display(), - local_size, - remote_size); - if let Err(err) = rename_to_backup(&local_path) { - error!("could not rename file \"{}\": {:?}", - local_path.display(), err); - } - self.metrics.current_physical_size_gauge.sub(local_size); - false - } else { - true - } - } - Err(err) => { - error!("could not get size of local file \"{}\": {:?}", path.display(), err); - true - } - } - } else { - true - } - }); - + ) -> anyhow::Result>> { // Are we missing some files that are present in remote storage? - // Download them now. - // TODO Downloading many files this way is not efficient. - // Better to use FuturesUnordered. Maybe keep as is because: - // a) inplace download is a throw-away code, on-demand patch doesnt need that - // b) typical case now is that there is nothing to sync, this downloads a lot - // 1) if there was another pageserver that came and generated new files - // 2) during attach of a timeline with big history which we currently do not do - for path in remote_filenames.difference(&local_filenames) { - let fname = path.to_str().unwrap(); - info!("remote layer file {fname} does not exist locally"); + // Create RemoteLayer instances for them. + let mut local_only_layers = local_layers; + for remote_layer_name in &index_part.timeline_layers { + let local_layer = local_only_layers.remove(remote_layer_name); - let layer_metadata = index_part + let remote_layer_metadata = index_part .layer_metadata - .get(&RelativePath::from_filename(path)) + .get(remote_layer_name) .map(LayerFileMetadata::from) .unwrap_or(LayerFileMetadata::MISSING); - if let Some(imgfilename) = ImageFileName::parse_str(fname) { - if imgfilename.lsn > up_to_date_disk_consistent_lsn { - warn!( + // Is the local layer's size different from the size stored in the + // remote index file? + // If so, rename_to_backup those files & replace their local layer with + // a RemoteLayer in the layer map so that we re-download them on-demand. 
+ if let Some(local_layer) = local_layer { + let local_layer_path = local_layer + .local_path() + .expect("caller must ensure that local_layers only contains local layers"); + ensure!( + local_layer_path.exists(), + "every layer from local_layers must exist on disk: {}", + local_layer_path.display() + ); + + if let Some(remote_size) = remote_layer_metadata.file_size() { + let metadata = local_layer_path.metadata().with_context(|| { + format!( + "get file size of local layer {}", + local_layer_path.display() + ) + })?; + let local_size = metadata.len(); + if local_size != remote_size { + warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); + if let Err(err) = rename_to_backup(&local_layer_path) { + assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display()); + anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); + } else { + self.metrics.resident_physical_size_gauge.sub(local_size); + { + let mut layers = self.layers.write().unwrap(); + layers.remove_historic(local_layer); + layers.rebuild_index(); + } + // fall-through to adding the remote layer + } + } else { + debug!( + "layer is present locally and file size matches remote, using it: {}", + local_layer_path.display() + ); + continue; + } + } else { + debug!( + "layer is present locally and remote does not have file size, using it: {}", + local_layer_path.display() + ); + continue; + } + } + + info!( + "remote layer does not exist locally, creating remote layer: {}", + remote_layer_name.file_name() + ); + + match remote_layer_name { + LayerFileName::Image(imgfilename) => { + if imgfilename.lsn > up_to_date_disk_consistent_lsn { + warn!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); - continue; - } + continue; + } - trace!("downloading image file: {}", file = path.display()); - let sz = remote_client - .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) - .await - .context("download image layer")?; - trace!("done"); - - let image_layer = - ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename); - - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(image_layer)); - layers.rebuild_index(); - } - self.metrics.current_physical_size_gauge.add(sz); - } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - // Create a DeltaLayer struct for each delta file. - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. 
- if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { - warn!( - "found future delta layer {} on timeline {} remote_consistent_lsn is {}", - deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + let remote_layer = RemoteLayer::new_img( + self.tenant_id, + self.timeline_id, + imgfilename, + &remote_layer_metadata, ); - continue; + let remote_layer = Arc::new(remote_layer); + + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(remote_layer); + layers.rebuild_index(); + } } - - trace!("downloading image file: {}", file = path.display()); - let sz = remote_client - .download_layer_file(&RelativePath::from_filename(path), &layer_metadata) - .await - .context("download delta layer")?; - trace!("done"); - - let delta_layer = - DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename); - - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(delta_layer)); - layers.rebuild_index(); + LayerFileName::Delta(deltafilename) => { + // Create a RemoteLayer for the delta file. + // The end-LSN is exclusive, while disk_consistent_lsn is + // inclusive. For example, if disk_consistent_lsn is 100, it is + // OK for a delta layer to have end LSN 101, but if the end LSN + // is 102, then it might not have been fully flushed to disk + // before crash. + if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { + warn!( + "found future delta layer {} on timeline {} remote_consistent_lsn is {}", + deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn + ); + continue; + } + let remote_layer = RemoteLayer::new_delta( + self.tenant_id, + self.timeline_id, + deltafilename, + &remote_layer_metadata, + ); + let remote_layer = Arc::new(remote_layer); + { + let mut layers = self.layers.write().unwrap(); + layers.insert_historic(remote_layer); + layers.rebuild_index(); + } } - self.metrics.current_physical_size_gauge.add(sz); - } else { - bail!("unexpected layer filename in remote storage: {}", fname); + #[cfg(test)] + LayerFileName::Test(_) => unreachable!(), } } - // now these are local only filenames - let local_only_filenames = local_filenames - .difference(&remote_filenames) - .cloned() - .collect(); - Ok(local_only_filenames) + Ok(local_only_layers) } + /// This function will synchronize local state with what we have in remote storage. /// - /// This function will synchronize local data with what we have in remote storage. - /// 1. It will download missing layer files. - /// 2. It will update local metadata if remote one has greater `disk_consistent_lsn`. - /// 3. It will upload files that are missing on the remote - /// 4. It will update index file on the remote accordingly - /// TODO may be a bit cleaner to do things based on populated remote client, - /// and then do things based on its upload_queue.latest_files + /// Steps taken: + /// 1. Initialize upload queue based on `index_part`. + /// 2. Create `RemoteLayer` instances for layers that exist only on the remote. + /// The list of layers on the remote comes from `index_part`. + /// The list of local layers is given by the layer map's `iter_historic_layers()`. + /// So, the layer map must have been loaded already. + /// 3. Schedule upload of local-only layer files (which will then also update the remote + /// IndexPart to include the new layer files). /// - /// This is used during tenant attach. The layer map must have been loaded - /// with local filesystem contents already. 
- /// - /// The caller should provide IndexPart if it exists on the remote storage. If it's None, - /// we assume that it is missing on the remote storage, which means that we initialized - /// a timeline and then restarted before successful upload was performed + /// Refer to the `storage_sync` module comment for more context. /// + /// # TODO + /// May be a bit cleaner to do things based on populated remote client, + /// and then do things based on its upload_queue.latest_files. #[instrument(skip(self, index_part, up_to_date_metadata))] pub async fn reconcile_with_remote( &self, @@ -1166,50 +1328,46 @@ impl Timeline { let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn(); - // Build a map of local layers for quick lookups - let mut local_filenames: HashSet = HashSet::new(); - for layer in self.layers.read().unwrap().iter_historic_layers() { - local_filenames.insert(layer.filename()); - } + let local_layers = self + .layers + .read() + .unwrap() + .iter_historic_layers() + .map(|l| (l.filename(), l)) + .collect::>(); - let local_only_filenames = match index_part { + let local_only_layers = match index_part { Some(index_part) => { info!( "initializing upload queue from remote index with {} layer files", index_part.timeline_layers.len() ); remote_client.init_upload_queue(index_part)?; - let local_only_filenames = self - .download_missing( - index_part, - remote_client, - local_filenames, - disk_consistent_lsn, - ) - .await?; - local_only_filenames + self.create_remote_layers(index_part, local_layers, disk_consistent_lsn) + .await? } None => { info!("initializing upload queue as empty"); remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?; - local_filenames + local_layers } }; // Are there local files that don't exist remotely? Schedule uploads for them - let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); - for fname in &local_only_filenames { - let absolute = timeline_path.join(fname); - let sz = absolute + for (layer_name, layer) in &local_only_layers { + // XXX solve this in the type system + let layer_path = layer + .local_path() + .expect("local_only_layers only contains local layers"); + let layer_size = layer_path .metadata() - .with_context(|| format!("failed to get file {} metadata", fname.display()))? + .with_context(|| format!("failed to get file {layer_path:?} metadata"))? .len(); - info!("scheduling {} for upload", fname.display()); - remote_client.schedule_layer_file_upload(&absolute, &LayerFileMetadata::new(sz))?; - } - if !local_only_filenames.is_empty() { - remote_client.schedule_index_upload(up_to_date_metadata)?; + info!("scheduling {layer_path:?} for upload"); + remote_client + .schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?; } + remote_client.schedule_index_upload_for_file_changes()?; info!("Done"); @@ -1217,70 +1375,186 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { - // Atomically check if the timeline size calculation had already started. - // If the flag was not already set, this sets it. - if !self - .initial_size_computation_started - .swap(true, AtomicOrdering::SeqCst) + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) + .try_acquire_owned() { - // We need to start the computation task. 
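// Illustrative sketch, not part of this patch: the `initial_size_computation`
// semaphore acquired just above acts as a "run at most once, retry only after
// failure" latch. `try_acquire_owned` admits a single caller, `permit.forget()`
// keeps the gate closed after success, and dropping the permit on failure lets
// a later caller try again. A condensed version of the pattern, with
// hypothetical names (`run_once`, `expensive_computation`):
async fn run_once(gate: Arc<tokio::sync::Semaphore>) {
    let permit = match gate.try_acquire_owned() {
        Ok(permit) => permit,
        // Already running, or already finished successfully.
        Err(tokio::sync::TryAcquireError::NoPermits) => return,
        Err(tokio::sync::TryAcquireError::Closed) => unreachable!("never closed"),
    };
    match expensive_computation().await {
        // Success: consume the permit for good so the work is never redone.
        Ok(_value) => permit.forget(),
        // Failure: drop the permit so a future call may retry.
        Err(_err) => drop(permit),
    }
}

async fn expensive_computation() -> Result<u64, &'static str> {
    Ok(42)
}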
- let self_clone = Arc::clone(self); - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_id), - Some(self.timeline_id), - "initial size calculation", - false, - async move { - let mut timeline_state_updates = self_clone.subscribe_for_state_updates(); - let self_calculation = Arc::clone(&self_clone); - tokio::select! { - calculation_result = spawn_blocking(move || self_calculation.calculate_logical_size(init_lsn)) => { - let calculated_size = calculation_result - .context("Failed to spawn calculation result task")? - .context("Failed to calculate logical size")?; - match self_clone.current_logical_size.initial_logical_size.set(calculated_size) { - Ok(()) => info!("Successfully calculated initial logical size"), - Err(existing_size) => error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing"), - } - Ok(()) - }, - new_event = async { - loop { - match timeline_state_updates.changed().await { - Ok(()) => { - let new_state = *timeline_state_updates.borrow(); - match new_state { - // we're running this job for active timelines only - TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return Some(new_state), - } - } - Err(_sender_dropped_error) => return None, - } - } - } => { - match new_event { - Some(new_state) => info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"), - None => info!("Timeline dropped state updates sender, stopping init size calculation"), - } - Ok(()) - }, + Ok(permit) => permit, + Err(TryAcquireError::NoPermits) => { + // computation already ongoing or finished with success + return; + } + Err(TryAcquireError::Closed) => unreachable!("we never call close"), + }; + debug_assert!(self + .current_logical_size + .initial_logical_size + .get() + .is_none()); + // We need to start the computation task. + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "initial size calculation", + false, + // NB: don't log errors here, task_mgr will do that. + async move { + let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + { + Ok(s) => s, + Err(CalculateLogicalSizeError::Cancelled) => { + // Don't make noise, this is a common task. + // In the unlikely case that there ihs another call to this function, we'll retry + // because initial_logical_size is still None. + info!("initial size calculation cancelled, likely timeline delete / tenant detach"); + return Ok(()); } - }.instrument(info_span!("initial_logical_size_calculation", tenant = %self.tenant_id, timeline = %self.timeline_id)), - ); + x @ Err(_) => x.context("Failed to calculate logical size")?, + }; + match self_clone + .current_logical_size + .initial_logical_size + .set(calculated_size) + { + Ok(()) => (), + Err(existing_size) => { + // This shouldn't happen because the semaphore is initialized with 1. + // But if it happens, just complain & report success so there are no further retries. 
+ error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing") + } + } + // now that `initial_logical_size.is_some()`, reduce permit count to 0 + // so that we prevent future callers from spawning this task + permit.forget(); + Ok(()) + }, + ); + } + + pub fn spawn_ondemand_logical_size_calculation( + self: &Arc, + lsn: Lsn, + ) -> oneshot::Receiver> { + let (sender, receiver) = oneshot::channel(); + let self_clone = Arc::clone(self); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::InitialLogicalSizeCalculation, + Some(self.tenant_id), + Some(self.timeline_id), + "ondemand logical size calculation", + false, + async move { + let res = self_clone.logical_size_calculation_task(lsn).await; + let _ = sender.send(res).ok(); + Ok(()) // Receiver is responsible for handling errors + }, + ); + receiver + } + + #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] + async fn logical_size_calculation_task( + self: &Arc, + init_lsn: Lsn, + ) -> Result { + let mut timeline_state_updates = self.subscribe_for_state_updates(); + let self_calculation = Arc::clone(self); + let cancel = CancellationToken::new(); + + let calculation = async { + let cancel = cancel.child_token(); + tokio::task::spawn_blocking(move || { + // Run in a separate thread since this can do a lot of + // synchronous file IO without .await inbetween + // if there are no RemoteLayers that would require downloading. + let h = tokio::runtime::Handle::current(); + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + }) + .await + .context("Failed to spawn calculation result task")? + }; + let timeline_state_cancellation = async { + loop { + match timeline_state_updates.changed().await { + Ok(()) => { + let new_state = *timeline_state_updates.borrow(); + match new_state { + // we're running this job for active timelines only + TimelineState::Active => continue, + TimelineState::Broken + | TimelineState::Stopping + | TimelineState::Suspended => { + break format!("aborted because timeline became inactive (new state: {new_state:?})") + } + } + } + Err(_sender_dropped_error) => { + // can't happen, the sender is not dropped as long as the Timeline exists + break "aborted because state watch was dropped".to_string(); + } + } + } + }; + + let taskmgr_shutdown_cancellation = async { + task_mgr::shutdown_watcher().await; + "aborted because task_mgr shutdown requested".to_string() + }; + + tokio::pin!(calculation); + loop { + tokio::select! { + res = &mut calculation => { return res } + reason = timeline_state_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; + } + reason = taskmgr_shutdown_cancellation => { + debug!(reason = reason, "cancelling calculation"); + cancel.cancel(); + return calculation.await; + } + } } } /// Calculate the logical size of the database at the latest LSN. /// /// NOTE: counted incrementally, includes ancestors, this can be a slow operation. - pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result { + async fn calculate_logical_size( + &self, + up_to_lsn: Lsn, + cancel: CancellationToken, + ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); + // These failpoints are used by python tests to ensure that we don't delete + // the timeline while the logical size computation is ongoing. 
+ // The first failpoint is used to make this function pause. + // Then the python test initiates timeline delete operation in a thread. + // It waits for a few seconds, then arms the second failpoint and disables + // the first failpoint. The second failpoint prints an error if the timeline + // delete code has deleted the on-disk state while we're still running here. + // It shouldn't do that. If it does it anyway, the error will be caught + // by the test suite, highlighting the problem. + fail::fail_point!("timeline-calculate-logical-size-pause"); + fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { + if !self + .conf + .metadata_path(self.timeline_id, self.tenant_id) + .exists() + { + error!("timeline-calculate-logical-size-pre metadata file does not exist") + } + // need to return something + Ok(0) + }); let timer = if up_to_lsn == self.initdb_lsn { if let Some(size) = self.current_logical_size.initialized_size() { if size != 0 { @@ -1296,7 +1570,9 @@ impl Timeline { } else { self.metrics.logical_size_histo.start_timer() }; - let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?; + let logical_size = self + .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); Ok(logical_size) @@ -1320,7 +1596,45 @@ impl Timeline { Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"), } } +} +type TraversalId = String; + +trait TraversalLayerExt { + fn traversal_id(&self) -> TraversalId; +} + +impl TraversalLayerExt for Arc { + fn traversal_id(&self) -> TraversalId { + match self.local_path() { + Some(local_path) => { + debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())), + "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary", + ); + format!("{}", local_path.display()) + } + None => { + format!( + "remote {}/{}", + self.get_timeline_id(), + self.filename().file_name() + ) + } + } + } +} + +impl TraversalLayerExt for Arc { + fn traversal_id(&self) -> TraversalId { + format!( + "timeline {} in-memory {}", + self.get_timeline_id(), + self.short_id() + ) + } +} + +impl Timeline { /// /// Get a handle to a Layer for reading. /// @@ -1334,14 +1648,15 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, - ) -> anyhow::Result<()> { + ) -> PageReconstructResult<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. - let mut traversal_path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + let mut traversal_path = + Vec::<(ValueReconstructResult, Lsn, Box)>::new(); let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img { *cached_lsn @@ -1361,12 +1676,12 @@ impl Timeline { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn); match result { - ValueReconstructResult::Complete => return Ok(()), + ValueReconstructResult::Complete => return PageReconstructResult::Success(()), ValueReconstructResult::Continue => { // If we reached an earlier cached page image, we're done. 
if cont_lsn == cached_lsn + 1 { self.metrics.materialized_page_cache_hit_counter.inc_by(1); - return Ok(()); + return PageReconstructResult::Success(()); } if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid @@ -1399,7 +1714,10 @@ impl Timeline { timeline.ancestor_lsn, cont_lsn ); - let ancestor = timeline.get_ancestor_timeline()?; + let ancestor = match timeline.get_ancestor_timeline() { + Ok(timeline) => timeline, + Err(e) => return PageReconstructResult::from(e), + }; timeline_owned = ancestor; timeline = &*timeline_owned; prev_lsn = Lsn(u64::MAX); @@ -1417,13 +1735,16 @@ impl Timeline { // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. let lsn_floor = max(cached_lsn + 1, start_lsn); - result = open_layer.get_value_reconstruct_data( + result = match open_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, open_layer.clone())); + traversal_path.push((result, cont_lsn, Box::new(open_layer.clone()))); continue; } } @@ -1432,28 +1753,43 @@ impl Timeline { if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); - result = frozen_layer.get_value_reconstruct_data( + result = match frozen_layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, frozen_layer.clone())); + traversal_path.push((result, cont_lsn, Box::new(frozen_layer.clone()))); continue 'outer; } } - if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? { + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); + // If it's a remote layer, the caller can do the download and retry. + if let Some(remote_layer) = super::storage_layer::downcast_remote_layer(&layer) { + info!("need remote layer {}", layer.traversal_id()); + return PageReconstructResult::NeedsDownload( + Weak::clone(&timeline.myself), + Arc::downgrade(&remote_layer), + ); + } + let lsn_floor = max(cached_lsn + 1, lsn_floor); - result = layer.get_value_reconstruct_data( + result = match layer.get_value_reconstruct_data( key, lsn_floor..cont_lsn, reconstruct_state, - )?; + ) { + Ok(result) => result, + Err(e) => return PageReconstructResult::from(e), + }; cont_lsn = lsn_floor; - traversal_path.push((result, cont_lsn, layer)); + traversal_path.push((result, cont_lsn, Box::new(layer.clone()))); } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -1668,7 +2004,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. 
- #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))] + #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer @@ -1677,9 +2013,11 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - let (partitioning, _lsn) = - self.repartition(self.initdb_lsn, self.get_compaction_target_size())?; - self.create_image_layers(&partitioning, self.initdb_lsn, true)? + let (partitioning, _lsn) = self + .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .await?; + self.create_image_layers(&partitioning, self.initdb_lsn, true) + .await? } else { // normal case, write out a L0 delta layer file. let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; @@ -1727,7 +2065,7 @@ impl Timeline { fn update_metadata_file( &self, disk_consistent_lsn: Lsn, - layer_paths_to_upload: HashMap, + layer_paths_to_upload: HashMap, ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. We only track @@ -1776,13 +2114,9 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { - remote_client - .schedule_layer_file_upload(&path, &layer_metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; } - remote_client - .schedule_index_upload(&metadata) - .context("schedule_layer_file_upload")?; + remote_client.schedule_index_upload_for_metadata_update(&metadata)?; } Ok(()) @@ -1792,10 +2126,11 @@ impl Timeline { fn create_delta_layer( &self, frozen_layer: &InMemoryLayer, - ) -> anyhow::Result<(PathBuf, LayerFileMetadata)> { + ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { // Write it out let new_delta = frozen_layer.write_to_disk()?; let new_delta_path = new_delta.path(); + let new_delta_filename = new_delta.filename(); // Sync it to disk. 
// @@ -1820,23 +2155,36 @@ impl Timeline { // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); - self.metrics.current_physical_size_gauge.add(sz); + self.metrics.resident_physical_size_gauge.add(sz); // update metrics self.metrics.num_persistent_files_created.inc_by(1); self.metrics.persistent_bytes_written.inc_by(sz); - Ok((new_delta_path, LayerFileMetadata::new(sz))) + Ok((new_delta_filename, LayerFileMetadata::new(sz))) } - fn repartition(&self, lsn: Lsn, partition_size: u64) -> anyhow::Result<(KeyPartitioning, Lsn)> { - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 == Lsn(0) - || lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold + async fn repartition( + &self, + lsn: Lsn, + partition_size: u64, + ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { - let keyspace = self.collect_keyspace(lsn)?; - let partitioning = keyspace.partition(partition_size); + let partitioning_guard = self.partitioning.lock().unwrap(); + if partitioning_guard.1 != Lsn(0) + && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold + { + // no repartitioning needed + return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); + } + } + let keyspace = self.collect_keyspace(lsn).await?; + let partitioning = keyspace.partition(partition_size); + + let mut partitioning_guard = self.partitioning.lock().unwrap(); + if lsn > partitioning_guard.1 { *partitioning_guard = (partitioning, lsn); - return Ok((partitioning_guard.0.clone(), lsn)); + } else { + warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); } Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } @@ -1882,12 +2230,12 @@ impl Timeline { Ok(false) } - fn create_image_layers( + async fn create_image_layers( &self, partitioning: &KeyPartitioning, lsn: Lsn, force: bool, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); for partition in partitioning.parts.iter() { @@ -1909,7 +2257,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn) { + let img = match self.get_download(key, lsn).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -1965,13 +2313,16 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { - let path = l.path(); - let metadata = path.metadata()?; + let path = l.filename(); + let metadata = timeline_path.join(path.file_name()).metadata()?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); layers.insert_historic(Arc::new(l)); } layers.rebuild_index(); @@ -1984,7 +2335,7 @@ impl Timeline { #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, - deltas_to_compact: Vec>, + deltas_to_compact: Vec>, } impl Timeline { @@ -2042,7 +2393,7 @@ impl Timeline { level0_deltas.len() ); for l in deltas_to_compact.iter() { - info!("compact includes {}", l.filename().display()); + info!("compact includes {}", l.filename().file_name()); } // We don't need the original list of layers anymore. 
Drop it so that // we don't accidentally use it later in the function. @@ -2050,38 +2401,40 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = deltas_to_compact - .iter() - .map(|l| l.iter()) - .kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, + let all_values_iter = + itertools::process_results(deltas_to_compact.iter().map(|l| l.iter()), |iter_iter| { + iter_iter.kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + } else { + false } } else { - false + true } - } else { - true - } - }); + }) + })?; // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = deltas_to_compact - .iter() - .map(|l| l.key_iter()) - .kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - match a_key.cmp(b_key) { - Ordering::Less => true, - Ordering::Equal => a_lsn <= b_lsn, - Ordering::Greater => false, - } - }); + let mut all_keys_iter = itertools::process_results( + deltas_to_compact.iter().map(|l| l.key_iter()), + |iter_iter| { + iter_iter.kmerge_by(|a, b| { + let (a_key, a_lsn, _) = a; + let (b_key, b_lsn, _) = b; + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => a_lsn <= b_lsn, + Ordering::Greater => false, + } + }) + }, + )?; // Merge the contents of all the input delta layers into a new set // of delta layers, based on the current partitioning. @@ -2251,11 +2604,16 @@ impl Timeline { deltas_to_compact, } = self.compact_level0_phase1(target_file_size).await?; + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + // Before deleting any layers, we need to wait for their upload ops to finish. // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. 
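// Illustrative sketch, not part of this patch: the `itertools::process_results`
// wrappers above surface a failure while constructing any per-layer iterator as
// a single `Result` around the whole merge (note the trailing `?`), while the
// k-way merge itself stays a plain iterator adaptor. The same idea in isolation,
// with hypothetical, already-sorted inputs:
fn merge_sorted_sources(
    sources: Vec<Result<Vec<u64>, std::io::Error>>,
) -> Result<Vec<u64>, std::io::Error> {
    use itertools::Itertools;
    // If producing any source failed, return that error; otherwise merge the
    // sorted sources into one sorted Vec.
    itertools::process_results(sources, |iters| {
        iters.kmerge_by(|a, b| a <= b).collect()
    })
}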
if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2271,29 +2629,31 @@ impl Timeline { if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( - &new_delta_path, + &l.filename(), &LayerFileMetadata::new(metadata.len()), )?; } // update the timeline's physical size - self.metrics.current_physical_size_gauge.add(metadata.len()); + self.metrics + .resident_physical_size_gauge + .add(metadata.len()); new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); - layers.insert_historic(Arc::new(l)); - layers.rebuild_index(); + let x: Arc = Arc::new(l); + layers.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can // delete the old ones - let mut layer_paths_to_delete = Vec::with_capacity(deltas_to_compact.len()); + let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { if let Some(path) = l.local_path() { self.metrics - .current_physical_size_gauge + .resident_physical_size_gauge .sub(path.metadata()?.len()); - layer_paths_to_delete.push(path); } + layer_names_to_delete.push(l.filename()); l.delete()?; layers.remove_historic(l); } @@ -2302,7 +2662,7 @@ impl Timeline { // Also schedule the deletions in remote storage if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?; + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } Ok(()) @@ -2334,55 +2694,71 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - pub(super) fn update_gc_info( + /// + /// NOTE: This function holds a short-lived lock to protect the 'gc_info' + /// field, so that the three values passed as argument are stored + /// atomically. But the caller is responsible for ensuring that no new + /// branches are created that would need to be included in 'retain_lsns', + /// for example. The caller should hold `Tenant::gc_cs` lock to ensure + /// that. + /// + pub(super) async fn update_gc_info( &self, retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, ) -> anyhow::Result<()> { - let mut gc_info = self.gc_info.write().unwrap(); - - gc_info.horizon_cutoff = cutoff_horizon; - gc_info.retain_lsns = retain_lsns; - - // Calculate pitr cutoff point. - // If we cannot determine a cutoff LSN, be conservative and don't GC anything. - let mut pitr_cutoff_lsn: Lsn; - - if pitr != Duration::ZERO { - // conservative, safe default is to remove nothing, when we have no - // commit timestamp data available - pitr_cutoff_lsn = *self.get_latest_gc_cutoff_lsn(); - - // First, calculate pitr_cutoff_timestamp and then convert it to LSN. - // If we don't have enough data to convert to LSN, - // play safe and don't remove any layers. + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. + // + // Some unit tests depend on garbage-collection working even when + // CLOG data is missing, so that find_lsn_for_timestamp() doesn't + // work, so avoid calling it altogether if time-based retention is not + // configured. It would be pointless anyway. 
+ let pitr_cutoff = if pitr != Duration::ZERO { let now = SystemTime::now(); if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp)? { - LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn, + match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await? + { + LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { + // The timestamp is in the future. That sounds impossible, + // but what it really means is that there hasn't been + // any commits since the cutoff timestamp. debug!("future({})", lsn); - pitr_cutoff_lsn = gc_info.horizon_cutoff; + cutoff_horizon } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } LsnForTimestamp::NoData(lsn) => { debug!("nodata({})", lsn); + // conservative, safe default is to remove nothing, when we + // have no commit timestamp data available + *self.get_latest_gc_cutoff_lsn() } } - debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn) + } else { + // If we don't have enough data to convert to LSN, + // play safe and don't remove any layers. + *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention. (Some unit tests depend on garbage-collection - // working even when CLOG data is missing, so that find_lsn_for_timestamp() - // above doesn't work.) - pitr_cutoff_lsn = gc_info.horizon_cutoff; - } - gc_info.pitr_cutoff = pitr_cutoff_lsn; + // No time-based retention was configured. Set time-based cutoff to + // same as LSN based. + cutoff_horizon + }; + + // Grab the lock and update the values + *self.gc_info.write().unwrap() = GcInfo { + retain_lsns, + horizon_cutoff: cutoff_horizon, + pitr_cutoff, + }; Ok(()) } @@ -2456,9 +2832,6 @@ impl Timeline { ); write_guard.store_and_unlock(new_gc_cutoff).wait(); } - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2468,7 +2841,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2487,23 +2860,13 @@ impl Timeline { // let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { - // This layer is in the process of being flushed to disk. - // It will be swapped out of the layer map, replaced with - // on-disk layers containing the same data. - // We can't GC it, as it's not on disk. We can't remove it - // from the layer map yet, as it would make its data - // inaccessible. - if l.is_in_memory() { - continue; - } - result.layers_total += 1; // 1. Is it newer than GC horizon cutoff point? 
if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename().display(), + l.filename().file_name(), horizon_cutoff ); result.layers_needed_by_cutoff += 1; @@ -2514,7 +2877,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename().display(), + l.filename().file_name(), pitr_cutoff ); result.layers_needed_by_pitr += 1; @@ -2531,7 +2894,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename().display(), + l.filename().file_name(), retain_lsn, l.is_incremental(), ); @@ -2564,7 +2927,7 @@ impl Timeline { { debug!( "keeping {} because it is the latest layer", - l.filename().display() + l.filename().file_name() ); result.layers_not_updated += 1; continue 'outer; @@ -2573,26 +2936,40 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename().display(), + l.filename().file_name(), l.is_incremental(), ); layers_to_remove.push(Arc::clone(&l)); } - // Actually delete the layers from disk and remove them from the map. - // (couldn't do this in the loop above, because you cannot modify a collection - // while iterating it. BTreeMap::retain() would be another option) - let mut layer_paths_to_delete = Vec::with_capacity(layers_to_remove.len()); - for doomed_layer in layers_to_remove { - if let Some(path) = doomed_layer.local_path() { - self.metrics - .current_physical_size_gauge - .sub(path.metadata()?.len()); - layer_paths_to_delete.push(path); + if !layers_to_remove.is_empty() { + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + + // Actually delete the layers from disk and remove them from the map. + // (couldn't do this in the loop above, because you cannot modify a collection + // while iterating it. BTreeMap::retain() would be another option) + let mut layer_names_to_delete = Vec::with_capacity(layers_to_remove.len()); + for doomed_layer in layers_to_remove { + if let Some(path) = doomed_layer.local_path() { + self.metrics + .resident_physical_size_gauge + .sub(path.metadata()?.len()); + } + layer_names_to_delete.push(doomed_layer.filename()); + doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? 
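+                // Drop the layer from the layer map as well, so readers stop
+                // seeing it; the corresponding remote deletions are scheduled
+                // in one batch after this loop.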
+ layers.remove_historic(doomed_layer); + result.layers_removed += 1; + } + + if result.layers_removed != 0 { + fail_point!("after-timeline-gc-removed-layers"); + } + + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } - doomed_layer.delete()?; - layers.remove_historic(doomed_layer); - result.layers_removed += 1; } layers.rebuild_index(); @@ -2601,14 +2978,6 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_layer_file_deletion(&layer_paths_to_delete)?; - } - result.elapsed = now.elapsed()?; Ok(result) } @@ -2621,7 +2990,7 @@ impl Timeline { key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - ) -> anyhow::Result { + ) -> PageReconstructResult { // Perform WAL redo if needed data.records.reverse(); @@ -2633,9 +3002,11 @@ impl Timeline { key, img_lsn ); - Ok(img.clone()) + PageReconstructResult::Success(img.clone()) } else { - bail!("base image for {} at {} not found", key, request_lsn); + PageReconstructResult::from(anyhow!( + "base image for {key} at {request_lsn} not found" + )) } } else { // We need to do WAL redo. @@ -2643,36 +3014,38 @@ impl Timeline { // If we don't have a base image, then the oldest WAL record better initialize // the page if data.img.is_none() && !data.records.first().unwrap().1.will_init() { - bail!( + PageReconstructResult::from(anyhow!( "Base image for {} at {} not found, but got {} WAL records", key, request_lsn, data.records.len() - ); + )) } else { - let base_img = if let Some((_lsn, img)) = data.img { + if data.img.is_some() { trace!( "found {} WAL records and a base image for {} at {}, performing WAL redo", data.records.len(), key, request_lsn ); - Some(img) } else { trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); - None }; let last_rec_lsn = data.records.last().unwrap().0; - let img = self + let img = match self .walredo_mgr - .request_redo(key, request_lsn, base_img, data.records, self.pg_version) - .context("Failed to reconstruct a page image:")?; + .request_redo(key, request_lsn, data.img, data.records, self.pg_version) + .context("Failed to reconstruct a page image:") + { + Ok(img) => img, + Err(e) => return PageReconstructResult::from(e), + }; if img.len() == page_cache::PAGE_SZ { let cache = page_cache::get(); - cache + if let Err(e) = cache .memorize_materialized_page( self.tenant_id, self.timeline_id, @@ -2680,10 +3053,324 @@ impl Timeline { last_rec_lsn, &img, ) - .context("Materialized page memoization failed")?; + .context("Materialized page memoization failed") + { + return PageReconstructResult::from(e); + } } - Ok(img) + PageReconstructResult::Success(img) + } + } + } + + /// Download a layer file from remote storage and insert it into the layer map. + /// + /// It's safe to call this function for the same layer concurrently. In that case: + /// - If the layer has already been downloaded, `OK(...)` is returned. + /// - If the layer is currently being downloaded, we wait until that download succeeded / failed. + /// - If it succeeded, we return `Ok(...)`. + /// - If it failed, we or another concurrent caller will initiate a new download attempt. + /// + /// Download errors are classified and retried if appropriate by the underlying RemoteTimelineClient function. 
+ /// It has an internal limit for the maximum number of retries and prints appropriate log messages. + /// If we exceed the limit, it returns an error, and this function passes it through. + /// The caller _could_ retry further by themselves by calling this function again, but _should not_ do it. + /// The reason is that they cannot distinguish permanent errors from temporary ones, whereas + /// the underlying RemoteTimelineClient can. + /// + /// There is no internal timeout or slowness detection. + /// If the caller has a deadline or needs a timeout, they can simply stop polling: + /// we're **cancellation-safe** because the download happens in a separate task_mgr task. + /// So, the current download attempt will run to completion even if we stop polling. + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))] + pub async fn download_remote_layer( + self: Arc, + remote_layer: Arc, + ) -> anyhow::Result<()> { + let permit = match Arc::clone(&remote_layer.ongoing_download) + .acquire_owned() + .await + { + Ok(permit) => permit, + Err(_closed) => { + info!("download of layer has already finished"); + return Ok(()); + } + }; + + let (sender, receiver) = tokio::sync::oneshot::channel(); + // Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline. + task_mgr::spawn( + &tokio::runtime::Handle::current(), + TaskKind::RemoteDownloadTask, + Some(self.tenant_id), + Some(self.timeline_id), + &format!("download layer {}", remote_layer.short_id()), + false, + async move { + let remote_client = self.remote_client.as_ref().unwrap(); + + // Does retries + exponential back-off internally. + // When this fails, don't layer further retry attempts here. + let result = remote_client + .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .await; + + if let Ok(size) = &result { + // XXX the temp file is still around in Err() case + // and consumes space until we clean up upon pageserver restart. + self.metrics.resident_physical_size_gauge.add(*size); + + // Download complete. Replace the RemoteLayer with the corresponding + // Delta- or ImageLayer in the layer map. + let new_layer = remote_layer.create_downloaded_layer(self.conf, *size); + let mut layers = self.layers.write().unwrap(); + { + let l: Arc = remote_layer.clone(); + layers.remove_historic(l); + } + layers.insert_historic(new_layer); + layers.rebuild_index(); + drop(layers); + + // Now that we've inserted the download into the layer map, + // close the semaphore. This will make other waiters for + // this download return Ok(()). + assert!(!remote_layer.ongoing_download.is_closed()); + remote_layer.ongoing_download.close(); + } else { + // Keep semaphore open. We'll drop the permit at the end of the function. + } + + // Don't treat it as an error if the task that triggered the download + // is no longer interested in the result. + sender.send(result.map(|_sz| ())).ok(); + + // In case we failed and there are other waiters, this will make one + // of them retry the download in a new task. + // XXX: This resets the exponential backoff because it's a new call to + // download_layer file. + drop(permit); + + Ok(()) + }, + ); + + receiver.await.context("download task cancelled")? 
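The concurrency story above hinges on one trick: a single-permit `tokio::sync::Semaphore` doubles as a "download already done" flag once it is closed, while dropping the permit on failure hands the next waiter a fresh attempt. A minimal standalone sketch of that pattern, independent of the pageserver types (`run_download` is a hypothetical stand-in for the real download call):

    use std::sync::Arc;
    use tokio::sync::Semaphore;

    // `gate` plays the role of `RemoteLayer::ongoing_download` above.
    async fn download_once<F, Fut>(gate: Arc<Semaphore>, run_download: F) -> anyhow::Result<()>
    where
        F: FnOnce() -> Fut,
        Fut: std::future::Future<Output = anyhow::Result<()>>,
    {
        let permit = match Arc::clone(&gate).acquire_owned().await {
            Ok(permit) => permit,
            // The semaphore was closed: a previous caller already finished the download.
            Err(_closed) => return Ok(()),
        };
        match run_download().await {
            Ok(()) => {
                // Success: close the semaphore so current and future callers see "done".
                gate.close();
                Ok(())
            }
            Err(e) => {
                // Failure: release the permit so another waiter can start a fresh attempt.
                drop(permit);
                Err(e)
            }
        }
    }

In `download_remote_layer` itself, the failed attempt's error is additionally forwarded to the waiting caller through the oneshot channel, and the whole attempt runs in a `task_mgr` task so it keeps going even if the caller stops polling.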
+ } + + pub async fn spawn_download_all_remote_layers( + self: Arc, + ) -> Result { + let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap(); + if let Some(st) = &*status_guard { + match &st.state { + DownloadRemoteLayersTaskState::Running => { + return Err(st.clone()); + } + DownloadRemoteLayersTaskState::ShutDown + | DownloadRemoteLayersTaskState::Completed => { + *status_guard = None; + } + } + } + + let self_clone = Arc::clone(&self); + let task_id = task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::DownloadAllRemoteLayers, + Some(self.tenant_id), + Some(self.timeline_id), + "download all remote layers task", + false, + async move { + self_clone.download_all_remote_layers().await; + let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap(); + match &mut *status_guard { + None => { + warn!("tasks status is supposed to be Some(), since we are running"); + } + Some(st) => { + let exp_task_id = format!("{}", task_mgr::current_task_id().unwrap()); + if st.task_id != exp_task_id { + warn!("task id changed while we were still running, expecting {} but have {}", exp_task_id, st.task_id); + } else { + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + }; + Ok(()) + } + .instrument(info_span!(parent: None, "download_all_remote_layers", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); + + let initial_info = DownloadRemoteLayersTaskInfo { + task_id: format!("{task_id}"), + state: DownloadRemoteLayersTaskState::Running, + total_layer_count: 0, + successful_download_count: 0, + failed_download_count: 0, + }; + *status_guard = Some(initial_info.clone()); + + Ok(initial_info) + } + + async fn download_all_remote_layers(self: &Arc) { + let mut downloads: FuturesUnordered<_> = { + let layers = self.layers.read().unwrap(); + layers + .iter_historic_layers() + .filter_map(|l| l.downcast_remote_layer()) + .map({ + |l| { + let self_clone = Arc::clone(self); + self_clone.download_remote_layer(l) + } + }) + .collect() + }; + + macro_rules! lock_status { + ($st:ident) => { + let mut st = self.download_all_remote_layers_task_info.write().unwrap(); + let st = st + .as_mut() + .expect("this function is only called after the task has been spawned"); + assert_eq!( + st.task_id, + format!( + "{}", + task_mgr::current_task_id().expect("we run inside a task_mgr task") + ) + ); + let $st = st; + }; + } + + { + lock_status!(st); + st.total_layer_count = downloads.len().try_into().unwrap(); + } + loop { + tokio::select! { + dl = downloads.next() => { + lock_status!(st); + match dl { + None => break, + Some(Ok(())) => { + st.successful_download_count += 1; + }, + Some(Err(e)) => { + error!(error = %e, "layer download failed"); + st.failed_download_count += 1; + } + } + } + _ = task_mgr::shutdown_watcher() => { + // Kind of pointless to watch for shutdowns here, + // as download_remote_layer spawns other task_mgr tasks internally. + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::ShutDown; + } + } + } + { + lock_status!(st); + st.state = DownloadRemoteLayersTaskState::Completed; + } + } + + pub fn get_download_all_remote_layers_task_info(&self) -> Option { + self.download_all_remote_layers_task_info + .read() + .unwrap() + .clone() + } +} + +/// Helper function to deal with [`PageReconstructResult`]. +/// +/// Takes a sync closure that returns a [`PageReconstructResult`]. +/// If it is [`PageReconstructResult::NeedsDownload`], +/// do the download and retry the closure. 
+///
+/// ### Background
+///
+/// This is a crutch to make on-demand downloads efficient in
+/// our async-sync-async sandwich codebase. Some context:
+///
+/// - The code that does the downloads uses async Rust.
+/// - The code that initiates download is many levels of sync Rust.
+/// - The sync code must wait for the download to finish to
+///   make further progress.
+/// - The sync code is invoked directly from async functions upstack.
+///
+/// Example (there are also much worse ones where the sandwich is taller)
+///
+///     async handle_get_page_at_lsn_request    page_service.rs
+///     sync  get_rel_page_at_lsn               timeline.rs
+///     sync  timeline.get                      timeline.rs
+///     sync  get_reconstruct_data              timeline.rs
+///     async download_remote_layer             timeline.rs
+///
+/// It is not possible to call `Timeline::download_remote_layer().await` within
+/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`]
+/// which contains references to the [`Timeline`] and [`RemoteLayer`].
+/// We bubble that error upstack to the async code, which can then call
+/// `Timeline::download_remote_layer().await`.
+/// That is _efficient_ because tokio can use the same OS thread to do
+/// other work while we're waiting for the download.
+///
+/// It is a deliberate decision to use a new result type to communicate
+/// the need for download instead of adding another variant to [`PageReconstructError`].
+/// The reason is that with the latter approach, any place that does
+/// `?` on a `Result` will implicitly ignore the
+/// need for download. We want that to be explicit, so that
+/// - the code base becomes greppable for places that don't do a download
+/// - future code changes will need to explicitly address on-demand download
+///
+/// Alternatives to consider in the future:
+///
+/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
+///   and use it to block_on the download_remote_layer future.
+///   That is obviously inefficient as it creates one thread per download.
+/// - Convert everything to async. The problem here is that the sync
+///   functions are used by many other sync functions. So, the scope
+///   creep of such a conversion is tremendous.
+/// - Compromise between the two: implement async functions for each sync
+///   function. Switch over the hot code paths (GetPage()) to use the
+///   async path, so that the hot path doesn't spawn threads. Other code
+///   paths would remain sync initially, and get converted to async over time.
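A hedged usage sketch for the helper defined just below, modeled on the `find_lsn_for_timestamp` call in `update_gc_info` earlier in this patch. The wrapper name is made up, and it assumes `Timeline::get_rel_exists` returns `PageReconstructResult<bool>`, as the `walingest.rs` changes below suggest:

    // Illustrative only: an async wrapper around a sync Timeline accessor that
    // may hit a not-yet-downloaded RemoteLayer.
    async fn rel_exists_with_download(
        timeline: &Timeline,
        rel: RelTag,
        lsn: Lsn,
    ) -> anyhow::Result<bool> {
        // The closure is re-run after each successful on-demand download.
        with_ondemand_download(|| timeline.get_rel_exists(rel, lsn, true)).await
    }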
+/// +pub async fn with_ondemand_download(mut f: F) -> Result +where + F: Send + FnMut() -> PageReconstructResult, + T: Send, +{ + loop { + let closure_result = f(); + match closure_result { + PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => { + // if the timeline is gone, it has likely been deleted / tenant detached + let tl = weak_timeline.upgrade().context("timeline is gone")?; + // if the remote layer got removed, retry the function, it might succeed now + let remote_layer = match weak_remote_layer.upgrade() { + None => { + info!("remote layer is gone, retrying closure"); + continue; + } + Some(l) => l, + }; + // Does retries internally + tl.download_remote_layer(remote_layer).await?; + // Download successful, retry the closure + continue; + } + PageReconstructResult::Success(closure_value) => return Ok(closure_value), + PageReconstructResult::Error(e) => { + return Err(anyhow::Error::new(e).context("Failed to reconstruct the page")) } } } @@ -2693,8 +3380,8 @@ impl Timeline { /// to an error, as anyhow context information. fn layer_traversal_error( msg: String, - path: Vec<(ValueReconstructResult, Lsn, Arc)>, -) -> anyhow::Result<()> { + path: Vec<(ValueReconstructResult, Lsn, Box)>, +) -> PageReconstructResult<()> { // We want the original 'msg' to be the outermost context. The outermost context // is the most high-level information, which also gets propagated to the client. let mut msg_iter = path @@ -2704,7 +3391,7 @@ fn layer_traversal_error( "layer traversal: result {:?}, cont_lsn {}, layer: {}", r, c, - l.filename().display() + l.traversal_id(), ) }) .chain(std::iter::once(msg)); @@ -2712,7 +3399,8 @@ fn layer_traversal_error( let err = anyhow!(msg_iter.next().unwrap()); // Append all subsequent traversals, and the error message 'msg', as contexts. - Err(msg_iter.fold(err, |err, msg| err.context(msg))) + let msg = msg_iter.fold(err, |err, msg| err.context(msg)); + PageReconstructResult::from(msg) } /// Various functions to mutate the timeline. @@ -2772,9 +3460,9 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> { let mut new_path = path.to_owned(); for i in 0u32.. { - new_path.set_file_name(format!("{}.{}.old", filename, i)); + new_path.set_file_name(format!("{filename}.{i}.old")); if !new_path.exists() { - std::fs::rename(&path, &new_path)?; + std::fs::rename(path, &new_path)?; return Ok(()); } } diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs new file mode 100644 index 0000000000..790b2f59aa --- /dev/null +++ b/pageserver/src/tenant/upload_queue.rs @@ -0,0 +1,213 @@ +use crate::metrics::RemoteOpFileKind; + +use super::storage_layer::LayerFileName; +use crate::tenant::metadata::TimelineMetadata; +use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::index::LayerFileMetadata; +use std::collections::{HashMap, VecDeque}; +use std::fmt::Debug; + +use std::sync::Arc; +use tracing::info; + +use std::sync::atomic::AtomicU32; +use utils::lsn::Lsn; + +// clippy warns that Uninitialized is much smaller than Initialized, which wastes +// memory for Uninitialized variants. Doesn't matter in practice, there are not +// that many upload queues in a running pageserver, and most of them are initialized +// anyway. 
+#[allow(clippy::large_enum_variant)]
+pub(crate) enum UploadQueue {
+    Uninitialized,
+    Initialized(UploadQueueInitialized),
+    Stopped(UploadQueueStopped),
+}
+
+impl UploadQueue {
+    fn as_str(&self) -> &'static str {
+        match self {
+            UploadQueue::Uninitialized => "Uninitialized",
+            UploadQueue::Initialized(_) => "Initialized",
+            UploadQueue::Stopped(_) => "Stopped",
+        }
+    }
+}
+
+/// This keeps track of queued and in-progress tasks.
+pub(crate) struct UploadQueueInitialized {
+    /// Counter to assign task IDs
+    pub(crate) task_counter: u64,
+
+    /// All layer files stored in the remote storage, taking into account all
+    /// in-progress and queued operations
+    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+
+    /// How many file uploads or deletions have been scheduled since the
+    /// last metadata index upload was scheduled?
+    pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
+
+    /// Metadata stored in the remote storage, taking into account all
+    /// in-progress and queued operations.
+    /// DANGER: do not return to outside world, e.g., safekeepers.
+    pub(crate) latest_metadata: TimelineMetadata,
+
+    /// `disk_consistent_lsn` from the last metadata file that was successfully
+    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
+    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
+    /// Safekeeper can rely on it to make decisions for WAL storage.
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+
+    // Breakdown of different kinds of tasks currently in-progress
+    pub(crate) num_inprogress_layer_uploads: usize,
+    pub(crate) num_inprogress_metadata_uploads: usize,
+    pub(crate) num_inprogress_deletions: usize,
+
+    /// Tasks that are currently in-progress. In-progress means that a tokio Task
+    /// has been launched for it. An in-progress task can be busy uploading, but it can
+    /// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
+    /// be waiting for retry in `exponential_backoff`.
+    pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
+
+    /// Queued operations that have not been launched yet. They might depend on previous
+    /// tasks to finish. For example, metadata upload cannot be performed before all
+    /// preceding layer file uploads have completed.
+    pub(crate) queued_operations: VecDeque<UploadOp>,
+}
+
+pub(crate) struct UploadQueueStopped {
+    pub(crate) last_uploaded_consistent_lsn: Lsn,
+}
+
+impl UploadQueue {
+    pub(crate) fn initialize_empty_remote(
+        &mut self,
+        metadata: &TimelineMetadata,
+    ) -> anyhow::Result<&mut UploadQueueInitialized> {
+        match self {
+            UploadQueue::Uninitialized => (),
+            UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
+                anyhow::bail!("already initialized, state {}", self.as_str())
+            }
+        }
+
+        info!("initializing upload queue for empty remote");
+
+        let state = UploadQueueInitialized {
+            // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
+            latest_files: HashMap::new(),
+            latest_files_changes_since_metadata_upload_scheduled: 0,
+            latest_metadata: metadata.clone(),
+            // We haven't uploaded anything yet, so `last_uploaded_consistent_lsn` must be 0 to prevent
+            // safekeepers from garbage-collecting anything.
+ last_uploaded_consistent_lsn: Lsn(0), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialize_with_current_remote_index_part( + &mut self, + index_part: &IndexPart, + ) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized => (), + UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => { + anyhow::bail!("already initialized, state {}", self.as_str()) + } + } + + let mut files = HashMap::with_capacity(index_part.timeline_layers.len()); + for layer_name in &index_part.timeline_layers { + let layer_metadata = index_part + .layer_metadata + .get(layer_name) + .map(LayerFileMetadata::from) + .unwrap_or(LayerFileMetadata::MISSING); + files.insert(layer_name.to_owned(), layer_metadata); + } + + let index_part_metadata = index_part.parse_metadata()?; + info!( + "initializing upload queue with remote index_part.disk_consistent_lsn: {}", + index_part_metadata.disk_consistent_lsn() + ); + + let state = UploadQueueInitialized { + latest_files: files, + latest_files_changes_since_metadata_upload_scheduled: 0, + latest_metadata: index_part_metadata.clone(), + last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(), + // what follows are boring default initializations + task_counter: 0, + num_inprogress_layer_uploads: 0, + num_inprogress_metadata_uploads: 0, + num_inprogress_deletions: 0, + inprogress_tasks: HashMap::new(), + queued_operations: VecDeque::new(), + }; + + *self = UploadQueue::Initialized(state); + Ok(self.initialized_mut().expect("we just set it")) + } + + pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + match self { + UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { + anyhow::bail!("queue is in state {}", self.as_str()) + } + UploadQueue::Initialized(x) => Ok(x), + } + } +} + +/// An in-progress upload or delete task. +#[derive(Debug)] +pub(crate) struct UploadTask { + /// Unique ID of this task. Used as the key in `inprogress_tasks` above. + pub(crate) task_id: u64, + pub(crate) retries: AtomicU32, + + pub(crate) op: UploadOp, +} + +#[derive(Debug)] +pub(crate) enum UploadOp { + /// Upload a layer file + UploadLayer(LayerFileName, LayerFileMetadata), + + /// Upload the metadata file + UploadMetadata(IndexPart, Lsn), + + /// Delete a file. + Delete(RemoteOpFileKind, LayerFileName), + + /// Barrier. When the barrier operation is reached, + Barrier(tokio::sync::watch::Sender<()>), +} + +impl std::fmt::Display for UploadOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + UploadOp::UploadLayer(path, metadata) => { + write!( + f, + "UploadLayer({}, size={:?})", + path.file_name(), + metadata.file_size() + ) + } + UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn), + UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()), + UploadOp::Barrier(_) => write!(f, "Barrier"), + } + } +} diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs deleted file mode 100644 index 70de713a26..0000000000 --- a/pageserver/src/tenant_mgr.rs +++ /dev/null @@ -1,427 +0,0 @@ -//! This module acts as a switchboard to access different repositories managed by this -//! page server. 
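Tying the queue above back to its producers: a hedged sketch of the call sequence that the compaction and GC changes earlier in this patch use on `RemoteTimelineClient`. Only methods that appear at those call sites are used; their exact signatures, and the mapping onto `UploadOp` variants, are inferred rather than quoted:

    // Illustrative only: publish one new layer and retire a set of old ones.
    async fn swap_layers_remotely(
        remote_client: &RemoteTimelineClient,
        new_layer: LayerFileName,
        new_layer_size: u64,
        old_layers: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        // Presumably queued as UploadOp::UploadLayer.
        remote_client
            .schedule_layer_file_upload(&new_layer, &LayerFileMetadata::new(new_layer_size))?;
        // Presumably queued as UploadOp::Delete, one entry per layer name.
        remote_client.schedule_layer_file_deletion(&old_layers)?;
        // Waits until everything scheduled above has finished; the Barrier op
        // exists for exactly this kind of synchronization point.
        remote_client.wait_completion().await?;
        Ok(())
    }

The index/metadata upload (`UploadOp::UploadMetadata`) is scheduled separately and, per the `queued_operations` comment above, only runs once the preceding layer uploads have completed.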
- -use std::collections::hash_map; -use std::ffi::OsStr; -use std::fs; -use std::path::Path; -use std::sync::Arc; - -use anyhow::Context; -use tracing::*; - -use remote_storage::GenericRemoteStorage; - -use crate::config::PageServerConf; -use crate::task_mgr::{self, TaskKind}; -use crate::tenant::{Tenant, TenantState}; -use crate::tenant_config::TenantConfOpt; - -use utils::fs_ext::PathExt; -use utils::id::{TenantId, TimelineId}; - -mod tenants_state { - use once_cell::sync::Lazy; - use std::{ - collections::HashMap, - sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}, - }; - use utils::id::TenantId; - - use crate::tenant::Tenant; - - static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); - - pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap>> { - TENANTS - .read() - .expect("Failed to read() tenants lock, it got poisoned") - } - - pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap>> { - TENANTS - .write() - .expect("Failed to write() tenants lock, it got poisoned") - } -} - -/// Initialize repositories with locally available timelines. -/// Timelines that are only partially available locally (remote storage has more data than this pageserver) -/// are scheduled for download and added to the tenant once download is completed. -pub fn init_tenant_mgr( - conf: &'static PageServerConf, - remote_storage: Option, -) -> anyhow::Result<()> { - let _entered = info_span!("init_tenant_mgr").entered(); - - // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; - let tenants_dir = conf.tenants_path(); - for dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))? - { - match &dir_entry { - Ok(dir_entry) => { - let tenant_dir_path = dir_entry.path(); - if crate::is_temporary(&tenant_dir_path) { - info!( - "Found temporary tenant directory, removing: {}", - tenant_dir_path.display() - ); - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path.display(), - e - ); - } - } else { - match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) { - Ok(Some(tenant)) => { - tenants_state::write_tenants().insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; - } - Ok(None) => { - // This case happens if we crash during attach before creating the attach marker file - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path.display() - ) - } - } - Err(e) => { - error!( - "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}", - tenants_dir.display(), - dir_entry, - e - ); - } - } - } - } - Err(e) => { - // On error, print it, but continue with the other tenants. If we error out - // here, the pageserver startup fails altogether, causing outage for *all* - // tenants. That seems worse. 
- error!( - "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}", - dir_entry, - tenants_dir.display(), - e, - ); - } - } - } - - info!("Processed {number_of_tenants} local tenants at startup"); - Ok(()) -} - -fn load_local_tenant( - conf: &'static PageServerConf, - tenant_path: &Path, - remote_storage: Option, -) -> anyhow::Result>> { - if !tenant_path.is_dir() { - anyhow::bail!("tenant_path is not a directory: {tenant_path:?}") - } - - let is_empty = tenant_path - .is_empty_dir() - .context("check whether tenant_path is an empty dir")?; - if is_empty { - info!("skipping empty tenant directory {tenant_path:?}"); - return Ok(None); - } - - let tenant_id = tenant_path - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .context("Could not parse tenant id out of the tenant dir name")?; - - let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { - info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); - if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, &remote_storage) - } else { - warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); - Tenant::create_broken_tenant(conf, tenant_id) - } - } else { - info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); - // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) - }; - Ok(Some(tenant)) -} - -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -pub async fn shutdown_all_tenants() { - let tenants_to_shut_down = { - let mut m = tenants_state::write_tenants(); - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) - } - } - drop(m); - tenants_to_shut_down - }; - - // Shut down all existing walreceiver connections and stop accepting the new ones. - task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - - // Ok, no background tasks running anymore. Flush any remaining data in - // memory to disk. - // - // We assume that any incoming connections that might request pages from - // the tenant have already been terminated by the caller, so there - // should be no more activity in any of the repositories. - // - // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { - let tenant_id = tenant.tenant_id(); - debug!("shutdown tenant {tenant_id}"); - - if let Err(err) = tenant.checkpoint().await { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); - } - } -} - -pub fn create_tenant( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: TenantId, - remote_storage: Option, -) -> anyhow::Result>> { - match tenants_state::write_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. 
- let tenant_directory = - super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?; - match created_tenant { - None => { - // We get None in case the directory is empty. - // This shouldn't happen here, because we just created the directory. - // So, skip any cleanup work for now, we don't know how we reached this state. - anyhow::bail!("we just created the tenant directory, it can't be empty"); - } - Some(tenant) => { - anyhow::ensure!( - tenant_id == tenant.tenant_id(), - "loaded created tenant has unexpected tenant id (expect {} != actual {})", - tenant_id, - tenant.tenant_id() - ); - v.insert(Arc::clone(&tenant)); - Ok(Some(tenant)) - } - } - } - } -} - -pub fn update_tenant_config( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> anyhow::Result<()> { - info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; - Ok(()) -} - -/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. -/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result> { - let m = tenants_state::read_tenants(); - let tenant = m - .get(&tenant_id) - .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?; - if active_only && !tenant.is_active() { - anyhow::bail!( - "Tenant {tenant_id} is not active. Current state: {:?}", - tenant.current_state() - ) - } else { - Ok(Arc::clone(tenant)) - } -} - -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { - // Start with the shutdown of timeline tasks (this shuts down the walreceiver) - // It is important that we do not take locks here, and do not check whether the timeline exists - // because if we hold tenants_state::write_tenants() while awaiting for the tasks to join - // we cannot create new timelines and tenants, and that can take quite some time, - // it can even become stuck due to a bug making whole pageserver unavailable for some operations - // so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation - // and then try to actually remove timeline from inmemory state and this is the point when concurrent requests - // will synchronize and either fail with the not found error or succeed - - debug!("waiting for wal receiver to shutdown"); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(tenant_id), - Some(timeline_id), - ) - .await; - debug!("wal receiver shutdown confirmed"); - - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await; - info!("timeline task shutdown completed"); - match get_tenant(tenant_id, true) { - Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; - } - Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), - } - - Ok(()) -} - -pub async fn detach_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, -) -> anyhow::Result<()> { - let tenant = match { - let mut tenants_accessor = tenants_state::write_tenants(); - tenants_accessor.remove(&tenant_id) - } { - Some(tenant) => tenant, - None => anyhow::bail!("Tenant not found for id 
{tenant_id}"), - }; - - tenant.set_stopping(); - // shutdown all tenant and timeline tasks: gc, compaction, page service) - task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - - // If removal fails there will be no way to successfully retry detach, - // because the tenant no longer exists in the in-memory map. And it needs to be removed from it - // before we remove files, because it contains references to tenant - // which references ephemeral files which are deleted on drop. So if we keep these references, - // we will attempt to remove files which no longer exist. This can be fixed by having shutdown - // mechanism for tenant that will clean temporary data to avoid any references to ephemeral files - let local_tenant_directory = conf.tenant_path(&tenant_id); - fs::remove_dir_all(&local_tenant_directory).with_context(|| { - format!( - "Failed to remove local tenant directory '{}'", - local_tenant_directory.display() - ) - })?; - - Ok(()) -} - -/// -/// Get list of tenants, for the mgmt API -/// -pub fn list_tenants() -> Vec<(TenantId, TenantState)> { - tenants_state::read_tenants() - .iter() - .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() -} - -/// Execute Attach mgmt API command. -/// -/// Downloading all the tenant data is performed in the background, this merely -/// spawns the background task and returns quickly. -pub async fn attach_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - remote_storage: &GenericRemoteStorage, -) -> anyhow::Result<()> { - match tenants_state::write_tenants().entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - // Cannot attach a tenant that already exists. The error message depends on - // the state it's in. - match e.get().current_state() { - TenantState::Attaching => { - anyhow::bail!("tenant {tenant_id} attach is already in progress") - } - current_state => { - anyhow::bail!("tenant already exists, current state: {current_state:?}") - } - } - } - hash_map::Entry::Vacant(v) => { - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); - v.insert(tenant); - Ok(()) - } - } -} - -#[cfg(feature = "testing")] -use { - crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, - utils::http::error::ApiError, -}; - -#[cfg(feature = "testing")] -pub fn immediate_gc( - tenant_id: TenantId, - timeline_id: TimelineId, - gc_req: TimelineGcRequest, -) -> Result>, ApiError> { - let guard = tenants_state::read_tenants(); - - let tenant = guard - .get(&tenant_id) - .map(Arc::clone) - .with_context(|| format!("Tenant {tenant_id} not found")) - .map_err(ApiError::NotFound)?; - - let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - - // Run in task_mgr to avoid race with detach operation - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); - let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, true) - .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. 
- match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), - } - Ok(()) - } - ); - - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); - - Ok(wait_task_done) -} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 46e4acd50c..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -12,7 +12,7 @@ //! use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME}; use once_cell::sync::OnceCell; -use std::fs::{File, OpenOptions}; +use std::fs::{self, File, OpenOptions}; use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write}; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; @@ -240,6 +240,10 @@ impl VirtualFile { self.with_file("fsync", |file| file.sync_all())? } + pub fn metadata(&self) -> Result { + self.with_file("metadata", |file| file.metadata())? + } + /// Helper function that looks up the underlying File for this VirtualFile, /// opening it and evicting some other File if necessary. It calls 'func' /// with the physical File. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index e8a2e99f06..031b80a6e0 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,9 +31,12 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; +use crate::tenant::PageReconstructResult; use crate::tenant::Timeline; +use crate::try_page_reconstruct_result as try_prr; use crate::walrecord::*; use crate::ZERO_PAGE; +use crate::{try_no_ondemand_download, try_page_reconstruct_result}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -52,14 +55,16 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result { + pub fn new(timeline: &Timeline, startpoint: Lsn) -> PageReconstructResult { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; - let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let checkpoint_bytes = try_no_ondemand_download!(timeline.get_checkpoint(startpoint)); + let checkpoint = try_page_reconstruct_result!( + CheckPoint::decode(&checkpoint_bytes).context("Failed to decode checkpoint bytes") + ); trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); - Ok(WalIngest { + PageReconstructResult::Success(WalIngest { timeline, checkpoint, checkpoint_modified: false, @@ -80,10 +85,12 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification, decoded: &mut DecodedWALRecord, - ) -> Result<()> { + ) -> PageReconstructResult<()> { modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version) - .context("failed decoding wal record")?; + try_prr!( + decode_wal_record(recdata, decoded, self.timeline.pg_version) + .context("failed decoding wal record") + ); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -98,7 +105,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded)?; + try_prr!(self.ingest_heapam_record(&mut buf, modification, decoded)); } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -106,13 +113,13 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create)?; + try_prr!(self.ingest_xlog_smgr_create(modification, &create)); } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate)?; + try_prr!(self.ingest_xlog_smgr_truncate(modification, &truncate)); } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( "handle RM_DBASE_ID for Postgres version {:?}", @@ -125,14 +132,14 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } else if self.timeline.pg_version == 15 { @@ -148,14 +155,14 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb)?; + try_prr!(self.ingest_xlog_dbase_create(modification, &createdb)); } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id)?; + try_prr!(modification.drop_dbdir(tablespace_id, dropdb.db_id)); } } } @@ -167,38 +174,38 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::Clog, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_clog_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, - )?; + )); } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED || info == pg_constants::XLOG_XACT_ABORT_PREPARED { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( + try_prr!(self.ingest_xact_record( modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - )?; + )); // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code trace!( "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", @@ -206,9 +213,10 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid)?; + try_prr!(modification.drop_twophase_file(parsed_xact.xid)); } else if info == pg_constants::XLOG_XACT_PREPARE { - modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; + try_prr!(modification + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))); } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -217,34 +225,34 @@ impl<'a> WalIngest<'a> { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactOffsets, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( + try_prr!(self.put_slru_page_image( modification, SlruKind::MultiXactMembers, segno, rpageno, ZERO_PAGE.clone(), - )?; + )); } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_create_record(modification, &xlrec)); } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec)?; + try_prr!(self.ingest_multixact_truncate_record(modification, &xlrec)); } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded)?; + try_prr!(self.ingest_relmap_page(modification, &xlrec, decoded)); } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -258,7 +266,9 @@ impl<'a> WalIngest<'a> { { let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT]; buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?; + let xlog_checkpoint = try_prr!( + CheckPoint::decode(&checkpoint_bytes).context("deserialize CheckPoint") + ); trace!( "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", xlog_checkpoint.oldestXid, @@ -279,22 +289,23 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk)?; + try_no_ondemand_download!(self.ingest_decoded_block(modification, lsn, decoded, blk)); } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { - let new_checkpoint_bytes = self.checkpoint.encode()?; + let new_checkpoint_bytes = + try_prr!(self.checkpoint.encode().context("encode checkpoint")); - modification.put_checkpoint(new_checkpoint_bytes)?; + try_prr!(modification.put_checkpoint(new_checkpoint_bytes)); self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + try_prr!(modification.commit()); - Ok(()) + PageReconstructResult::Success(()) } fn ingest_decoded_block( @@ -303,12 +314,12 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, - forknum: blk.forknum as u8, + forknum: blk.forknum, }; // @@ -323,7 +334,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !try_prr!(postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)) { // Extract page image from FPI record let img_len = blk.bimg_len as usize; @@ -345,15 +356,20 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?; + try_no_ondemand_download!(self.put_rel_page_image( + modification, + rel, + blk.blkno, + image.freeze() + )); } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec)?; + try_prr!(self.put_rel_wal_record(modification, rel, blk.blkno, rec)); } - Ok(()) + PageReconstructResult::Success(()) } fn ingest_heapam_record( @@ -505,7 +521,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, rec: &XlCreateDatabase, - ) -> Result<()> { + ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; let src_db_id = rec.src_db_id; @@ -520,14 +536,16 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn)?; + .list_rels(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .no_ondemand_download()?; modification.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; @@ -536,7 +554,10 @@ impl<'a> WalIngest<'a> { assert_eq!(src_rel.spcnode, src_tablespace_id); assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = modification.tline.get_rel_size(src_rel, req_lsn, true)?; + let nblocks = modification + .tline + .get_rel_size(src_rel, req_lsn, true) + .no_ondemand_download()?; let dst_rel = RelTag { spcnode: tablespace_id, dbnode: db_id, @@ -553,7 +574,8 @@ 
impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)?; + .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + .no_ondemand_download()?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; } @@ -657,7 +679,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, parsed: &XlXactParsedRecord, is_commit: bool, - ) -> Result<()> { + ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -713,7 +735,11 @@ impl<'a> WalIngest<'a> { relnode: xnode.relnode, }; let last_lsn = self.timeline.get_last_record_lsn(); - if modification.tline.get_rel_exists(rel, last_lsn, true)? { + if modification + .tline + .get_rel_exists(rel, last_lsn, true) + .no_ondemand_download()? + { self.put_rel_drop(modification, rel)?; } } @@ -725,7 +751,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification, xlrec: &XlClogTruncate, - ) -> Result<()> { + ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db @@ -767,7 +793,8 @@ impl<'a> WalIngest<'a> { let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn)? + .list_slru_segments(SlruKind::Clog, req_lsn) + .no_ondemand_download()? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { @@ -923,10 +950,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; - modification.put_rel_page_image(rel, blknum, img)?; - Ok(()) + ) -> PageReconstructResult<()> { + try_no_ondemand_download!(self.handle_rel_extend(modification, rel, blknum)); + try_prr!(modification.put_rel_page_image(rel, blknum, img)); + PageReconstructResult::Success(()) } fn put_rel_wal_record( @@ -936,7 +963,8 @@ impl<'a> WalIngest<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> Result<()> { - self.handle_rel_extend(modification, rel, blknum)?; + self.handle_rel_extend(modification, rel, blknum) + .no_ondemand_download()?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -946,7 +974,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, nblocks: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { modification.put_rel_truncation(rel, nblocks)?; Ok(()) } @@ -956,11 +984,17 @@ impl<'a> WalIngest<'a> { Ok(()) } - fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true)? { + fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { + let nblocks = if !self + .timeline + .get_rel_exists(rel, lsn, true) + .no_ondemand_download()? + { 0 } else { - self.timeline.get_rel_size(rel, lsn, true)? + self.timeline + .get_rel_size(rel, lsn, true) + .no_ondemand_download()? }; Ok(nblocks) } @@ -970,30 +1004,31 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification, rel: RelTag, blknum: BlockNumber, - ) -> Result<()> { + ) -> PageReconstructResult<()> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. 
// TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true)? { - // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0)?; - 0 - } else { - self.timeline.get_rel_size(rel, last_lsn, true)? - }; + let old_nblocks = + if !try_no_ondemand_download!(self.timeline.get_rel_exists(rel, last_lsn, true)) { + // create it with 0 size initially, the logic below will extend it + try_prr!(modification.put_rel_creation(rel, 0)); + 0 + } else { + try_no_ondemand_download!(self.timeline.get_rel_size(rel, last_lsn, true)) + }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks)?; + try_prr!(modification.put_rel_extend(rel, new_nblocks)); // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + try_prr!(modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())); } } - Ok(()) + PageReconstructResult::Success(()) } fn put_slru_page_image( @@ -1015,7 +1050,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> Result<()> { + ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1027,13 +1062,16 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn)? + .get_slru_segment_exists(kind, segno, last_lsn) + .no_ondemand_download()? { // create it with 0 size initially, the logic below will extend it modification.put_slru_segment_creation(kind, segno, 0)?; 0 } else { - self.timeline.get_slru_segment_size(kind, segno, last_lsn)? + self.timeline + .get_slru_segment_size(kind, segno, last_lsn) + .no_ondemand_download()? 
}; if new_nblocks > old_nblocks { @@ -1086,7 +1124,7 @@ mod tests { m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10))?; + let walingest = WalIngest::new(tline, Lsn(0x10)).no_ondemand_download()?; Ok(walingest) } @@ -1095,62 +1133,107 @@ mod tests { async fn test_relsize() -> Result<()> { let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A)?; - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .no_ondemand_download()?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .no_ondemand_download()?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x50)); + assert_current_logical_size(&tline, Lsn(0x50)); // The relation was created at LSN 2, not visible at LSN 1 yet. 
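The `.no_ondemand_download()` calls threaded through these hunks convert a page-reconstruction result that may ask for a layer download into a plain anyhow::Result, for contexts that do not perform the download themselves. The real PageReconstructResult, no_ondemand_download, and the try_prr!/try_no_ondemand_download! macros are defined elsewhere in the pageserver; the sketch below is only a guess at their shape, included to make the surrounding calls easier to read. with_ondemand_download(|| ...), used later in walreceiver_connection.rs, appears to be the async counterpart that performs the download and re-runs the closure.

use anyhow::anyhow;

// Hypothetical shape of the type behind `.no_ondemand_download()`; the actual
// definition in the pageserver may differ.
#[allow(dead_code)]
enum PageReconstructResult<T> {
    Success(T),
    // The requested data lives in a layer file that is not resident locally yet.
    NeedsDownload,
    Error(anyhow::Error),
}

impl<T> PageReconstructResult<T> {
    // Where a download cannot be performed (for example in these unit tests),
    // a pending download is surfaced as an ordinary error.
    fn no_ondemand_download(self) -> anyhow::Result<T> {
        match self {
            PageReconstructResult::Success(value) => Ok(value),
            PageReconstructResult::NeedsDownload => {
                Err(anyhow!("on-demand download required, but not allowed in this context"))
            }
            PageReconstructResult::Error(e) => Err(e),
        }
    }
}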
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); // Check page contents at each LSN assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 at 5") ); @@ -1158,23 +1241,39 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x60)); walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?; m.commit()?; - assert_current_logical_size(&*tline, Lsn(0x60)); + assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, 3); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG("foo blk 2 
at 5") ); @@ -1182,35 +1281,62 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x68)); walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false)?, 0); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x68), false) + .no_ondemand_download()?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false)?, 2); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(0x70), false) + .no_ondemand_download()?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .no_ondemand_download()?, ZERO_PAGE ); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .no_ondemand_download()?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + 1501 + ); for blk in 2..1500 { assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .no_ondemand_download()?, ZERO_PAGE ); } assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG("foo blk 1500") ); @@ -1223,15 +1349,27 @@ mod tests { async fn test_drop_extend() -> Result<()> { let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); @@ -1239,19 +1377,36 @@ mod tests { m.commit()?; // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30), false)?, false); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false) + .no_ondemand_download()?, + false + ); // FIXME: should fail //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30), false)?.is_none()); // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); - 
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .no_ondemand_download()?; m.commit()?; // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false)?, 1); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false) + .no_ondemand_download()?, + 1 + ); Ok(()) } @@ -1263,30 +1418,52 @@ mod tests { async fn test_truncate_extend() -> Result<()> { let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; let mut m = tline.begin_modification(Lsn(0x20)); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10), false)?, false); - assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10), false).is_err()); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download()?, + false + ); + assert!(tline + .get_rel_size(TESTREL_A, Lsn(0x10), false) + .no_ondemand_download() + .is_err()); - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1298,24 +1475,38 @@ mod tests { m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false)?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false) + .no_ondemand_download()?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } // should still see all blocks with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false)?, relsize); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false) + .no_ondemand_download()?, + relsize + ); for blkno in 0..relsize { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1326,18 
+1517,32 @@ mod tests { let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); - walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .no_ondemand_download()?; } m.commit()?; - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80), false)?, true); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false)?, relsize); + assert_eq!( + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + true + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false) + .no_ondemand_download()?, + relsize + ); // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( - tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)?, + tline + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .no_ondemand_download()?, TEST_IMG(&data) ); } @@ -1351,21 +1556,25 @@ mod tests { async fn test_large_rel() -> Result<()> { let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&*tline)?; + let mut walingest = init_walingest_test(&tline)?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?; + walingest + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .no_ondemand_download()?; m.commit()?; } - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE + 1 ); @@ -1374,8 +1583,13 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, RELSEG_SIZE); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, + RELSEG_SIZE + ); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate another block lsn += 0x10; @@ -1383,10 +1597,12 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, RELSEG_SIZE - 1 ); - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); // Truncate to 1500, and then truncate all the way down to 0, one block at a time // This tests the behavior at segment boundaries @@ -1397,13 +1613,15 @@ mod tests { walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false)?, + tline + .get_rel_size(TESTREL_A, Lsn(lsn), false) + .no_ondemand_download()?, size as BlockNumber ); size -= 1; } - assert_current_logical_size(&*tline, Lsn(lsn)); + assert_current_logical_size(&tline, Lsn(lsn)); Ok(()) } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 1fad91c836..aaf46579a7 100644 --- a/pageserver/src/walreceiver.rs +++ 
b/pageserver/src/walreceiver.rs @@ -6,7 +6,7 @@ //! hence WAL receiver needs to react on such events. //! //! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming. -//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically. +//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically. //! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other. //! Without this data, no WAL streaming is possible currently. //! @@ -26,57 +26,52 @@ mod walreceiver_connection; use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::{ensure, Context}; -use etcd_broker::Client; -use itertools::Itertools; +use anyhow::Context; use once_cell::sync::OnceCell; use std::future::Future; +use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tracing::*; -use url::Url; pub use connection_manager::spawn_connection_manager_task; -static ETCD_CLIENT: OnceCell = OnceCell::new(); +static BROKER_CLIENT: OnceCell = OnceCell::new(); /// -/// Initialize the etcd client. This must be called once at page server startup. +/// Initialize the broker client. This must be called once at page server startup. /// -pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let etcd_endpoints = conf.broker_endpoints.clone(); - ensure!( - !etcd_endpoints.is_empty(), - "Cannot start wal receiver: etcd endpoints are empty" - ); +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); - let etcd_client = Client::connect(etcd_endpoints.clone(), None) - .await - .context("Failed to connect to etcd")?; + // Note: we do not attempt connecting here (but validate endpoints sanity). + let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; - // FIXME: Should we still allow the pageserver to start, if etcd - // doesn't work? It could still serve GetPage requests, with the - // data it has locally and from what it can download from remote - // storage - if ETCD_CLIENT.set(etcd_client).is_err() { - panic!("etcd already initialized"); + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); } info!( - "Initialized etcd client with endpoints: {}", - etcd_endpoints.iter().map(Url::to_string).join(", ") + "Initialized broker client with endpoints: {}", + broker_endpoint ); Ok(()) } /// -/// Get a handle to the etcd client +/// Get a handle to the broker client /// -pub fn get_etcd_client() -> &'static etcd_broker::Client { - ETCD_CLIENT.get().expect("etcd client not initialized") +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") } -pub fn is_etcd_client_initialized() -> bool { - ETCD_CLIENT.get().is_some() +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() } /// A handle of an asynchronous task. 
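The hunk above replaces the etcd client global with a OnceCell-guarded broker client that is built once at pageserver startup and borrowed everywhere else. A minimal, self-contained sketch of that init-once/get-anywhere pattern follows; BrokerClient and the endpoint string are placeholders rather than the real storage_broker types, and the sketch assumes the once_cell crate is available.

use once_cell::sync::OnceCell;

// Placeholder for storage_broker::BrokerClientChannel; only the init-once pattern matters here.
#[derive(Clone)]
struct BrokerClient {
    endpoint: String,
}

static BROKER_CLIENT: OnceCell<BrokerClient> = OnceCell::new();

// Mirrors the pattern of init_broker_client() above: called exactly once at startup,
// a second call is treated as a programming error.
fn init_broker_client(endpoint: &str) {
    let client = BrokerClient {
        endpoint: endpoint.to_owned(),
    };
    if BROKER_CLIENT.set(client).is_err() {
        panic!("broker client already initialized");
    }
}

// Later callers borrow the shared client; in the patch, each connection manager
// task then clones the channel it needs (get_broker_client().clone()).
fn get_broker_client() -> &'static BrokerClient {
    BROKER_CLIENT.get().expect("broker client not initialized")
}

fn main() {
    init_broker_client("http://broker.example:50051"); // placeholder endpoint
    println!("broker endpoint: {}", get_broker_client().endpoint);
}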
@@ -134,15 +129,21 @@ impl TaskHandle { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), Err(_task_channel_part_dropped) => { - TaskEvent::End(match self.join_handle.take() { + TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { warn!("sender is dropped while join handle is still alive"); } - jh.await + let res = jh + .await .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x) + .and_then(|x| x); + + // For cancellation-safety, drop join_handle only after successful .await. + self.join_handle = None; + + res } None => { // Another option is to have an enum, join handle or result and give away the reference to it diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index c598f20b10..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -1,21 +1,15 @@ //! WAL receiver logic that ensures the pageserver gets connectected to safekeeper, //! that contains the latest WAL to stream and this connection does not go stale. //! -//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it, +//! To achieve that, a storage broker is used: safekepers propagate their timelines' state in it, //! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection. //! Current connection state is tracked too, to ensure it's not getting stale. //! -//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, +//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader, //! then a [re]connection happens, if necessary. -//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel. +//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel. 
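The rewritten module comment above describes the control flow the rest of this file implements: subscribe to the storage broker, fold safekeeper updates into a candidate set, and reconnect whenever a better candidate appears. The rough sketch below is not the pageserver's actual code; it only illustrates that shape, with a tokio mpsc channel standing in for the broker's gRPC stream and a plain timer standing in for connection-task events and retry deadlines, and it assumes the tokio crate with the macros, sync, and time features.

use std::time::Duration;
use tokio::{select, sync::mpsc, time};

// Stand-in for a SafekeeperTimelineInfo update: (safekeeper node id, commit_lsn).
type BrokerUpdate = (u64, u64);

async fn manager_loop(mut broker_updates: mpsc::Receiver<BrokerUpdate>) {
    let mut best: Option<BrokerUpdate> = None;
    loop {
        select! {
            // A broker update may change which safekeeper holds the most WAL.
            maybe_update = broker_updates.recv() => match maybe_update {
                Some((sk_id, commit_lsn)) => {
                    if best.map_or(true, |(_, best_lsn)| commit_lsn > best_lsn) {
                        best = Some((sk_id, commit_lsn));
                        // The real manager would (re)connect to this safekeeper here.
                    }
                }
                // The real code re-subscribes with exponential backoff instead of exiting.
                None => break,
            },
            // Stand-in for walreceiver task events and per-safekeeper retry timers.
            _ = time::sleep(Duration::from_secs(10)) => {
                // Re-check candidates, drop stale entries, retry failed safekeepers.
            }
        }
    }
}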
-use std::{ - collections::{hash_map, HashMap}, - num::NonZeroU64, - ops::ControlFlow, - sync::Arc, - time::Duration, -}; +use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; use crate::task_mgr::TaskKind; use crate::task_mgr::WALRECEIVER_RUNTIME; @@ -23,16 +17,18 @@ use crate::tenant::Timeline; use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; -use etcd_broker::{ - subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription, - BrokerUpdate, Client, -}; use pageserver_api::models::TimelineState; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::BrokerClientChannel; +use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; use crate::{ - exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS, + exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; use postgres_connection::{parse_host_port, PgConnectionConfig}; @@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle}; /// Spawns the loop to take care of the timeline's WAL streaming connection. pub fn spawn_connection_manager_task( - broker_loop_prefix: String, timeline: Arc, wal_connect_timeout: Duration, lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, ) { - let mut etcd_client = get_etcd_client().clone(); + let mut broker_client = get_broker_client().clone(); let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; @@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task( &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { - info!("WAL receiver broker started, connecting to etcd"); + info!("WAL receiver manager started, connecting to broker"); let mut walreceiver_state = WalreceiverState::new( timeline, wal_connect_timeout, @@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task( return Ok(()); }, loop_step_result = connection_manager_loop_step( - &broker_loop_prefix, - &mut etcd_client, + &mut broker_client, &mut walreceiver_state, ) => match loop_step_result { ControlFlow::Continue(()) => continue, @@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task( /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. -/// If etcd subscription is cancelled, exits. +/// If storage broker subscription is cancelled, exits. async fn connection_manager_loop_step( - broker_prefix: &str, - etcd_client: &mut Client, + broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -124,13 +117,11 @@ async fn connection_manager_loop_step( timeline_id: walreceiver_state.timeline.timeline_id, }; - // XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go, - // running the entire loop step as much as possible to an end. 
- // The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end, - // forcing the etcd subscription to exit either way. - let mut broker_subscription = - subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await; - info!("Subscribed for etcd timeline changes, waiting for new etcd data"); + // Subscribe to the broker updates. Stream shares underlying TCP connection + // with other streams on this client (other connection managers). When + // object goes out of scope, stream finishes in drop() automatically. + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + info!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = walreceiver_state.time_until_next_retry(); @@ -145,12 +136,6 @@ async fn connection_manager_loop_step( // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently select! { - broker_connection_result = &mut broker_subscription.watcher_handle => { - info!("Broker connection was closed from the other side, ending current broker loop step"); - cleanup_broker_connection(broker_connection_result, walreceiver_state); - return ControlFlow::Continue(()); - }, - Some(wal_connection_update) = async { match walreceiver_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -160,21 +145,17 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! guard"); match wal_connection_update { - TaskEvent::Update(c) => { - match c { - TaskStateUpdate::Init | TaskStateUpdate::Started => {}, - TaskStateUpdate::Progress(status) => { - if status.has_processed_wal { - // We have advanced last_record_lsn by processing the WAL received - // from this safekeeper. This is good enough to clean unsuccessful - // retries history and allow reconnecting to this safekeeper without - // sleeping for a long time. - walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); - } - wal_connection.status = status.to_owned(); - } + TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { + if new_status.has_processed_wal { + // We have advanced last_record_lsn by processing the WAL received + // from this safekeeper. This is good enough to clean unsuccessful + // retries history and allow reconnecting to this safekeeper without + // sleeping for a long time. + walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id); } - }, + wal_connection.status = new_status; + } TaskEvent::End(walreceiver_task_result) => { match walreceiver_task_result { Ok(()) => debug!("WAL receiving task finished"), @@ -185,22 +166,16 @@ async fn connection_manager_loop_step( } }, - // Got a new update from etcd - broker_update = broker_subscription.value_updates.recv() => { + // Got a new update from the broker + broker_update = broker_subscription.message() => { match broker_update { - Some(broker_update) => walreceiver_state.register_timeline_update(broker_update), - None => { - info!("Broker sender end was dropped, ending current broker loop step"); - // Ensure to cancel and wait for the broker subscription task end, to log its result. 
- // Broker sender end is in the broker subscription task and its drop means abnormal task completion. - // First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times). - broker_subscription.watcher_handle.abort(); - // Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case), - // a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway. - cleanup_broker_connection( - (&mut broker_subscription.watcher_handle).await, - walreceiver_state, - ); + Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update), + Err(e) => { + error!("broker subscription failed: {e}"); + return ControlFlow::Continue(()); + } + Ok(None) => { + error!("broker subscription stream ended"); // can't happen return ControlFlow::Continue(()); } } @@ -231,18 +206,18 @@ async fn connection_manager_loop_step( } }, - _ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {} - } - - // Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly. - let mut max_events_to_poll = 100_u32; - while max_events_to_poll > 0 { - if let Ok(broker_update) = broker_subscription.value_updates.try_recv() { - walreceiver_state.register_timeline_update(broker_update); - max_events_to_poll -= 1; - } else { - break; - } + Some(()) = async { + match time_until_next_retry { + Some(sleep_time) => { + tokio::time::sleep(sleep_time).await; + Some(()) + }, + None => { + debug!("No candidates to retry, waiting indefinitely for the broker events"); + None + } + } + } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), } if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { @@ -285,33 +260,11 @@ async fn wait_for_active_timeline( } } -fn cleanup_broker_connection( - broker_connection_result: Result, tokio::task::JoinError>, - walreceiver_state: &mut WalreceiverState, -) { - match broker_connection_result { - Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"), - Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"), - Err(abort_error) => { - if abort_error.is_panic() { - error!("Broker connection panicked: {abort_error}") - } else { - debug!("Broker connection aborted: {abort_error}") - } - } - } - - walreceiver_state.wal_stream_candidates.clear(); -} - /// Endlessly try to subscribe for broker updates for a given timeline. -/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly. -/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway. 
async fn subscribe_for_timeline_updates( - etcd_client: &mut Client, - broker_prefix: &str, + broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> BrokerSubscription { +) -> Streaming { let mut attempt = 0; loop { exponential_backoff( @@ -322,18 +275,21 @@ async fn subscribe_for_timeline_updates( .await; attempt += 1; - match etcd_broker::subscribe_for_json_values( - etcd_client, - SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id), - ) - .instrument(info_span!("etcd_subscription")) - .await - { - Ok(new_subscription) => { - return new_subscription; + // subscribe to the specific timeline + let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SubscribeSafekeeperInfoRequest { + subscription_key: Some(key), + }; + + match broker_client.subscribe_safekeeper_info(request).await { + Ok(resp) => { + return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}"); + warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -360,8 +316,8 @@ struct WalreceiverState { wal_connection: Option, /// Info about retries and unsuccessful attempts to connect to safekeepers. wal_connection_retries: HashMap, - /// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id. - wal_stream_candidates: HashMap, + /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id. + wal_stream_candidates: HashMap, auth_token: Option>, } @@ -395,13 +351,11 @@ struct RetryInfo { retry_duration_seconds: f64, } -/// Data about the timeline to connect to, received from etcd. +/// Data about the timeline to connect to, received from the broker. #[derive(Debug)] -struct EtcdSkTimeline { - timeline: SkTimelineInfo, - /// Etcd generation, the bigger it is, the more up to date the timeline data is. - etcd_version: i64, - /// Time at which the data was fetched from etcd last time, to track the stale data. +struct BrokerSkTimeline { + timeline: SafekeeperTimelineInfo, + /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } @@ -453,7 +407,7 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id)) + .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) }); let now = Utc::now().naive_utc(); @@ -533,36 +487,28 @@ impl WalreceiverState { .values() .filter_map(|retry| retry.next_retry_at) .filter(|next_retry_at| next_retry_at > &now) - .min(); + .min()?; - next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok()) + (next_retry_at - now).to_std().ok() } - /// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key. 
- fn register_timeline_update(&mut self, timeline_update: BrokerUpdate) { - match self - .wal_stream_candidates - .entry(timeline_update.key.node_id) - { - hash_map::Entry::Occupied(mut o) => { - let existing_value = o.get_mut(); - if existing_value.etcd_version < timeline_update.etcd_version { - existing_value.etcd_version = timeline_update.etcd_version; - existing_value.timeline = timeline_update.value; - existing_value.latest_update = Utc::now().naive_utc(); - } - } - hash_map::Entry::Vacant(v) => { - v.insert(EtcdSkTimeline { - timeline: timeline_update.value, - etcd_version: timeline_update.etcd_version, - latest_update: Utc::now().naive_utc(), - }); - } + /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. + fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); + let old_entry = self.wal_stream_candidates.insert( + new_safekeeper_id, + BrokerSkTimeline { + timeline: timeline_update, + latest_update: Utc::now().naive_utc(), + }, + ); + + if old_entry.is_none() { + info!("New SK node was added: {new_safekeeper_id}"); } } - /// Cleans up stale etcd records and checks the rest for the new connection candidate. + /// Cleans up stale broker records and checks the rest for the new connection candidate. /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise. /// The current rules for approving new candidates: /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps @@ -585,7 +531,7 @@ impl WalreceiverState { Some(existing_wal_connection) => { let connected_sk_node = existing_wal_connection.sk_id; - let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; let now = Utc::now().naive_utc(); @@ -614,7 +560,7 @@ impl WalreceiverState { } if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn { - let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Check if the new candidate has much more WAL than the current one. match new_commit_lsn.0.checked_sub(current_commit_lsn.0) { Some(new_sk_lsn_advantage) => { @@ -644,7 +590,7 @@ impl WalreceiverState { .status .commit_lsn .unwrap_or(current_lsn); - let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0)); + let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn); // Keep discovered_new_wal only if connected safekeeper has not caught up yet. let mut discovered_new_wal = existing_wal_connection @@ -727,7 +673,7 @@ impl WalreceiverState { None } - /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers. + /// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers. /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another. 
/// /// The candidate that is chosen: @@ -736,7 +682,7 @@ impl WalreceiverState { fn select_connection_candidate( &self, node_to_omit: Option, - ) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -746,12 +692,12 @@ impl WalreceiverState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates .iter() - .filter(|(_, info)| info.timeline.commit_lsn.is_some()) + .filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID) .filter(move |(sk_id, _)| { let next_retry_at = self .wal_connection_retries @@ -761,12 +707,14 @@ impl WalreceiverState { }); next_retry_at.is_none() || next_retry_at.unwrap() <= now - }) - .filter_map(|(sk_id, etcd_info)| { - let info = &etcd_info.timeline; + }).filter_map(|(sk_id, broker_info)| { + let info = &broker_info.timeline; + if info.safekeeper_connstr.is_empty() { + return None; // no connection string, ignore sk + } match wal_stream_connection_config( self.id, - info.safekeeper_connstr.as_deref()?, + info.safekeeper_connstr.as_ref(), match &self.auth_token { None => None, Some(x) => Some(x), @@ -781,15 +729,16 @@ impl WalreceiverState { }) } - /// Remove candidates which haven't sent etcd updates for a while. + /// Remove candidates which haven't sent broker updates for a while. fn cleanup_old_candidates(&mut self) { let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len()); + let lagging_wal_timeout = self.lagging_wal_timeout; - self.wal_stream_candidates.retain(|node_id, etcd_info| { - if let Ok(time_since_latest_etcd_update) = - (Utc::now().naive_utc() - etcd_info.latest_update).to_std() + self.wal_stream_candidates.retain(|node_id, broker_info| { + if let Ok(time_since_latest_broker_update) = + (Utc::now().naive_utc() - broker_info.latest_update).to_std() { - let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout; + let should_retain = time_since_latest_broker_update < lagging_wal_timeout; if !should_retain { node_ids_to_remove.push(*node_id); } @@ -799,8 +748,11 @@ impl WalreceiverState { } }); - for node_id in node_ids_to_remove { - self.wal_connection_retries.remove(&node_id); + if !node_ids_to_remove.is_empty() { + for node_id in node_ids_to_remove { + info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"); + self.wal_connection_retries.remove(&node_id); + } } } @@ -853,7 +805,7 @@ fn wal_stream_connection_config( auth_token: Option<&str>, ) -> anyhow::Result { let (host, port) = - parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; + parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?; let port = port.unwrap_or(5432); Ok(PgConnectionConfig::new_host_port(host, port) .extend_options([ @@ -870,6 +822,28 @@ mod tests { use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; use url::Host; + fn dummy_broker_sk_timeline( + commit_lsn: u64, + safekeeper_connstr: &str, + latest_update: NaiveDateTime, + ) -> BrokerSkTimeline { + BrokerSkTimeline { + timeline: SafekeeperTimelineInfo { + safekeeper_id: 0, + tenant_timeline_id: None, + last_log_term: 0, + flush_lsn: 0, + commit_lsn, + backup_lsn: 0, + 
remote_consistent_lsn: 0, + peer_horizon_lsn: 0, + local_start_lsn: 0, + safekeeper_connstr: safekeeper_connstr.to_owned(), + }, + latest_update, + } + } + #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { let harness = TenantHarness::create("no_connection_no_candidate")?; @@ -881,74 +855,16 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([ - ( - NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: now, - }, - ), - ( - NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("no_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, - ), - ( - NodeId(2), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: None, - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: Some("no_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, - ), + (NodeId(0), dummy_broker_sk_timeline(1, "", now)), + (NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)), + (NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)), ( NodeId(3), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: delay_over_threshold, - }, + dummy_broker_sk_timeline( + 1 + state.max_lsn_wal_lag.get(), + "delay_over_threshold", + delay_over_threshold, + ), ), ]); @@ -983,10 +899,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -995,57 +911,23 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( connected_sk_id, - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline( + current_lsn + state.max_lsn_wal_lag.get() * 2, + DUMMY_SAFEKEEPER_HOST, + now, + ), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("not_advanced_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now), ), ( NodeId(2), - EtcdSkTimeline { - timeline: 
SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline( + current_lsn + state.max_lsn_wal_lag.get() / 2, + "not_enough_advanced_lsn", + now, + ), ), ]); @@ -1067,21 +949,7 @@ mod tests { state.wal_connection = None; state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now), )]); let only_candidate = state @@ -1102,57 +970,15 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn - 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("smaller_commit_lsn".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(2), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(Lsn(selected_lsn + 100)), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: None, - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(selected_lsn + 100, "", now), ), ]); let biggest_wal_candidate = state.next_connection_candidate().expect( @@ -1186,39 +1012,11 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(bigger_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ]); state.wal_connection_retries = HashMap::from([( @@ -1263,10 +1061,10 @@ mod tests { state.wal_connection = 
Some(WalConnection { started_at: now, sk_id: connected_sk_id, - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1275,39 +1073,11 @@ mod tests { state.wal_stream_candidates = HashMap::from([ ( connected_sk_id, - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), ), ( NodeId(1), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now), ), ]); @@ -1356,10 +1126,10 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), - status: connection_status.clone(), + status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender - .send(TaskStateUpdate::Progress(connection_status.clone())) + .send(TaskStateUpdate::Progress(connection_status)) .ok(); Ok(()) }), @@ -1367,21 +1137,7 @@ mod tests { }); state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(current_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), )]); let over_threshcurrent_candidate = state.next_connection_candidate().expect( @@ -1441,21 +1197,7 @@ mod tests { }); state.wal_stream_candidates = HashMap::from([( NodeId(0), - EtcdSkTimeline { - timeline: SkTimelineInfo { - last_log_term: None, - flush_lsn: None, - commit_lsn: Some(new_lsn), - backup_lsn: None, - remote_consistent_lsn: None, - peer_horizon_lsn: None, - local_start_lsn: None, - - safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()), - }, - etcd_version: 0, - latest_update: now, - }, + dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now), )]); let over_threshcurrent_candidate = state.next_connection_candidate().expect( diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs index cf2a99f1b5..3753807327 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/walreceiver/walreceiver_connection.rs @@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use crate::{ + metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate, +}; use crate::{ task_mgr, task_mgr::TaskKind, @@ -35,7 +37,7 @@ use 
pq_proto::ReplicationFeedback; use utils::lsn::Lsn; /// Status of the connection. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Copy)] pub struct WalConnectionStatus { /// If we were able to initiate a postgres connection, this means that safekeeper process is at least running. pub is_connected: bool, @@ -83,7 +85,7 @@ pub async fn handle_walreceiver_connection( streaming_lsn: None, commit_lsn: None, }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}"); return Ok(()); } @@ -135,7 +137,7 @@ pub async fn handle_walreceiver_connection( connection_status.latest_connection_update = Utc::now().naive_utc(); connection_status.latest_wal_update = Utc::now().naive_utc(); connection_status.commit_lsn = Some(end_of_wal); - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}"); return Ok(()); } @@ -173,7 +175,8 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?; + let mut walingest = + with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?; while let Some(replication_message) = { select! { @@ -184,7 +187,20 @@ pub async fn handle_walreceiver_connection( replication_message = physical_stream.next() => replication_message, } } { - let replication_message = replication_message?; + let replication_message = match replication_message { + Ok(message) => message, + Err(replication_error) => { + if replication_error.is_closed() { + info!("Replication stream got closed"); + return Ok(()); + } else { + return Err( + anyhow::Error::new(replication_error).context("replication stream error") + ); + } + } + }; + let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -207,7 +223,7 @@ pub async fn handle_walreceiver_connection( } &_ => {} }; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -235,9 +251,16 @@ pub async fn handle_walreceiver_connection( // at risk of hitting a deadlock. ensure!(lsn.is_aligned()); - walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) - .context("could not ingest record at {lsn}")?; + with_ondemand_download(|| { + walingest.ingest_record( + recdata.clone(), + lsn, + &mut modification, + &mut decoded, + ) + }) + .await + .with_context(|| format!("could not ingest record at {lsn}"))?; fail_point!("walreceiver-after-ingest"); @@ -273,8 +296,7 @@ pub async fn handle_walreceiver_connection( if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg { // We have successfully processed at least one WAL record. 
connection_status.has_processed_wal = true; - if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) - { + if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) { warn!("Wal connection event listener dropped, aborting the connection: {e}"); return Ok(()); } @@ -313,10 +335,11 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. + let (timeline_logical_size, _) = timeline + .get_current_logical_size() + .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { - current_timeline_size: timeline - .get_current_logical_size() - .context("Status update creation failed to get current logical size")?, + current_timeline_size: timeline_logical_size, ps_writelsn: write_lsn, ps_flushlsn: flush_lsn, ps_applylsn: apply_lsn, diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index 38fb9a4247..7581140934 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -1,6 +1,7 @@ //! //! Functions for parsing WAL records. //! + use anyhow::Result; use bytes::{Buf, Bytes}; use postgres_ffi::pg_constants; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index dfcd49f5c2..7cf489562b 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, ) -> Result; @@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager { &self, key: Key, lsn: Lsn, - base_img: Option, + base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, ) -> Result { @@ -156,7 +156,8 @@ impl WalRedoManager for PostgresRedoManager { return Err(WalRedoError::InvalidRequest); } - let mut img: Option = base_img; + let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); + let mut img = base_img.map(|p| p.1); let mut batch_neon = can_apply_in_neon(&records[0].1); let mut batch_start = 0; for i in 1..records.len() { @@ -170,6 +171,7 @@ impl WalRedoManager for PostgresRedoManager { key, lsn, img, + base_img_lsn, &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, @@ -189,6 +191,7 @@ impl WalRedoManager for PostgresRedoManager { key, lsn, img, + base_img_lsn, &records[batch_start..], self.conf.wal_redo_timeout, pg_version, @@ -223,11 +226,13 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + #[allow(clippy::too_many_arguments)] fn apply_batch_postgres( &self, key: Key, lsn: Lsn, base_img: Option, + base_img_lsn: Lsn, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, @@ -282,9 +287,12 @@ impl PostgresRedoManager { // next request will launch a new one. 
if result.is_err() { error!( - "error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}", + "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}", records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), nbytes, + base_img_lsn, lsn ); let process = process_guard.take().unwrap(); @@ -401,7 +409,7 @@ impl PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -451,7 +459,7 @@ impl PostgresRedoManager { key ); for &xid in xids { - let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -639,7 +647,7 @@ impl PostgresRedoProcess { info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) - .args(&["-D", &datadir.to_string_lossy()]) + .args(["-D", &datadir.to_string_lossy()]) .arg("-N") .env_clear() .env("LD_LIBRARY_PATH", &pg_lib_dir_path) @@ -922,8 +930,7 @@ impl NoLeakChild { match child.wait() { Ok(exit_status) => { - // log at error level since .kill() is something we only do on errors ATM - error!(exit_status = %exit_status, "wait successful"); + info!(exit_status = %exit_status, "wait successful"); } Err(e) => { error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 7f4e30a12e..ec377dbb1e 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,11 +4,12 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + file_cache.o \ libpagestore.o \ libpqwalproposer.o \ + neon.o \ pagestore_smgr.o \ relsize_cache.o \ - neon.o \ walproposer.o \ walproposer_utils.o diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c new file mode 100644 index 0000000000..96c2461e2d --- /dev/null +++ b/pgxn/neon/file_cache.c @@ -0,0 +1,597 @@ +/* + * + * file_cache.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * pgxn/neon/file_cache.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include + +#include "postgres.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "pagestore_client.h" +#include "access/parallel.h" +#include "postmaster/bgworker.h" +#include "storage/relfilenode.h" +#include "storage/buf_internals.h" +#include "storage/latch.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "utils/dynahash.h" +#include "utils/guc.h" +#include "storage/fd.h" +#include "storage/pg_shmem.h" +#include "storage/buf_internals.h" + +/* + * Local file cache is used to temporary store relations pages in local file system. + * All blocks of all relations are stored inside one file and addressed using shared hash map. + * Currently LRU eviction policy based on L2 list is used as replacement algorithm. + * As far as manipulation of L2-list requires global critical section, we are not using partitioned hash. 
+ * Also we use an exclusive lock even for read operations, because the LRU requires relinking the element in the L2 list.
+ * If this lock becomes a bottleneck, we can consider other eviction strategies, for example a clock algorithm.
+ *
+ * The cache is always reconstructed at node startup, so we do not need to save the mapping anywhere or worry about
+ * its consistency.
+ */
+
+/* Local file storage allocation chunk.
+ * It should be a power of two and not less than 32. Using chunks larger than a page can:
+ * 1. Reduce the hash-map memory footprint: an 8TB database contains a billion pages,
+ *    and the size of a hash entry is 40 bytes, so we would need 40GB just for the hash map.
+ *    1MB chunks reduce the hash map size to 320MB.
+ * 2. Improve access locality: subsequent pages are allocated together, improving seqscan speed.
+ */
+#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
+#define MB ((uint64)1024*1024)
+
+#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
+
+typedef struct FileCacheEntry
+{
+    BufferTag key;
+    uint32 offset;
+    uint32 access_count;
+    uint32 bitmap[BLOCKS_PER_CHUNK/32];
+    dlist_node lru_node; /* LRU list node */
+} FileCacheEntry;
+
+typedef struct FileCacheControl
+{
+    uint32 size; /* size of cache file in chunks */
+    dlist_head lru; /* double linked list for LRU replacement algorithm */
+} FileCacheControl;
+
+static HTAB* lfc_hash;
+static int lfc_desc;
+static LWLockId lfc_lock;
+static int lfc_max_size;
+static int lfc_size_limit;
+static char* lfc_path;
+static FileCacheControl* lfc_ctl;
+static shmem_startup_hook_type prev_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
+#endif
+
+static void
+lfc_shmem_startup(void)
+{
+    bool found;
+    static HASHCTL info;
+
+    if (prev_shmem_startup_hook)
+    {
+        prev_shmem_startup_hook();
+    }
+
+    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+    lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
+    if (!found)
+    {
+        uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
+        lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
+        info.keysize = sizeof(BufferTag);
+        info.entrysize = sizeof(FileCacheEntry);
+        lfc_hash = ShmemInitHash("lfc_hash",
+                                 /* lfc_size+1 because we add a new element to the hash table before evicting the victim */
+                                 lfc_size+1, lfc_size+1,
+                                 &info,
+                                 HASH_ELEM | HASH_BLOBS);
+        lfc_ctl->size = 0;
+        dlist_init(&lfc_ctl->lru);
+
+        /* Remove file cache on restart */
+        (void)unlink(lfc_path);
+    }
+    LWLockRelease(AddinShmemInitLock);
+}
+
+static void
+lfc_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+    if (prev_shmem_request_hook)
+        prev_shmem_request_hook();
+#endif
+
+    RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
+    RequestNamedLWLockTranche("lfc_lock", 1);
+}
+
+bool
+lfc_check_limit_hook(int *newval, void **extra, GucSource source)
+{
+    if (*newval > lfc_max_size)
+    {
+        elog(ERROR, "neon.file_cache_size_limit cannot be larger than neon.max_file_cache_size");
+        return false;
+    }
+    return true;
+}
+
+void
+lfc_change_limit_hook(int newval, void *extra)
+{
+    uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
+    /*
+     * The stats collector detaches from shared memory, so we should not try to access shared memory here.
+     * Parallel workers first assign the default value (0), so do not perform truncation in parallel workers.
+     */
+    if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
+        return;
+
+    /* Open cache file if not done yet */
+    if (lfc_desc == 0)
+    {
+        lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+        if (lfc_desc < 0) {
+            elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+            lfc_size_limit = 0; /* disable file cache */
+            return;
+        }
+    }
+    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+    while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
+    {
+        /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
+        FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
+        Assert(victim->access_count == 0);
+#ifdef FALLOC_FL_PUNCH_HOLE
+        if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
+            elog(LOG, "Failed to punch hole in file: %m");
+#endif
+        hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
+        lfc_ctl->size -= 1;
+    }
+    elog(LOG, "set local file cache limit to %d", new_size);
+    LWLockRelease(lfc_lock);
+}
+
+void
+lfc_init(void)
+{
+    /*
+     * In order to create our shared memory area, we have to be loaded via
+     * shared_preload_libraries.
+     */
+    if (!process_shared_preload_libraries_in_progress)
+        elog(ERROR, "Neon module should be loaded via shared_preload_libraries");
+
+    DefineCustomIntVariable("neon.max_file_cache_size",
+                            "Maximal size of Neon local file cache",
+                            NULL,
+                            &lfc_max_size,
+                            0, /* disabled by default */
+                            0,
+                            INT_MAX,
+                            PGC_POSTMASTER,
+                            GUC_UNIT_MB,
+                            NULL,
+                            NULL,
+                            NULL);
+
+    DefineCustomIntVariable("neon.file_cache_size_limit",
+                            "Current limit for size of Neon local file cache",
+                            NULL,
+                            &lfc_size_limit,
+                            0, /* disabled by default */
+                            0,
+                            INT_MAX,
+                            PGC_SIGHUP,
+                            GUC_UNIT_MB,
+                            NULL,
+                            lfc_change_limit_hook,
+                            NULL);
+
+    DefineCustomStringVariable("neon.file_cache_path",
+                               "Path to local file cache (can be raw device)",
+                               NULL,
+                               &lfc_path,
+                               "file.cache",
+                               PGC_POSTMASTER,
+                               0,
+                               NULL,
+                               NULL,
+                               NULL);
+
+    if (lfc_max_size == 0)
+        return;
+
+    prev_shmem_startup_hook = shmem_startup_hook;
+    shmem_startup_hook = lfc_shmem_startup;
+#if PG_VERSION_NUM>=150000
+    prev_shmem_request_hook = shmem_request_hook;
+    shmem_request_hook = lfc_shmem_request;
+#else
+    lfc_shmem_request();
+#endif
+}
+
+/*
+ * Check if page is present in the cache.
+ * Returns true if page is found in local cache.
+ */
+bool
+lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+{
+    BufferTag tag;
+    FileCacheEntry* entry;
+    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+    bool found;
+    uint32 hash;
+
+    if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+        return false;
+
+    tag.rnode = rnode;
+    tag.forkNum = forkNum;
+    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+    hash = get_hash_value(lfc_hash, &tag);
+
+    LWLockAcquire(lfc_lock, LW_SHARED);
+    entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+    found = entry != NULL && (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) != 0;
+    LWLockRelease(lfc_lock);
+    return found;
+}
+
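For illustration, the chunk and bitmap arithmetic used by lfc_cache_contains(), lfc_read() and lfc_write() can be reproduced in a small standalone C program. This sketch is not part of the patch: BLCKSZ is assumed to be the default 8192, ENTRY_SIZE is a stand-in for the roughly 40-byte hash entry mentioned in the comment above, and the printed figures simply re-derive the 40GB-versus-320MB estimate.

#include <stdint.h>
#include <stdio.h>

#define BLCKSZ           8192   /* assumed PostgreSQL page size */
#define BLOCKS_PER_CHUNK  128   /* 1MB chunk, as in the patch */
#define ENTRY_SIZE         40   /* rough per-entry hash-map cost from the comment */

int main(void)
{
    uint32_t blkno = 1000003;

    /* A chunk is identified by the block number rounded down to a chunk boundary... */
    uint32_t chunk_first_block = blkno & ~(BLOCKS_PER_CHUNK - 1);
    /* ...and the page's slot inside the chunk is the remainder. */
    uint32_t chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);

    /* Residency is tracked with one bit per page: a word index plus a bit mask. */
    uint32_t word = chunk_offs >> 5;
    uint32_t mask = 1u << (chunk_offs & 31);

    printf("block %u -> chunk at block %u, slot %u (bitmap word %u, mask 0x%08x)\n",
           (unsigned) blkno, (unsigned) chunk_first_block, (unsigned) chunk_offs,
           (unsigned) word, (unsigned) mask);

    /* Hash-map footprint: an 8TB database holds about 2^30 pages of BLCKSZ bytes. */
    uint64_t pages = ((uint64_t) 8 << 40) / BLCKSZ;
    uint64_t per_page_map  = pages * ENTRY_SIZE;                     /* ~40GB, one entry per page   */
    uint64_t per_chunk_map = pages / BLOCKS_PER_CHUNK * ENTRY_SIZE;  /* ~320MB, one entry per chunk */
    printf("hash map: %llu MB per page vs %llu MB per chunk\n",
           (unsigned long long) (per_page_map >> 20),
           (unsigned long long) (per_chunk_map >> 20));
    return 0;
}

With BLOCKS_PER_CHUNK set to 128, the residency bitmap is exactly the four uint32 words declared in FileCacheEntry, and the access_count field is what keeps an entry off the LRU list while a read or write against it is in flight.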
+/*
+ * Try to read page from local cache.
+ * Returns true if page is found in local cache.
+ * In case of error lfc_size_limit is set to zero to disable any further operations with the cache.
+ */
+bool
+lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+         char *buffer)
+{
+    BufferTag tag;
+    FileCacheEntry* entry;
+    ssize_t rc;
+    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+    bool result = true;
+    uint32 hash;
+
+    if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+        return false;
+
+    tag.rnode = rnode;
+    tag.forkNum = forkNum;
+    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+    hash = get_hash_value(lfc_hash, &tag);
+
+    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+    entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+    if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
+    {
+        /* Page is not cached */
+        LWLockRelease(lfc_lock);
+        return false;
+    }
+    /* Unlink entry from the LRU list to pin it for the duration of the IO operation */
+    if (entry->access_count++ == 0)
+        dlist_delete(&entry->lru_node);
+    LWLockRelease(lfc_lock);
+
+    /* Open cache file if not done yet */
+    if (lfc_desc == 0)
+    {
+        lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+        if (lfc_desc < 0) {
+            elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+            lfc_size_limit = 0; /* disable file cache */
+            result = false;
+        }
+    }
+
+    if (lfc_desc > 0)
+    {
+        rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+        if (rc != BLCKSZ)
+        {
+            elog(INFO, "Failed to read file cache: %m");
+            lfc_size_limit = 0; /* disable file cache */
+            result = false;
+        }
+    }
+
+    /* Place entry to the head of LRU list */
+    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+    Assert(entry->access_count > 0);
+    if (--entry->access_count == 0)
+        dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+    LWLockRelease(lfc_lock);
+
+    return result;
+}
+
+/*
+ * Put page in local file cache.
+ * If the cache is full, evict some other page.
+ */
+void
+lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+          char *buffer)
+{
+    BufferTag tag;
+    FileCacheEntry* entry;
+    ssize_t rc;
+    bool found;
+    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
+    uint32 hash;
+
+    if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+        return;
+
+    tag.rnode = rnode;
+    tag.forkNum = forkNum;
+    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+    hash = get_hash_value(lfc_hash, &tag);
+
+    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+    entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
+
+    if (found)
+    {
+        /* Unlink entry from the LRU list to pin it for the duration of the IO operation */
+        if (entry->access_count++ == 0)
+            dlist_delete(&entry->lru_node);
+    }
+    else
+    {
+        /*
+         * We have two choices if all cache pages are pinned (i.e. used in IO operations):
+         * 1. Wait until one of these operations completes and its pages are unpinned.
+         * 2. Allocate one more chunk, so that the configured cache size is a recommendation rather than a hard limit.
+         * Since the probability of this event (all pages pinned) is considered very small (it would require
+         * a very large number of concurrent IO operations, which are limited by max_connections),
+         * we prefer not to complicate the code and use the second approach.
+ */ + if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru)); + Assert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); + elog(LOG, "Swap file cache page"); + } + else + entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */ + entry->access_count = 1; + memset(entry->bitmap, 0, sizeof entry->bitmap); + } + LWLockRelease(lfc_lock); + + /* Open cache file if not done yet */ + if (lfc_desc == 0) + { + lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); + if (lfc_desc < 0) { + elog(LOG, "Failed to open file cache %s: %m", lfc_path); + lfc_size_limit = 0; /* disable file cache */ + } + } + if (lfc_desc > 0) + { + rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); + if (rc != BLCKSZ) + { + elog(INFO, "Failed to write file cache: %m"); + lfc_size_limit = 0; /* disable file cache */ + } + } + /* Place entry to the head of LRU list */ + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + Assert(entry->access_count > 0); + if (--entry->access_count == 0) + dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); + if (lfc_size_limit != 0) + entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31)); + LWLockRelease(lfc_lock); +} + + +/* + * Record structure holding the to be exposed cache data. + */ +typedef struct +{ + uint32 pageoffs; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + ForkNumber forknum; + BlockNumber blocknum; + uint16 accesscount; +} LocalCachePagesRec; + +/* + * Function context for data persisting over repeated calls. + */ +typedef struct +{ + TupleDesc tupdesc; + LocalCachePagesRec *record; +} LocalCachePagesContext; + +/* + * Function returning data from the local file cache + * relation node/tablespace/database/blocknum and access_counter + */ +PG_FUNCTION_INFO_V1(local_cache_pages); + +#define NUM_LOCALCACHE_PAGES_ELEM 7 + +Datum +local_cache_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + LocalCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + HASH_SEQ_STATUS status; + FileCacheEntry* entry; + uint32 n_pages = 0; + uint32 i; + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Create a user function context for cross-call persistence */ + fctx = (LocalCachePagesContext *) palloc(sizeof(LocalCachePagesContext)); + + /* + * To smoothly support upgrades from version 1.0 of this extension + * transparently handle the (non-)existence of the pinning_backends + * column. We unfortunately have to get the result type for that... - + * we can't use the result type determined by the function definition + * without potentially crashing when somebody uses the old (or even + * wrong) function definition though. + */ + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. 
*/ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", + INT2OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "accesscount", + INT4OID, -1, 0); + + fctx->tupdesc = BlessTupleDesc(tupledesc); + + LWLockAcquire(lfc_lock, LW_SHARED); + + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0; + } + fctx->record = (LocalCachePagesRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(LocalCachePagesRec) * n_pages); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = n_pages; + funcctx->user_fctx = fctx; + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. + * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. + */ + n_pages = 0; + hash_seq_init(&status, lfc_hash); + while ((entry = hash_seq_search(&status)) != NULL) + { + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + if (entry->bitmap[i >> 5] & (1 << (i & 31))) + { + fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i; + fctx->record[n_pages].relfilenode = entry->key.rnode.relNode; + fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode; + fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode; + fctx->record[n_pages].forknum = entry->key.forkNum; + fctx->record[n_pages].blocknum = entry->key.blockNum + i; + fctx->record[n_pages].accesscount = entry->access_count; + n_pages += 1; + } + } + } + Assert(n_pages == funcctx->max_calls); + LWLockRelease(lfc_lock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + Datum values[NUM_LOCALCACHE_PAGES_ELEM]; + bool nulls[NUM_LOCALCACHE_PAGES_ELEM] = { + false, false, false, false, false, false, false + }; + + values[0] = Int64GetDatum((int64) fctx->record[i].pageoffs); + values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); + values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); + values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); + values[4] = ObjectIdGetDatum(fctx->record[i].forknum); + values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); + values[6] = Int32GetDatum(fctx->record[i].accesscount); + + /* Build and return the tuple. 
*/ + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ae8275168d..c6199dddc0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -420,7 +420,7 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomStringVariable("neon.safekeeper_token_env", - "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $ZENITH_AUTH_TOKEN", + "the environment variable containing JWT token for authentication with Safekeepers, the convention is to either unset or set to $NEON_AUTH_TOKEN", NULL, &safekeeper_token_env, NULL, @@ -464,12 +464,12 @@ pg_init_libpagestore(void) NULL, NULL, NULL); DefineCustomIntVariable("neon.readahead_buffer_size", "number of prefetches to buffer", - "This buffer is used to store prefetched data; so " - "it is important that this buffer is at least as " - "large as the configured value of all tablespaces' " - "effective_io_concurrency and maintenance_io_concurrency, " - "your sessions' values of these, and the value for " - "seqscan_prefetch_buffers.", + "This buffer is used to hold and manage prefetched " + "data; so it is important that this buffer is at " + "least as large as the configured value of all " + "tablespaces' effective_io_concurrency and " + "maintenance_io_concurrency, and your sessions' " + "values for these settings.", &readahead_buffer_size, 128, 16, 1024, PGC_USERSET, @@ -516,4 +516,5 @@ pg_init_libpagestore(void) smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; } + lfc_init(); } diff --git a/pgxn/neon/neon--1.0.sql b/pgxn/neon/neon--1.0.sql index 58b98a5923..6cf111ea6a 100644 --- a/pgxn/neon/neon--1.0.sql +++ b/pgxn/neon/neon--1.0.sql @@ -22,3 +22,13 @@ AS 'MODULE_PATHNAME', 'backpressure_throttling_time' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION local_cache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'local_cache_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. 
+CREATE VIEW local_cache AS + SELECT P.* FROM local_cache_pages() AS P + (pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, accesscount int4); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 170a0cb72d..831756b849 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -203,4 +203,11 @@ extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumbe extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); +/* functions for local file cache */ +extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer); +extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno); +extern void lfc_init(void); + + #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index a8dde3927a..0b34cb3ca9 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -242,6 +242,14 @@ PrefetchState *MyPState; ) \ ) +#define ReceiveBufferNeedsCompaction() (\ + (MyPState->n_responses_buffered / 8) < ( \ + MyPState->ring_receive - \ + MyPState->ring_last - \ + MyPState->n_responses_buffered \ + ) \ +) + int n_prefetch_hits = 0; int n_prefetch_misses = 0; int n_prefetch_missed_caches = 0; @@ -249,17 +257,99 @@ int n_prefetch_dupes = 0; XLogRecPtr prefetch_lsn = 0; +static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); static bool prefetch_read(PrefetchRequest *slot); static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); static bool prefetch_wait_for(uint64 ring_index); -static void prefetch_cleanup(void); +static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno); +static bool +compact_prefetch_buffers(void) +{ + uint64 empty_ring_index = MyPState->ring_last; + uint64 search_ring_index = MyPState->ring_receive; + int n_moved = 0; + + if (MyPState->ring_receive == MyPState->ring_last) + return false; + + while (search_ring_index > MyPState->ring_last) + { + search_ring_index--; + if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) + { + empty_ring_index = search_ring_index; + break; + } + } + + /* + * Here we have established: + * slots < search_ring_index may be unused (not scanned) + * slots >= search_ring_index and <= empty_ring_index are unused + * slots > empty_ring_index are in use, or outside our buffer's range. + * + * Therefore, there is a gap of at least one unused items between + * search_ring_index and empty_ring_index, which grows as we hit + * more unused items while moving backwards through the array. 
+ */ + + while (search_ring_index > MyPState->ring_last) + { + PrefetchRequest *source_slot; + PrefetchRequest *target_slot; + bool found; + + search_ring_index--; + + source_slot = GetPrfSlot(search_ring_index); + + if (source_slot->status == PRFS_UNUSED) + continue; + + target_slot = GetPrfSlot(empty_ring_index); + + Assert(source_slot->status == PRFS_RECEIVED); + Assert(target_slot->status == PRFS_UNUSED); + + target_slot->buftag = source_slot->buftag; + target_slot->status = source_slot->status; + target_slot->response = source_slot->response; + target_slot->effective_request_lsn = source_slot->effective_request_lsn; + target_slot->my_ring_index = empty_ring_index; + + prfh_delete(MyPState->prf_hash, source_slot); + prfh_insert(MyPState->prf_hash, target_slot, &found); + + Assert(!found); + + /* Adjust the location of our known-empty slot */ + empty_ring_index--; + + source_slot->status = PRFS_UNUSED; + source_slot->buftag = (BufferTag) {0}; + source_slot->response = NULL; + source_slot->my_ring_index = 0; + source_slot->effective_request_lsn = 0; + + n_moved++; + } + + if (MyPState->ring_last != empty_ring_index) + { + prefetch_cleanup_trailing_unused(); + return true; + } + + return false; +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -323,7 +413,7 @@ readahead_buffer_resize(int newsize, void *extra) prfh_insert(newPState->prf_hash, newslot, &found); Assert(!found); - + switch (newslot->status) { case PRFS_UNUSED: @@ -370,7 +460,7 @@ consume_prefetch_responses(void) } static void -prefetch_cleanup(void) +prefetch_cleanup_trailing_unused(void) { uint64 ring_index; PrefetchRequest *slot; @@ -531,7 +621,10 @@ prefetch_set_unused(uint64 ring_index) /* run cleanup if we're holding back ring_last */ if (MyPState->ring_last == ring_index) - prefetch_cleanup(); + prefetch_cleanup_trailing_unused(); + /* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */ + else if (ReceiveBufferNeedsCompaction()) + compact_prefetch_buffers(); } static void @@ -702,20 +795,31 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls Assert(slot->status != PRFS_UNUSED); - /* We have the slot for ring_last, so that must still be in progress */ - switch (slot->status) + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. 
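As a side note on the compaction trigger mentioned just above: ReceiveBufferNeedsCompaction() fires once the unused gaps between ring_last and ring_receive outnumber one eighth of the responses still buffered. The following standalone sketch is not part of the patch; the struct and values are invented for illustration, and only the arithmetic mirrors the macro.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for the three fields the macro reads. */
struct prefetch_state
{
    uint64_t ring_last;            /* oldest slot still tracked         */
    uint64_t ring_receive;         /* first slot not yet received       */
    uint64_t n_responses_buffered; /* received responses currently held */
};

/* Same arithmetic as ReceiveBufferNeedsCompaction(): slots between ring_last and
 * ring_receive that hold no response are "gaps"; compact once the gaps outnumber
 * one eighth of the buffered responses. */
static bool needs_compaction(const struct prefetch_state *s)
{
    uint64_t gaps = s->ring_receive - s->ring_last - s->n_responses_buffered;
    return (s->n_responses_buffered / 8) < gaps;
}

int main(void)
{
    /* 50 tracked slots but only 40 buffered responses -> 10 gaps; 40 / 8 = 5 < 10, so compact. */
    struct prefetch_state s = { .ring_last = 100, .ring_receive = 150, .n_responses_buffered = 40 };
    printf("needs compaction: %s\n", needs_compaction(&s) ? "yes" : "no");
    return 0;
}

In the worked example the ten gaps exceed the threshold of five, so prefetch_register_buffer() would attempt compaction before falling back to waiting on the oldest slot.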
+ */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); - prefetch_set_unused(cleanup_index); - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - break; - default: - pg_unreachable(); + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* We have the slot for ring_last, so that must still be in progress */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + prefetch_wait_for(cleanup_index); + prefetch_set_unused(cleanup_index); + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + break; + default: + pg_unreachable(); + } } } @@ -1102,7 +1206,7 @@ PageIsEmptyHeapPage(char *buffer) } static void -neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) { XLogRecPtr lsn = PageGetLSN(buffer); @@ -1116,7 +1220,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) { /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; @@ -1125,30 +1229,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch XLogFlush(recptr); lsn = recptr; ereport(SmgrTrace, - (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) - { - /* - * Always WAL-log vm. We should never miss clearing visibility map - * bits. - * - * TODO Is it too bad for performance? Hopefully we do not evict - * actively used vm too often. - */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - - ereport(SmgrTrace, - (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + (errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -1543,6 +1624,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; + BlockNumber n_blocks = 0; switch (reln->smgr_relpersistence) { @@ -1582,7 +1664,16 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, errhint("This limit is defined by neon.max_cluster_size GUC"))); } - neon_wallog_page(reln, forkNum, blkno, buffer); + /* + * Usually Postgres doesn't extend relation on more than one page + * (leaving holes). 
But this rule is violated in PG-15 where CreateAndCopyRelationData + * call smgrextend for destination relation n using size of source relation + */ + n_blocks = neon_nblocks(reln, forkNum); + while (n_blocks < blkno) + neon_wallog_page(reln, forkNum, n_blocks++, buffer, true); + + neon_wallog_page(reln, forkNum, blkno, buffer, false); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -1593,6 +1684,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); @@ -1666,6 +1759,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum)) + return false; + tag = (BufferTag) { .rnode = reln->smgr_rnode.node, .forkNum = forknum, @@ -1808,6 +1904,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { case T_NeonGetPageResponse: memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ); + lfc_write(rnode, forkNum, blkno, buffer); break; case T_NeonErrorResponse: @@ -1829,7 +1926,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, /* buffer was used, clean up for later reuse */ prefetch_set_unused(ring_index); - prefetch_cleanup(); + prefetch_cleanup_trailing_unused(); } /* @@ -1859,6 +1956,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + /* Try to read from local file cache */ + if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer)) + { + return; + } + request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno); neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); @@ -2010,7 +2113,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - neon_wallog_page(reln, forknum, blocknum, buffer); + neon_wallog_page(reln, forknum, blocknum, buffer, false); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -2020,6 +2123,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); + lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwrite(reln, forknum, blocknum, buffer, skipFsync); diff --git a/poetry.lock b/poetry.lock index 716423d51e..1b04230cef 100644 --- a/poetry.lock +++ b/poetry.lock @@ -525,7 +525,7 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2022.9.24" +version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." 
category = "main" optional = false @@ -941,11 +941,11 @@ xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] name = "mypy" -version = "0.971" +version = "0.991" description = "Optional static typing for Python" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] mypy-extensions = ">=0.4.3" @@ -954,6 +954,7 @@ typing-extensions = ">=3.10" [package.extras] dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] python2 = ["typed-ast (>=1.4.0,<2)"] reports = ["lxml"] @@ -1227,6 +1228,17 @@ pytest = ">=6.1.0" [package.extras] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] +[[package]] +name = "pytest-httpserver" +version = "1.0.6" +description = "pytest-httpserver is a httpserver for pytest" +category = "main" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +Werkzeug = ">=2.0.0" + [[package]] name = "pytest-lazy-fixture" version = "0.6.3" @@ -1248,8 +1260,8 @@ python-versions = ">=3.6" [package.dependencies] pytest = [ - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] [[package]] @@ -1583,7 +1595,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "98d63eaa73253882440e0fc8cdb305bb536944768c5ba313c25d0ee65f546544" +content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" [metadata.files] aiopg = [ @@ -1702,8 +1714,8 @@ botocore-stubs = [ {file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"}, ] certifi = [ - {file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"}, - {file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"}, + {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, + {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, ] cffi = [ {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, @@ -1949,29 +1961,36 @@ moto = [ {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] mypy = [ - {file = "mypy-0.971-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f2899a3cbd394da157194f913a931edfd4be5f274a88041c9dc2d9cdcb1c315c"}, - {file = "mypy-0.971-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:98e02d56ebe93981c41211c05adb630d1d26c14195d04d95e49cd97dbc046dc5"}, - {file = "mypy-0.971-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:19830b7dba7d5356d3e26e2427a2ec91c994cd92d983142cbd025ebe81d69cf3"}, - {file = "mypy-0.971-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02ef476f6dcb86e6f502ae39a16b93285fef97e7f1ff22932b657d1ef1f28655"}, - {file = "mypy-0.971-cp310-cp310-win_amd64.whl", hash = "sha256:25c5750ba5609a0c7550b73a33deb314ecfb559c350bb050b655505e8aed4103"}, - {file = "mypy-0.971-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:d3348e7eb2eea2472db611486846742d5d52d1290576de99d59edeb7cd4a42ca"}, - {file = "mypy-0.971-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3fa7a477b9900be9b7dd4bab30a12759e5abe9586574ceb944bc29cddf8f0417"}, - {file = "mypy-0.971-cp36-cp36m-win_amd64.whl", hash = "sha256:2ad53cf9c3adc43cf3bea0a7d01a2f2e86db9fe7596dfecb4496a5dda63cbb09"}, - {file = "mypy-0.971-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:855048b6feb6dfe09d3353466004490b1872887150c5bb5caad7838b57328cc8"}, - {file = "mypy-0.971-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:23488a14a83bca6e54402c2e6435467a4138785df93ec85aeff64c6170077fb0"}, - {file = "mypy-0.971-cp37-cp37m-win_amd64.whl", hash = "sha256:4b21e5b1a70dfb972490035128f305c39bc4bc253f34e96a4adf9127cf943eb2"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9796a2ba7b4b538649caa5cecd398d873f4022ed2333ffde58eaf604c4d2cb27"}, - {file = "mypy-0.971-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a361d92635ad4ada1b1b2d3630fc2f53f2127d51cf2def9db83cba32e47c856"}, - {file = "mypy-0.971-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b793b899f7cf563b1e7044a5c97361196b938e92f0a4343a5d27966a53d2ec71"}, - {file = "mypy-0.971-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d1ea5d12c8e2d266b5fb8c7a5d2e9c0219fedfeb493b7ed60cd350322384ac27"}, - {file = "mypy-0.971-cp38-cp38-win_amd64.whl", hash = "sha256:23c7ff43fff4b0df93a186581885c8512bc50fc4d4910e0f838e35d6bb6b5e58"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1f7656b69974a6933e987ee8ffb951d836272d6c0f81d727f1d0e2696074d9e6"}, - {file = "mypy-0.971-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d2022bfadb7a5c2ef410d6a7c9763188afdb7f3533f22a0a32be10d571ee4bbe"}, - {file = "mypy-0.971-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef943c72a786b0f8d90fd76e9b39ce81fb7171172daf84bf43eaf937e9f220a9"}, - {file = "mypy-0.971-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d744f72eb39f69312bc6c2abf8ff6656973120e2eb3f3ec4f758ed47e414a4bf"}, - {file = "mypy-0.971-cp39-cp39-win_amd64.whl", hash = "sha256:77a514ea15d3007d33a9e2157b0ba9c267496acf12a7f2b9b9f8446337aac5b0"}, - {file = "mypy-0.971-py3-none-any.whl", hash = "sha256:0d054ef16b071149917085f51f89555a576e2618d5d9dd70bd6eea6410af3ac9"}, - {file = "mypy-0.971.tar.gz", hash = "sha256:40b0f21484238269ae6a57200c807d80debc6459d444c0489a102d7c6a75fa56"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, + {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, + {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"}, + {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"}, + {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"}, + {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"}, + {file = 
"mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"}, + {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"}, + {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"}, + {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"}, + {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"}, + {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"}, + {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"}, + {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"}, + {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"}, + {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"}, + {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"}, + {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"}, + {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"}, + {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"}, + {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"}, + {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"}, + {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"}, + {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"}, + {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"}, + {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"}, + {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"}, + {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"}, ] mypy-boto3-s3 = [ {file = "mypy-boto3-s3-1.26.0.post1.tar.gz", hash = "sha256:6d7079f8c739dc993cbedad0736299c413b297814b73795a3855a79169ecc938"}, @@ 
-2036,6 +2055,7 @@ psutil = [ psycopg2-binary = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, + {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -2069,6 +2089,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, + {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -2080,6 +2101,7 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, + {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -2096,18 +2118,7 @@ py = [ {file = "py-1.11.0.tar.gz", hash = 
"sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyasn1 = [ - {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, - {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, - {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, - {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, - {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, - {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, - {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, - {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, - {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, - {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] pycodestyle = [ @@ -2165,6 +2176,10 @@ pytest-asyncio = [ {file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"}, {file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"}, ] +pytest-httpserver = [ + {file = "pytest_httpserver-1.0.6-py3-none-any.whl", hash = "sha256:ac2379acc91fe8bdbe2911c93af8dd130e33b5899fb9934d15669480739c6d32"}, + {file = "pytest_httpserver-1.0.6.tar.gz", hash = "sha256:9040d07bf59ac45d8de3db1d4468fd2d1d607975e4da4c872ecc0402cdbf7b3e"}, +] pytest-lazy-fixture = [ {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"}, {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"}, @@ -2213,6 +2228,13 @@ pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = 
"PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 14a5450d5e..e630b2758d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -33,7 +33,7 @@ sha2 = "0.10.2" socket2 = "0.4.4" thiserror = "1.0.30" tokio = { version = "1.17", features = ["macros"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } tokio-rustls = "0.23.0" tracing = "0.1.36" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index f272f9adc1..5355946beb 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -49,6 +49,9 @@ pub enum AuthErrorImpl { )] MissingProjectName, + #[error("password authentication failed for user '{0}'")] + AuthFailed(Box), + /// Errors produced by e.g. [`crate::stream::PqStream`]. #[error(transparent)] Io(#[from] io::Error), @@ -62,6 +65,10 @@ impl AuthError { pub fn bad_auth_method(name: impl Into>) -> Self { AuthErrorImpl::BadAuthMethod(name.into()).into() } + + pub fn auth_failed(user: impl Into>) -> Self { + AuthErrorImpl::AuthFailed(user.into()).into() + } } impl> From for AuthError { @@ -78,10 +85,11 @@ impl UserFacingError for AuthError { GetAuthInfo(e) => e.to_string_client(), WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), + AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), MissingProjectName => self.to_string(), - _ => "Internal error".to_string(), + Io(_) => "Internal error".to_string(), } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 4b937f017a..4adf0ed940 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -8,7 +8,9 @@ pub use console::{GetAuthInfoError, WakeComputeError}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, - compute, http, mgmt, stream, url, + compute, + console::messages::MetricsAuxInfo, + http, mgmt, stream, url, waiters::{self, Waiter, Waiters}, }; use once_cell::sync::Lazy; @@ -126,25 +128,13 @@ pub struct AuthSuccess { pub value: T, } -impl AuthSuccess { - /// Very similar to [`std::option::Option::map`]. - /// Maps [`AuthSuccess`] to [`AuthSuccess`] by applying - /// a function to a contained value. 
- pub fn map(self, f: impl FnOnce(T) -> R) -> AuthSuccess { - AuthSuccess { - reported_auth_ok: self.reported_auth_ok, - value: f(self.value), - } - } -} - /// Info for establishing a connection to a compute node. /// This is what we get after auth succeeded, but not before! pub struct NodeInfo { - /// Project from [`auth::ClientCredentials`]. - pub project: String, /// Compute node connection params. pub config: compute::ConnCfg, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, } impl BackendType<'_, ClientCredentials<'_>> { @@ -172,37 +162,34 @@ impl BackendType<'_, ClientCredentials<'_>> { }; // TODO: find a proper way to merge those very similar blocks. - let (mut config, payload) = match self { + let (mut node, payload) = match self { Console(endpoint, creds) if creds.project.is_none() => { let payload = fetch_magic_payload.await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = console::Api::new(endpoint, extra, &creds) + let node = console::Api::new(endpoint, extra, &creds) .wake_compute() .await?; - (config, payload) + (node, payload) } Postgres(endpoint, creds) if creds.project.is_none() => { let payload = fetch_magic_payload.await?; let mut creds = creds.as_ref(); creds.project = Some(payload.project.as_str().into()); - let config = postgres::Api::new(endpoint, &creds).wake_compute().await?; + let node = postgres::Api::new(endpoint, &creds).wake_compute().await?; - (config, payload) + (node, payload) } _ => return Ok(None), }; - config.password(payload.password); + node.config.password(payload.password); Ok(Some(AuthSuccess { reported_auth_ok: false, - value: NodeInfo { - project: payload.project, - config, - }, + value: node, })) } @@ -233,10 +220,6 @@ impl BackendType<'_, ClientCredentials<'_>> { console::Api::new(&endpoint, extra, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } Postgres(endpoint, creds) => { info!("performing mock authentication using a local postgres instance"); @@ -245,10 +228,6 @@ impl BackendType<'_, ClientCredentials<'_>> { postgres::Api::new(&endpoint, &creds) .handle_user(client) .await? - .map(|config| NodeInfo { - project: creds.project.unwrap().into_owned(), - config, - }) } // NOTE: this auth backend doesn't use client credentials. Link(url) => { diff --git a/proxy/src/auth/backend/console.rs b/proxy/src/auth/backend/console.rs index 929dfb33f7..b3e3fd0c10 100644 --- a/proxy/src/auth/backend/console.rs +++ b/proxy/src/auth/backend/console.rs @@ -1,30 +1,78 @@ //! Cloud API V2. -use super::{AuthSuccess, ConsoleReqExtra}; +use super::{AuthSuccess, ConsoleReqExtra, NodeInfo}; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, + console::messages::{ConsoleError, GetRoleSecret, WakeCompute}, error::{io_error, UserFacingError}, - http, scram, + http, sasl, scram, stream::PqStream, }; use futures::TryFutureExt; -use serde::{Deserialize, Serialize}; +use reqwest::StatusCode as HttpStatusCode; use std::future::Future; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span}; +use tracing::{error, info, info_span, warn, Instrument}; +/// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; +/// Common console API error. #[derive(Debug, Error)] -#[error("{}", REQUEST_FAILED)] -pub struct TransportError(#[from] std::io::Error); +pub enum ApiError { + /// Error returned by the console itself. 
+ #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] + Console { + status: HttpStatusCode, + text: Box, + }, -impl UserFacingError for TransportError {} + /// Various IO errors like broken pipe or malformed payload. + #[error("{REQUEST_FAILED}: {0}")] + Transport(#[from] std::io::Error), +} + +impl ApiError { + /// Returns HTTP status code if it's the reason for failure. + fn http_status_code(&self) -> Option { + use ApiError::*; + match self { + Console { status, .. } => Some(*status), + _ => None, + } + } +} + +impl UserFacingError for ApiError { + fn to_string_client(&self) -> String { + use ApiError::*; + match self { + // To minimize risks, only select errors are forwarded to users. + // Ask @neondatabase/control-plane for review before adding more. + Console { status, .. } => match *status { + HttpStatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. + format!("{REQUEST_FAILED}: endpoint cannot be found") + } + HttpStatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + HttpStatusCode::LOCKED => { + // Status 423: project might be in maintenance mode (or bad state). + format!("{REQUEST_FAILED}: endpoint is temporary unavailable") + } + _ => REQUEST_FAILED.to_owned(), + }, + _ => REQUEST_FAILED.to_owned(), + } + } +} // Helps eliminate graceless `.map_err` calls without introducing another ctor. -impl From for TransportError { +impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } @@ -37,63 +85,57 @@ pub enum GetAuthInfoError { BadSecret, #[error(transparent)] - Transport(TransportError), + ApiError(ApiError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for GetAuthInfoError { + fn from(e: E) -> Self { + Self::ApiError(e.into()) + } } impl UserFacingError for GetAuthInfoError { fn to_string_client(&self) -> String { use GetAuthInfoError::*; match self { + // We absolutely should not leak any secrets! BadSecret => REQUEST_FAILED.to_owned(), - Transport(e) => e.to_string_client(), + // However, API might return a meaningful error. + ApiError(e) => e.to_string_client(), } } } -impl> From for GetAuthInfoError { - fn from(e: E) -> Self { - Self::Transport(e.into()) - } -} - #[derive(Debug, Error)] pub enum WakeComputeError { - // We shouldn't show users the address even if it's broken. #[error("Console responded with a malformed compute address: {0}")] - BadComputeAddress(String), + BadComputeAddress(Box), #[error(transparent)] - Transport(TransportError), + ApiError(ApiError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for WakeComputeError { + fn from(e: E) -> Self { + Self::ApiError(e.into()) + } } impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; match self { + // We shouldn't show user the address even if it's broken. + // Besides, user is unlikely to care about this detail. BadComputeAddress(_) => REQUEST_FAILED.to_owned(), - Transport(e) => e.to_string_client(), + // However, API might return a meaningful error. 
+ ApiError(e) => e.to_string_client(), } } } -impl> From for WakeComputeError { - fn from(e: E) -> Self { - Self::Transport(e.into()) - } -} - -// TODO: convert into an enum with "error" -#[derive(Serialize, Deserialize, Debug)] -struct GetRoleSecretResponse { - role_secret: String, -} - -// TODO: convert into an enum with "error" -#[derive(Serialize, Deserialize, Debug)] -struct GetWakeComputeResponse { - address: String, -} - /// Auth secret which is managed by the cloud. pub enum AuthInfo { /// Md5 hash of user's password. @@ -110,6 +152,12 @@ pub(super) struct Api<'a> { creds: &'a ClientCredentials<'a>, } +impl<'a> AsRef> for Api<'a> { + fn as_ref(&self) -> &ClientCredentials<'a> { + self.creds + } +} + impl<'a> Api<'a> { /// Construct an API object containing the auth parameters. pub(super) fn new( @@ -126,83 +174,91 @@ impl<'a> Api<'a> { /// Authenticate the existing user or throw an error. pub(super) async fn handle_user( - self, + &'a self, client: &mut PqStream, - ) -> auth::Result> { - handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + ) -> auth::Result> { + handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } +} - async fn get_auth_info(&self) -> Result { +impl Api<'_> { + async fn get_auth_info(&self) -> Result, GetAuthInfoError> { let request_id = uuid::Uuid::new_v4().to_string(); - let req = self - .endpoint - .get("proxy_get_role_secret") - .header("X-Request-ID", &request_id) - .query(&[("session_id", self.extra.session_id)]) - .query(&[ - ("application_name", self.extra.application_name), - ("project", Some(self.creds.project().expect("impossible"))), - ("role", Some(self.creds.user)), - ]) - .build()?; + async { + let request = self + .endpoint + .get("proxy_get_role_secret") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ("role", Some(self.creds.user)), + ]) + .build()?; - let span = info_span!("http", id = request_id, url = req.url().as_str()); - info!(parent: &span, "request auth info"); - let msg = self - .endpoint - .checked_execute(req) - .and_then(|r| r.json::()) - .await - .map_err(|e| { - error!(parent: &span, "{e}"); - e - })?; + info!(url = request.url().as_str(), "sending http request"); + let response = self.endpoint.execute(request).await?; + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + Err(e) => match e.http_status_code() { + Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + _otherwise => return Err(e.into()), + }, + }; - scram::ServerSecret::parse(&msg.role_secret) - .map(AuthInfo::Scram) - .ok_or(GetAuthInfoError::BadSecret) + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthInfo::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + + Ok(Some(secret)) + } + .map_err(crate::error::log_error) + .instrument(info_span!("get_auth_info", id = request_id)) + .await } /// Wake up the compute node and return the corresponding connection info. 
- pub(super) async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); - let req = self - .endpoint - .get("proxy_wake_compute") - .header("X-Request-ID", &request_id) - .query(&[("session_id", self.extra.session_id)]) - .query(&[ - ("application_name", self.extra.application_name), - ("project", Some(self.creds.project().expect("impossible"))), - ]) - .build()?; + async { + let request = self + .endpoint + .get("proxy_wake_compute") + .header("X-Request-ID", &request_id) + .query(&[("session_id", self.extra.session_id)]) + .query(&[ + ("application_name", self.extra.application_name), + ("project", Some(self.creds.project().expect("impossible"))), + ]) + .build()?; - let span = info_span!("http", id = request_id, url = req.url().as_str()); - info!(parent: &span, "request wake-up"); - let msg = self - .endpoint - .checked_execute(req) - .and_then(|r| r.json::()) - .await - .map_err(|e| { - error!(parent: &span, "{e}"); - e - })?; + info!(url = request.url().as_str(), "sending http request"); + let response = self.endpoint.execute(request).await?; + let body = parse_body::(response).await?; - // Unfortunately, ownership won't let us use `Option::ok_or` here. - let (host, port) = match parse_host_port(&msg.address) { - None => return Err(WakeComputeError::BadComputeAddress(msg.address)), - Some(x) => x, - }; + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; - let mut config = compute::ConnCfg::new(); - config - .host(host) - .port(port) - .dbname(self.creds.dbname) - .user(self.creds.user); + let mut config = compute::ConnCfg::new(); + config + .host(host) + .port(port) + .dbname(self.creds.dbname) + .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: body.aux, + }) + } + .map_err(crate::error::log_error) + .instrument(info_span!("wake_compute", id = request_id)) + .await } } @@ -213,42 +269,84 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>( endpoint: &'a Endpoint, get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo, wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute, -) -> auth::Result> +) -> auth::Result> where - GetAuthInfo: Future>, - WakeCompute: Future>, + Endpoint: AsRef>, + GetAuthInfo: Future, GetAuthInfoError>>, + WakeCompute: Future>, { + let creds = endpoint.as_ref(); + info!("fetching user's authentication info"); - let auth_info = get_auth_info(endpoint).await?; + let info = get_auth_info(endpoint).await?.unwrap_or_else(|| { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random())) + }); let flow = AuthFlow::new(client); - let scram_keys = match auth_info { + let scram_keys = match info { AuthInfo::Md5(_) => { - // TODO: decide if we should support MD5 in api v2 info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthInfo::Scram(secret) => { info!("auth endpoint chooses SCRAM"); let scram = auth::Scram(&secret); + let client_key = match flow.begin(scram).await?.authenticate().await? 
{ + sasl::Outcome::Success(key) => key, + sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + return Err(auth::AuthError::auth_failed(creds.user)); + } + }; + Some(compute::ScramKeys { - client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(), + client_key: client_key.as_bytes(), server_key: secret.server_key.as_bytes(), }) } }; - let mut config = wake_compute(endpoint).await?; + let mut node = wake_compute(endpoint).await?; if let Some(keys) = scram_keys { - config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys)); + use tokio_postgres::config::AuthKeys; + node.config.auth_keys(AuthKeys::ScramSha256(keys)); } Ok(AuthSuccess { reported_auth_ok: false, - value: config, + value: node, }) } +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: reqwest::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. + let body = response.json().await.unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ConsoleError { + error: "reason unclear (malformed error message)".into(), + } + }); + + let text = body.error; + error!("console responded with an error ({status}): {text}"); + Err(ApiError::Console { status, text }) +} + fn parse_host_port(input: &str) -> Option<(&str, u16)> { let (host, port) = input.split_once(':')?; Some((host, port.parse().ok()?)) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 440a55f194..e16bbc70e4 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,6 +1,6 @@ use super::{AuthSuccess, NodeInfo}; use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters}; -use pq_proto::{BeMessage as Be, BeParameterStatusMessage}; +use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span}; @@ -60,7 +60,7 @@ pub async fn handle_user( info!(parent: &span, "sending the auth URL to the user"); client .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? + .write_message_noflush(&Be::CLIENT_ENCODING)? .write_message(&Be::NoticeResponse(&greeting)) .await?; @@ -86,8 +86,8 @@ pub async fn handle_user( Ok(AuthSuccess { reported_auth_ok: true, value: NodeInfo { - project: db_info.project, config, + aux: db_info.aux, }, }) } diff --git a/proxy/src/auth/backend/postgres.rs b/proxy/src/auth/backend/postgres.rs index e56b62622a..260342f103 100644 --- a/proxy/src/auth/backend/postgres.rs +++ b/proxy/src/auth/backend/postgres.rs @@ -1,8 +1,8 @@ //! Local mock of Cloud API V2. 
use super::{ - console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError}, - AuthSuccess, + console::{self, AuthInfo, GetAuthInfoError, WakeComputeError}, + AuthSuccess, NodeInfo, }; use crate::{ auth::{self, ClientCredentials}, @@ -12,7 +12,28 @@ use crate::{ stream::PqStream, url::ApiUrl, }; +use futures::TryFutureExt; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, info_span, warn, Instrument}; + +#[derive(Debug, Error)] +enum MockApiError { + #[error("Failed to read password: {0}")] + PasswordNotSet(tokio_postgres::Error), +} + +impl From for console::ApiError { + fn from(e: MockApiError) -> Self { + io_error(e).into() + } +} + +impl From for console::ApiError { + fn from(e: tokio_postgres::Error) -> Self { + io_error(e).into() + } +} #[must_use] pub(super) struct Api<'a> { @@ -20,10 +41,9 @@ pub(super) struct Api<'a> { creds: &'a ClientCredentials<'a>, } -// Helps eliminate graceless `.map_err` calls without introducing another ctor. -impl From for TransportError { - fn from(e: tokio_postgres::Error) -> Self { - io_error(e).into() +impl<'a> AsRef> for Api<'a> { + fn as_ref(&self) -> &ClientCredentials<'a> { + self.creds } } @@ -35,54 +55,55 @@ impl<'a> Api<'a> { /// Authenticate the existing user or throw an error. pub(super) async fn handle_user( - self, + &'a self, client: &mut PqStream, - ) -> auth::Result> { + ) -> auth::Result> { // We reuse user handling logic from a production module. - console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await + console::handle_user(client, self, Self::get_auth_info, Self::wake_compute).await } +} +impl Api<'_> { /// This implementation fetches the auth info from a local postgres instance. - async fn get_auth_info(&self) -> Result { - // Perhaps we could persist this connection, but then we'd have to - // write more code for reopening it if it got closed, which doesn't - // seem worth it. - let (client, connection) = - tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; + async fn get_auth_info(&self) -> Result, GetAuthInfoError> { + async { + // Perhaps we could persist this connection, but then we'd have to + // write more code for reopening it if it got closed, which doesn't + // seem worth it. + let (client, connection) = + tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; - tokio::spawn(connection); - let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; - let rows = client.query(query, &[&self.creds.user]).await?; + tokio::spawn(connection); + let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1"; + let rows = client.query(query, &[&self.creds.user]).await?; - match &rows[..] { - // We can't get a secret if there's no such user. - [] => Err(io_error(format!("unknown user '{}'", self.creds.user)).into()), + // We can get at most one row, because `rolname` is unique. + let row = match rows.get(0) { + Some(row) => row, + // This means that the user doesn't exist, so there can be no secret. + // However, this is still a *valid* outcome which is very similar + // to getting `404 Not found` from the Neon console. + None => { + warn!("user '{}' does not exist", self.creds.user); + return Ok(None); + } + }; - // We shouldn't get more than one row anyway. - [row, ..] 
=> { - let entry = row - .try_get("rolpassword") - .map_err(|e| io_error(format!("failed to read user's password: {e}")))?; + let entry = row + .try_get("rolpassword") + .map_err(MockApiError::PasswordNotSet)?; - scram::ServerSecret::parse(entry) - .map(AuthInfo::Scram) - .or_else(|| { - // It could be an md5 hash if it's not a SCRAM secret. - let text = entry.strip_prefix("md5")?; - Some(AuthInfo::Md5({ - let mut bytes = [0u8; 16]; - hex::decode_to_slice(text, &mut bytes).ok()?; - bytes - })) - }) - // Putting the secret into this message is a security hazard! - .ok_or(GetAuthInfoError::BadSecret) - } + info!("got a secret: {entry}"); // safe since it's not a prod scenario + let secret = scram::ServerSecret::parse(entry).map(AuthInfo::Scram); + Ok(secret.or_else(|| parse_md5(entry).map(AuthInfo::Md5))) } + .map_err(crate::error::log_error) + .instrument(info_span!("get_auth_info", mock = self.endpoint.as_str())) + .await } /// We don't need to wake anything locally, so we just return the connection info. - pub(super) async fn wake_compute(&self) -> Result { + pub async fn wake_compute(&self) -> Result { let mut config = compute::ConnCfg::new(); config .host(self.endpoint.host_str().unwrap_or("localhost")) @@ -90,6 +111,18 @@ impl<'a> Api<'a> { .dbname(self.creds.dbname) .user(self.creds.user); - Ok(config) + Ok(NodeInfo { + config, + aux: Default::default(), + }) } } + +fn parse_md5(input: &str) -> Option<[u8; 16]> { + let text = input.strip_prefix("md5")?; + + let mut bytes = [0u8; 16]; + hex::decode_to_slice(text, &mut bytes).ok()?; + + Some(bytes) +} diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 865af4d2e5..d9ee50894d 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -89,7 +89,7 @@ impl AuthFlow<'_, S, PasswordHack> { /// Stream wrapper for handling [SCRAM](crate::scram) auth. impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. - pub async fn authenticate(self) -> super::Result { + pub async fn authenticate(self) -> super::Result> { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -101,10 +101,10 @@ impl AuthFlow<'_, S, Scram<'_>> { } let secret = self.state.0; - let key = sasl::SaslStream::new(self.stream, sasl.message) + let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new(secret, rand::random, None)) .await?; - Ok(key) + Ok(outcome) } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4c5edb9673..094db73061 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,18 +8,17 @@ use tokio::net::TcpStream; use tokio_postgres::NoTls; use tracing::{error, info}; +const COULD_NOT_CONNECT: &str = "Could not connect to compute node"; + #[derive(Debug, Error)] pub enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. 
- #[error("Failed to connect to the compute node: {0}")] + #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] tokio_postgres::Error), - #[error("Failed to connect to the compute node")] - FailedToConnectToCompute, - - #[error("Failed to fetch compute node version")] - FailedToFetchPgVersion, + #[error("{COULD_NOT_CONNECT}: {0}")] + CouldNotConnect(#[from] io::Error), } impl UserFacingError for ConnectionError { @@ -29,10 +28,10 @@ impl UserFacingError for ConnectionError { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. Postgres(err) => match err.as_db_error() { - Some(err) => err.message().to_string(), + Some(err) => err.message().to_owned(), None => err.to_string(), }, - other => other.to_string(), + _ => COULD_NOT_CONNECT.to_owned(), } } } @@ -44,12 +43,12 @@ pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[repr(transparent)] -pub struct ConnCfg(pub tokio_postgres::Config); +pub struct ConnCfg(Box); impl ConnCfg { /// Construct a new connection config. pub fn new() -> Self { - Self(tokio_postgres::Config::new()) + Self(Default::default()) } } @@ -95,7 +94,7 @@ impl ConnCfg { io::ErrorKind::Other, format!( "couldn't connect: bad compute config, \ - ports and hosts entries' count does not match: {:?}", + ports and hosts entries' count does not match: {:?}", self.0 ), )); @@ -131,8 +130,8 @@ impl ConnCfg { pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: TcpStream, - /// PostgreSQL version of this instance. - pub version: String, + /// PostgreSQL connection parameters. + pub params: std::collections::HashMap, } impl ConnCfg { @@ -156,6 +155,7 @@ impl ConnCfg { self.0.application_name(app_name); } + // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { use tokio_postgres::config::ReplicationMode; match replication { @@ -172,22 +172,24 @@ impl ConnCfg { // TODO: extend the list of the forwarded startup parameters. // Currently, tokio-postgres doesn't allow us to pass // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). - let (socket_addr, mut stream) = self - .connect_raw() - .await - .map_err(|_| ConnectionError::FailedToConnectToCompute)?; - - // TODO: establish a secure connection to the DB - let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?; - let version = conn - .parameter("server_version") - .ok_or(ConnectionError::FailedToFetchPgVersion)? - .into(); - + // TODO: establish a secure connection to the DB. + let (socket_addr, mut stream) = self.connect_raw().await?; + let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?; info!("connected to user's compute node at {socket_addr}"); + + // This is very ugly but as of now there's no better way to + // extract the connection parameters from tokio-postgres' connection. + // TODO: solve this problem in a more elegant manner (e.g. the new library). + let params = connection.parameters; + + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. + // Yet another reason to rework the connection establishing code. 
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); - let db = PostgresConnection { stream, version }; + let db = PostgresConnection { stream, params }; Ok((db, cancel_closure)) } diff --git a/proxy/src/console.rs b/proxy/src/console.rs new file mode 100644 index 0000000000..78f09ac9e1 --- /dev/null +++ b/proxy/src/console.rs @@ -0,0 +1,5 @@ +///! Various stuff for dealing with the Neon Console. +///! Later we might move some API wrappers here. + +/// Payloads used in the console's APIs. +pub mod messages; diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs new file mode 100644 index 0000000000..63a97069b8 --- /dev/null +++ b/proxy/src/console/messages.rs @@ -0,0 +1,190 @@ +use serde::Deserialize; +use std::fmt; + +/// Generic error response with human-readable description. +/// Note that we can't always present it to user as is. +#[derive(Debug, Deserialize)] +pub struct ConsoleError { + pub error: Box, +} + +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/proxy_get_role_secret` API method. +#[derive(Deserialize)] +pub struct GetRoleSecret { + pub role_secret: Box, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for GetRoleSecret { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GetRoleSecret").finish_non_exhaustive() + } +} + +/// Response which holds compute node's `host:port` pair. +/// Returned by the `/proxy_wake_compute` API method. +#[derive(Debug, Deserialize)] +pub struct WakeCompute { + pub address: Box, + pub aux: MetricsAuxInfo, +} + +/// Async response which concludes the link auth flow. +/// Also known as `kickResponse` in the console. +#[derive(Debug, Deserialize)] +pub struct KickSession<'a> { + /// Session ID is assigned by the proxy. + pub session_id: &'a str, + + /// Compute node connection params. + #[serde(deserialize_with = "KickSession::parse_db_info")] + pub result: DatabaseInfo, +} + +impl KickSession<'_> { + fn parse_db_info<'de, D>(des: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + enum Wrapper { + // Currently, console only reports `Success`. + // `Failure(String)` used to be here... RIP. + Success(DatabaseInfo), + } + + Wrapper::deserialize(des).map(|x| match x { + Wrapper::Success(info) => info, + }) + } +} + +/// Compute node connection params. +#[derive(Deserialize)] +pub struct DatabaseInfo { + pub host: String, + pub port: u16, + pub dbname: String, + pub user: String, + /// Console always provides a password, but it might + /// be inconvenient for debug with local PG instance. + pub password: Option, + pub aux: MetricsAuxInfo, +} + +// Manually implement debug to omit sensitive info. +impl fmt::Debug for DatabaseInfo { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("DatabaseInfo") + .field("host", &self.host) + .field("port", &self.port) + .field("dbname", &self.dbname) + .field("user", &self.user) + .finish_non_exhaustive() + } +} + +/// Various labels for prometheus metrics. +/// Also known as `ProxyMetricsAuxInfo` in the console. +#[derive(Debug, Deserialize, Default)] +pub struct MetricsAuxInfo { + pub endpoint_id: Box, + pub project_id: Box, + pub branch_id: Box, +} + +impl MetricsAuxInfo { + /// Definitions of labels for traffic metric. + pub const TRAFFIC_LABELS: &'static [&'static str] = &[ + // Received (rx) / sent (tx). + "direction", + // ID of a project. + "project_id", + // ID of an endpoint within a project. 
+ "endpoint_id", + // ID of a branch within a project (snapshot). + "branch_id", + ]; + + /// Values of labels for traffic metric. + // TODO: add more type safety (validate arity & positions). + pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { + [ + direction, + &self.project_id, + &self.endpoint_id, + &self.branch_id, + ] + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn dummy_aux() -> serde_json::Value { + json!({ + "endpoint_id": "endpoint", + "project_id": "project", + "branch_id": "branch", + }) + } + + #[test] + fn parse_kick_session() -> anyhow::Result<()> { + // This is what the console's kickResponse looks like. + let json = json!({ + "session_id": "deadbeef", + "result": { + "Success": { + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + } + } + }); + let _: KickSession = serde_json::from_str(&json.to_string())?; + + Ok(()) + } + + #[test] + fn parse_db_info() -> anyhow::Result<()> { + // with password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + }))?; + + // without password + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "aux": dummy_aux(), + }))?; + + // new field (forward compatibility) + let _: DatabaseInfo = serde_json::from_value(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "project": "hello_world", + "N.E.W": "forward compatibility check", + "aux": dummy_aux(), + }))?; + + Ok(()) + } +} diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 0e376a37cd..f1cb44b1a8 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,15 @@ -use std::io; +use std::{error::Error as StdError, fmt, io}; + +/// Upcast (almost) any error into an opaque [`io::Error`]. +pub fn io_error(e: impl Into>) -> io::Error { + io::Error::new(io::ErrorKind::Other, e) +} + +/// A small combinator for pluggable error logging. +pub fn log_error(e: E) -> E { + tracing::error!("{e}"); + e +} /// Marks errors that may be safely shown to a client. /// This trait can be seen as a specialized version of [`ToString`]. @@ -6,7 +17,7 @@ use std::io; /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: ToString { +pub trait UserFacingError: fmt::Display { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -17,8 +28,3 @@ pub trait UserFacingError: ToString { self.to_string() } } - -/// Upcast (almost) any error into an opaque [`io::Error`]. -pub fn io_error(e: impl Into>) -> io::Error { - io::Error::new(io::ErrorKind::Other, e) -} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 6f9145678b..096a33d73d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -37,16 +37,6 @@ impl Endpoint { ) -> Result { self.client.execute(request).await } - - /// Execute a [request](reqwest::Request) and raise an error if status != 200. 
- pub async fn checked_execute( - &self, - request: reqwest::Request, - ) -> Result { - self.execute(request) - .await - .and_then(|r| r.error_for_status()) - } } #[cfg(test)] diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 2055616a6e..89ea9142a9 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -8,6 +8,7 @@ mod auth; mod cancellation; mod compute; mod config; +mod console; mod error; mod http; mod mgmt; @@ -28,6 +29,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::info; use utils::project_git_version; +use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); @@ -45,6 +47,9 @@ async fn main() -> anyhow::Result<()> { .with_target(false) .init(); + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[]); + let arg_matches = cli().get_matches(); let tls_config = match ( diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 23e10b5a9b..2e0a502e7f 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -1,7 +1,9 @@ -use crate::auth; +use crate::{ + auth, + console::messages::{DatabaseInfo, KickSession}, +}; use anyhow::Context; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use serde::Deserialize; use std::{ net::{TcpListener, TcpStream}, thread, @@ -50,59 +52,9 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> { pgbackend.run(&mut MgmtHandler) } -/// Known as `kickResponse` in the console. -#[derive(Debug, Deserialize)] -struct PsqlSessionResponse { - session_id: String, - result: PsqlSessionResult, -} - -#[derive(Debug, Deserialize)] -enum PsqlSessionResult { - Success(DatabaseInfo), - Failure(String), -} - /// A message received by `mgmt` when a compute node is ready. pub type ComputeReady = Result; -impl PsqlSessionResult { - fn into_compute_ready(self) -> ComputeReady { - match self { - Self::Success(db_info) => Ok(db_info), - Self::Failure(message) => Err(message), - } - } -} - -/// Compute node connection params provided by the console. -/// This struct and its parents are mgmt API implementation -/// detail and thus should remain in this module. -// TODO: restore deserialization tests from git history. -#[derive(Deserialize)] -pub struct DatabaseInfo { - pub host: String, - pub port: u16, - pub dbname: String, - pub user: String, - /// Console always provides a password, but it might - /// be inconvenient for debug with local PG instance. - pub password: Option, - pub project: String, -} - -// Manually implement debug to omit sensitive info. -impl std::fmt::Debug for DatabaseInfo { - fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result { - fmt.debug_struct("DatabaseInfo") - .field("host", &self.host) - .field("port", &self.port) - .field("dbname", &self.dbname) - .field("user", &self.user) - .finish_non_exhaustive() - } -} - // TODO: replace with an http-based protocol. 
struct MgmtHandler; impl postgres_backend::Handler for MgmtHandler { @@ -115,13 +67,13 @@ impl postgres_backend::Handler for MgmtHandler { } fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> { - let resp: PsqlSessionResponse = serde_json::from_str(query)?; + let resp: KickSession = serde_json::from_str(query)?; let span = info_span!("event", session_id = resp.session_id); let _enter = span.enter(); info!("got response: {:?}", resp.result); - match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) { + match auth::backend::notify(resp.session_id, Ok(resp.result)) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? @@ -135,43 +87,3 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<( Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn parse_db_info() -> anyhow::Result<()> { - // with password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "password": "password", - "project": "hello_world", - }))?; - - // without password - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - }))?; - - // new field (forward compatibility) - let _: DatabaseInfo = serde_json::from_value(json!({ - "host": "localhost", - "port": 5432, - "dbname": "postgres", - "user": "john_doe", - "project": "hello_world", - "N.E.W": "forward compatibility check", - }))?; - - Ok(()) - } -} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 411893fee5..382f7cd918 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Context}; use futures::TryFutureExt; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; -use pq_proto::{BeMessage as Be, *}; +use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{error, info, info_span, Instrument}; @@ -39,27 +39,11 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_io_bytes_per_client", "Number of bytes sent/received between client and backend.", - &[ - // Received (rx) / sent (tx). - "direction", - // Proxy can keep calling it `project` internally. - "endpoint_id" - ] + crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, ) .unwrap() }); -/// A small combinator for pluggable error logging. -async fn log_error(future: F) -> F::Output -where - F: std::future::Future>, -{ - future.await.map_err(|err| { - error!("{err}"); - err - }) -} - pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, @@ -80,7 +64,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); tokio::spawn( - log_error(async move { + async move { info!("spawned a task for {peer_addr}"); socket @@ -88,6 +72,10 @@ pub async fn task_main( .context("failed to set socket option")?; handle_client(config, &cancel_map, session_id, socket).await + } + .unwrap_or_else(|e| { + // Acknowledge that the task has finished with an error. 
+ error!("per-client task finished with an error: {e:#}"); }) .instrument(info_span!("client", session = format_args!("{session_id}"))), ); @@ -262,29 +250,32 @@ impl Client<'_, S> { // Note that we do this only (for the most part) after we've connected // to a compute (see above) which performs its own authentication. if !auth_result.reported_auth_ok { - stream - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())?; + stream.write_message_noflush(&Be::AuthenticationOk)?; + } + + // Forward all postgres connection params to the client. + // Right now the implementation is very hacky and inefficent (ideally, + // we don't need an intermediate hashmap), but at least it should be correct. + for (name, value) in &db.params { + // TODO: Theoretically, this could result in a big pile of params... + stream.write_message_noflush(&Be::ParameterStatus { + name: name.as_bytes(), + value: value.as_bytes(), + })?; } stream - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&db.version), - ))? .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) + .write_message(&Be::ReadyForQuery) .await?; - // TODO: add more identifiers. - let metric_id = node.project; - - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]); + let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("tx")); let mut client = MeasuredStream::new(stream.into_inner(), |cnt| { // Number of bytes we sent to the client (outbound). m_sent.inc_by(cnt as u64); }); - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]); + let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&node.aux.traffic_labels("rx")); let mut db = MeasuredStream::new(db.stream, |cnt| { // Number of bytes the client sent to the compute node (inbound). m_recv.inc_by(cnt as u64); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3d74dbae5a..ed429df421 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,6 +1,6 @@ ///! A group of high-level tests for connection establishing logic and auth. use super::*; -use crate::{auth, scram}; +use crate::{auth, sasl, scram}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; @@ -100,8 +100,7 @@ impl Scram { } fn mock(user: &str) -> Self { - let salt = rand::random::<[u8; 32]>(); - Scram(scram::ServerSecret::mock(user, &salt)) + Scram(scram::ServerSecret::mock(user, rand::random())) } } @@ -111,13 +110,17 @@ impl TestAuth for Scram { self, stream: &mut PqStream>, ) -> anyhow::Result<()> { - auth::AuthFlow::new(stream) + let outcome = auth::AuthFlow::new(stream) .begin(auth::Scram(&self.0)) .await? .authenticate() .await?; - Ok(()) + use sasl::Outcome::*; + match outcome { + Success(_) => Ok(()), + Failure(reason) => bail!("autentication failed with an error: {reason}"), + } } } @@ -136,8 +139,8 @@ async fn dummy_proxy( stream .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&BeMessage::ReadyForQuery) + .write_message_noflush(&Be::CLIENT_ENCODING)? 
+ .write_message(&Be::ReadyForQuery) .await?; Ok(()) diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 689fca6049..6d1dd9fba5 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -16,22 +16,19 @@ use thiserror::Error; pub use channel_binding::ChannelBinding; pub use messages::FirstMessage; -pub use stream::SaslStream; +pub use stream::{Outcome, SaslStream}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] pub enum Error { - #[error("Failed to authenticate client: {0}")] - AuthenticationFailed(&'static str), - #[error("Channel binding failed: {0}")] ChannelBindingFailed(&'static str), #[error("Unsupported channel binding method: {0}")] ChannelBindingBadMethod(Box), - #[error("Bad client message")] - BadClientMessage, + #[error("Bad client message: {0}")] + BadClientMessage(&'static str), #[error(transparent)] Io(#[from] io::Error), @@ -41,8 +38,6 @@ impl UserFacingError for Error { fn to_string_client(&self) -> String { use Error::*; match self { - // This constructor contains the reason why auth has failed. - AuthenticationFailed(s) => s.to_string(), // TODO: add support for channel binding ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(), ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"), @@ -55,11 +50,14 @@ impl UserFacingError for Error { pub type Result = std::result::Result; /// A result of one SASL exchange. +#[must_use] pub enum Step { /// We should continue exchanging messages. - Continue(T), + Continue(T, String), /// The client has been authenticated successfully. - Authenticated(R), + Success(R, String), + /// Authentication failed (reason attached). + Failure(&'static str), } /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait. @@ -69,5 +67,5 @@ pub trait Mechanism: Sized { /// Produce a server challenge to be sent to the client. /// This is how this method is called in PostgreSQL (`libpq/sasl.h`). - fn exchange(self, input: &str) -> Result<(Step, String)>; + fn exchange(self, input: &str) -> Result>; } diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index 0e782c5f29..b24cc4bf44 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -48,28 +48,41 @@ impl SaslStream<'_, S> { } } +/// SASL authentication outcome. +/// It's much easier to match on those two variants +/// than to peek into a noisy protocol error type. +#[must_use = "caller must explicitly check for success"] +pub enum Outcome { + /// Authentication succeeded and produced some value. + Success(R), + /// Authentication failed (reason attached). + Failure(&'static str), +} + impl SaslStream<'_, S> { /// Perform SASL message exchange according to the underlying algorithm /// until user is either authenticated or denied access. 
pub async fn authenticate( mut self, mut mechanism: M, - ) -> super::Result { + ) -> super::Result> { loop { let input = self.recv().await?; - let (moved, reply) = mechanism.exchange(input)?; + let step = mechanism.exchange(input)?; - use super::Step::*; - match moved { - Continue(moved) => { + use super::Step; + return Ok(match step { + Step::Continue(moved_mechanism, reply) => { self.send(&ServerMessage::Continue(&reply)).await?; - mechanism = moved; + mechanism = moved_mechanism; + continue; } - Authenticated(result) => { + Step::Success(result, reply) => { self.send(&ServerMessage::Final(&reply)).await?; - return Ok(result); + Outcome::Success(result) } - } + Step::Failure(reason) => Outcome::Failure(reason), + }); } } } diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index fca5585b25..882769a70d 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -64,12 +64,12 @@ impl<'a> Exchange<'a> { impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; - fn exchange(mut self, input: &str) -> sasl::Result<(sasl::Step, String)> { + fn exchange(mut self, input: &str) -> sasl::Result> { use {sasl::Step::*, ExchangeState::*}; match &self.state { Initial => { - let client_first_message = - ClientFirstMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + let client_first_message = ClientFirstMessage::parse(input) + .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?; let server_first_message = client_first_message.build_server_first_message( &(self.nonce)(), @@ -84,15 +84,15 @@ impl sasl::Mechanism for Exchange<'_> { server_first_message, }; - Ok((Continue(self), msg)) + Ok(Continue(self, msg)) } SaltSent { cbind_flag, client_first_message_bare, server_first_message, } => { - let client_final_message = - ClientFinalMessage::parse(input).ok_or(SaslError::BadClientMessage)?; + let client_final_message = ClientFinalMessage::parse(input) + .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?; let channel_binding = cbind_flag.encode(|_| { self.cert_digest @@ -106,9 +106,7 @@ impl sasl::Mechanism for Exchange<'_> { } if client_final_message.nonce != server_first_message.nonce() { - return Err(SaslError::AuthenticationFailed( - "combined nonce doesn't match", - )); + return Err(SaslError::BadClientMessage("combined nonce doesn't match")); } let signature_builder = SignatureBuilder { @@ -121,14 +119,15 @@ impl sasl::Mechanism for Exchange<'_> { .build(&self.secret.stored_key) .derive_client_key(&client_final_message.proof); - if client_key.sha256() != self.secret.stored_key { - return Err(SaslError::AuthenticationFailed("password doesn't match")); + // Auth fails either if keys don't match or it's pre-determined to fail. + if client_key.sha256() != self.secret.stored_key || self.secret.doomed { + return Ok(Failure("password doesn't match")); } let msg = client_final_message .build_server_final_message(signature_builder, &self.secret.server_key); - Ok((Authenticated(client_key), msg)) + Ok(Success(client_key, msg)) } } } diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 765aef4443..424beccec9 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -14,6 +14,9 @@ pub struct ServerSecret { pub stored_key: ScramKey, /// Used by client to verify server's signature. pub server_key: ScramKey, + /// Should auth fail no matter what? + /// This is exactly the case for mocked secrets. 
+ pub doomed: bool, } impl ServerSecret { @@ -30,6 +33,7 @@ impl ServerSecret { salt_base64: salt.to_owned(), stored_key: base64_decode_array(stored_key)?.into(), server_key: base64_decode_array(server_key)?.into(), + doomed: false, }; Some(secret) @@ -38,16 +42,16 @@ impl ServerSecret { /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - #[allow(dead_code)] - pub fn mock(user: &str, nonce: &[u8; 32]) -> Self { + pub fn mock(user: &str, nonce: [u8; 32]) -> Self { // Refer to `auth-scram.c : scram_mock_salt`. - let mocked_salt = super::sha256([user.as_bytes(), nonce]); + let mocked_salt = super::sha256([user.as_bytes(), &nonce]); Self { iterations: 4096, - salt_base64: base64::encode(&mocked_salt), + salt_base64: base64::encode(mocked_salt), stored_key: ScramKey::default(), server_key: ScramKey::default(), + doomed: true, } } @@ -64,9 +68,10 @@ impl ServerSecret { Some(Self { iterations, - salt_base64: base64::encode(&salt), + salt_base64: base64::encode(salt), stored_key: password.client_key().sha256(), server_key: password.server_key(), + doomed: false, }) } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8e4084775c..19e1479068 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -109,8 +109,9 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. + /// This method exists due to `&str` not implementing `Into`. pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { - // This method exists due to `&str` not implementing `Into` + tracing::info!("forwarding error to user: {error}"); self.write_message(&BeMessage::ErrorResponse(error)).await?; bail!(error) } @@ -122,6 +123,7 @@ impl PqStream { E: UserFacingError + Into, { let msg = error.to_string_client(); + tracing::info!("forwarding error to user: {msg}"); self.write_message(&BeMessage::ErrorResponse(&msg)).await?; bail!(error) } diff --git a/pyproject.toml b/pyproject.toml index b297f7f70b..b4fb7a9e7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,10 +32,11 @@ toml = "^0.10.2" psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" +pytest-httpserver = "^1.0.6" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" -mypy = "==0.971" +mypy = "==0.991" black = "^22.6.0" isort = "^5.10.1" @@ -60,10 +61,8 @@ skip = [ ] [tool.mypy] -# mypy uses regex exclude = "^vendor/" -# some tests don't typecheck when this flag is set -check_untyped_defs = false +check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 928a10e555..7ee14a8f41 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -4,7 +4,7 @@ # version, we can consider updating. # See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package, # we use "unstable" version number as the highest version used in the project by default. -channel = "1.62.1" # do update GitHub CI cache values for rust builds, when changing this value +channel = "1.62.1" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 658bdfe42c..fbcb3f34f7 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,11 +4,12 @@ version = "0.1.0" edition = "2021" [dependencies] +async-stream = "0.3" anyhow = "1.0" async-trait = "0.1" byteorder = "1.4.3" bytes = "1.0.1" -clap = "4.0" +clap = { version = "4.0", features = ["derive"] } const_format = "0.2.21" crc32c = "0.6.0" fs2 = "0.4.3" @@ -19,8 +20,8 @@ hyper = "0.14" nix = "0.25" once_cell = "1.13.0" parking_lot = "0.12.1" -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } regex = "1.4.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1" @@ -28,17 +29,17 @@ serde_with = "2.0" signal-hook = "0.3.10" thiserror = "1" tokio = { version = "1.17", features = ["macros", "fs"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } toml_edit = { version = "0.14", features = ["easy"] } tracing = "0.1.27" url = "2.2.2" -etcd_broker = { path = "../libs/etcd_broker" } metrics = { path = "../libs/metrics" } postgres_ffi = { path = "../libs/postgres_ffi" } pq_proto = { path = "../libs/pq_proto" } remote_storage = { path = "../libs/remote_storage" } safekeeper_api = { path = "../libs/safekeeper_api" } +storage_broker = { version = "0.1", path = "../storage_broker" } utils = { path = "../libs/utils" } workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 49e9e30cdc..5ad88276e8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -2,27 +2,28 @@ // Main entry point for the safekeeper executable // use anyhow::{bail, Context, Result}; -use clap::{value_parser, Arg, ArgAction, Command}; -use const_format::formatcp; -use nix::unistd::Pid; +use clap::Parser; use remote_storage::RemoteStorageConfig; +use toml_edit::Document; + use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::thread; +use std::time::Duration; +use storage_broker::Uri; use tokio::sync::mpsc; -use toml_edit::Document; + use tracing::*; -use url::{ParseError, Url}; -use utils::lock_file; +use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::broker; use safekeeper::control_file; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::remove_wal; @@ -30,153 +31,156 @@ use safekeeper::wal_backup; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; +use storage_broker::DEFAULT_ENDPOINT; use utils::auth::JwtAuth; use utils::{ http::endpoint, id::NodeId, logging::{self, LogFormat}, - 
project_git_version, signals, tcp_listener, + project_git_version, + sentry_init::{init_sentry, release_name}, + signals, tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; + project_git_version!(GIT_VERSION); -fn main() -> anyhow::Result<()> { - let arg_matches = cli().get_matches(); +const ABOUT: &str = r#" +A fleet of safekeepers is responsible for reliably storing WAL received from +compute, passing it through consensus (mitigating potential computes brain +split), and serving the hardened part further downstream to pageserver(s). +"#; - if let Some(addr) = arg_matches.get_one::("dump-control-file") { - let state = control_file::FileStorage::load_control_file(Path::new(addr))?; +#[derive(Parser)] +#[command(name = "Neon safekeeper", version = GIT_VERSION, about = ABOUT, long_about = None)] +struct Args { + /// Path to the safekeeper data directory. + #[arg(short = 'D', long, default_value = "./")] + datadir: PathBuf, + /// Safekeeper node id. + #[arg(long)] + id: Option, + /// Initialize safekeeper with given id and exit. + #[arg(long)] + init: bool, + /// Listen endpoint for receiving/sending WAL in the form host:port. + #[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)] + listen_pg: String, + /// Listen http endpoint for management and metrics in the form host:port. + #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] + listen_http: String, + /// Do not wait for changes to be written safely to disk. Unsafe. + #[arg(short, long)] + no_sync: bool, + /// Dump control file at path specified by this argument and exit. + #[arg(long)] + dump_control_file: Option, + /// Broker endpoint for storage nodes coordination in the form + /// http[s]://host:port. In case of https schema TLS is connection is + /// established; plaintext otherwise. + #[arg(long, default_value = DEFAULT_ENDPOINT, verbatim_doc_comment)] + broker_endpoint: Uri, + /// Broker keepalive interval. + #[arg(long, value_parser= humantime::parse_duration, default_value = storage_broker::DEFAULT_KEEPALIVE_INTERVAL)] + broker_keepalive_interval: Duration, + /// Peer safekeeper is considered dead after not receiving heartbeats from + /// it during this period passed as a human readable duration. + #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)] + heartbeat_timeout: Duration, + /// Remote storage configuration for WAL backup (offloading to s3) as TOML + /// inline table, e.g. + /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// Safekeeper offloads WAL to + /// [prefix_in_bucket/]//, mirroring + /// structure on the file system. + #[arg(long, value_parser = parse_remote_storage, verbatim_doc_comment)] + remote_storage: Option, + /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes + #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)] + max_offloader_lag: u64, + /// Number of threads for wal backup runtime, by default number of cores + /// available to the system. + #[arg(long)] + wal_backup_threads: Option, + /// Disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring + /// WAL backup horizon. + #[arg(long)] + disable_wal_backup: bool, + /// Path to an RSA .pem public key which is used to check JWT tokens. + #[arg(long)] + auth_validation_public_key_path: Option, + /// Format for logging, either 'plain' or 'json'. 
+ #[arg(long, default_value = "plain")] + log_format: String, +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + + if let Some(addr) = args.dump_control_file { + let state = control_file::FileStorage::load_control_file(addr)?; let json = serde_json::to_string(&state)?; print!("{json}"); return Ok(()); } - let mut conf = SafeKeeperConf::default(); - - if let Some(dir) = arg_matches.get_one::("datadir") { - // change into the data directory. - std::env::set_current_dir(dir)?; - } - - if arg_matches.get_flag("no-sync") { - conf.no_sync = true; - } - - if let Some(addr) = arg_matches.get_one::("listen-pg") { - conf.listen_pg_addr = addr.to_string(); - } - - if let Some(addr) = arg_matches.get_one::("listen-http") { - conf.listen_http_addr = addr.to_string(); - } - - let mut given_id = None; - if let Some(given_id_str) = arg_matches.get_one::("id") { - given_id = Some(NodeId( - given_id_str - .parse() - .context("failed to parse safekeeper id")?, - )); - } - - if let Some(addr) = arg_matches.get_one::("broker-endpoints") { - let collected_ep: Result, ParseError> = addr.split(',').map(Url::parse).collect(); - conf.broker_endpoints = collected_ep.context("Failed to parse broker endpoint urls")?; - } - if let Some(prefix) = arg_matches.get_one::("broker-etcd-prefix") { - conf.broker_etcd_prefix = prefix.to_string(); - } - - if let Some(heartbeat_timeout_str) = arg_matches.get_one::("heartbeat-timeout") { - conf.heartbeat_timeout = - humantime::parse_duration(heartbeat_timeout_str).with_context(|| { - format!( - "failed to parse heartbeat-timeout {}", - heartbeat_timeout_str - ) - })?; - } - - if let Some(backup_threads) = arg_matches.get_one::("wal-backup-threads") { - conf.backup_runtime_threads = backup_threads - .parse() - .with_context(|| format!("Failed to parse backup threads {}", backup_threads))?; - } - if let Some(storage_conf) = arg_matches.get_one::("remote-storage") { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {}", storage_conf); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - conf.remote_storage = Some(RemoteStorageConfig::from_toml(storage_conf_parsed_toml)?); - } - if let Some(max_offloader_lag_str) = arg_matches.get_one::("max-offloader-lag") { - conf.max_offloader_lag_bytes = max_offloader_lag_str.parse().with_context(|| { - format!( - "failed to parse max offloader lag {}", - max_offloader_lag_str - ) - })?; - } - // Seems like there is no better way to accept bool values explicitly in clap. 
- conf.wal_backup_enabled = arg_matches - .get_one::("enable-wal-backup") - .unwrap() - .parse() - .context("failed to parse bool enable-s3-offload bool")?; - - conf.auth_validation_public_key_path = arg_matches - .get_one::("auth-validation-public-key-path") - .map(PathBuf::from); - - if let Some(log_format) = arg_matches.get_one::("log-format") { - conf.log_format = LogFormat::from_config(log_format)?; - } - - start_safekeeper(conf, given_id, arg_matches.get_flag("init")) -} - -fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { - logging::init(conf.log_format)?; + logging::init(LogFormat::from_config(&args.log_format)?)?; info!("version: {GIT_VERSION}"); - // Prevent running multiple safekeepers on the same directory - let lock_file_path = conf.workdir.join(PID_FILE_NAME); - let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) { - lock_file::LockCreationResult::Created { - new_lock_contents, - file, - } => { - info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}"); - file - } - lock_file::LockCreationResult::AlreadyLocked { - existing_lock_contents, - } => anyhow::bail!( - "Could not lock pid file; safekeeper is already running in {:?} with PID {}", - conf.workdir, - existing_lock_contents - ), - lock_file::LockCreationResult::CreationFailed(e) => { - return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}"))) - } - }; - // ensure that the lock file is held even if the main thread of the process is panics - // we need to release the lock file only when the current process is gone - let _ = Box::leak(Box::new(lock_file)); + let args_workdir = &args.datadir; + let workdir = args_workdir.canonicalize().with_context(|| { + format!("Failed to get the absolute path for input workdir {args_workdir:?}") + })?; + + // Change into the data directory. + std::env::set_current_dir(&workdir)?; // Set or read our ID. 
- set_id(&mut conf, given_id)?; - if init { + let id = set_id(&workdir, args.id.map(NodeId))?; + if args.init { return Ok(()); } + let conf = SafeKeeperConf { + workdir, + my_id: id, + listen_pg_addr: args.listen_pg, + listen_http_addr: args.listen_http, + no_sync: args.no_sync, + broker_endpoint: args.broker_endpoint, + broker_keepalive_interval: args.broker_keepalive_interval, + heartbeat_timeout: args.heartbeat_timeout, + remote_storage: args.remote_storage, + max_offloader_lag_bytes: args.max_offloader_lag, + backup_runtime_threads: args.wal_backup_threads, + wal_backup_enabled: !args.disable_wal_backup, + auth_validation_public_key_path: args.auth_validation_public_key_path, + }; + + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + start_safekeeper(conf) +} + +fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { + // Prevent running multiple safekeepers on the same directory + let lock_file_path = conf.workdir.join(PID_FILE_NAME); + let lock_file = + pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?; + info!("claimed pid file at {lock_file_path:?}"); + + // ensure that the lock file is held even if the main thread of the process is panics + // we need to release the lock file only when the current process is gone + std::mem::forget(lock_file); + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e })?; - info!("Starting safekeeper on {}", conf.listen_pg_addr); + info!("starting safekeeper on {}", conf.listen_pg_addr); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); e @@ -184,11 +188,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let auth = match conf.auth_validation_public_key_path.as_ref() { None => { - info!("Auth is disabled"); + info!("auth is disabled"); None } Some(path) => { - info!("Loading JWT auth key from {}", path.display()); + info!("loading JWT auth key from {}", path.display()); Some(Arc::new( JwtAuth::from_key_path(path).context("failed to load the auth key")?, )) @@ -225,7 +229,7 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo let conf_cloned = conf.clone(); let safekeeper_thread = thread::Builder::new() - .name("Safekeeper thread".into()) + .name("safekeeper thread".into()) .spawn(|| { if let Err(e) = wal_service::thread_main(conf_cloned, pg_listener, auth) { info!("safekeeper thread terminated: {e}"); @@ -235,19 +239,15 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo threads.push(safekeeper_thread); - if !conf.broker_endpoints.is_empty() { - let conf_ = conf.clone(); - threads.push( - thread::Builder::new() - .name("broker thread".into()) - .spawn(|| { - // TODO: add auth? - broker::thread_main(conf_); - })?, - ); - } else { - warn!("No broker endpoints providing, starting without node sync") - } + let conf_ = conf.clone(); + threads.push( + thread::Builder::new() + .name("broker thread".into()) + .spawn(|| { + // TODO: add auth? 
+ broker::thread_main(conf_); + })?, + ); let conf_ = conf.clone(); threads.push( @@ -258,12 +258,11 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo })?, ); - let conf_ = conf.clone(); threads.push( thread::Builder::new() - .name("wal backup launcher thread".into()) + .name("WAL backup launcher thread".into()) .spawn(move || { - wal_backup::wal_backup_launcher_thread_main(conf_, wal_backup_launcher_rx); + wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx); })?, ); @@ -275,19 +274,19 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo signals.handle(|signal| { // TODO: implement graceful shutdown with joining threads etc info!( - "Got {}. Terminating in immediate shutdown mode", + "received {}, terminating in immediate shutdown mode", signal.name() ); - std::process::exit(111); + std::process::exit(0); }) } -/// Determine safekeeper id and set it in config. -fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { - let id_file_path = conf.workdir.join(ID_FILE_NAME); +/// Determine safekeeper id. +fn set_id(workdir: &Path, given_id: Option) -> Result { + let id_file_path = workdir.join(ID_FILE_NAME); let my_id: NodeId; - // If ID exists, read it in; otherwise set one passed + // If file with ID exists, read it in; otherwise set one passed. match fs::read(&id_file_path) { Ok(id_serialized) => { my_id = NodeId( @@ -314,118 +313,34 @@ fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { } else { bail!("safekeeper id is not specified"); }; - let mut f = File::create(&id_file_path)?; + let mut f = File::create(&id_file_path) + .with_context(|| format!("Failed to create id file at {id_file_path:?}"))?; f.write_all(my_id.to_string().as_bytes())?; f.sync_all()?; - info!("initialized safekeeper ID {}", my_id); + info!("initialized safekeeper id {}", my_id); } _ => { return Err(error.into()); } }, } - conf.my_id = my_id; - Ok(()) + Ok(my_id) } -fn cli() -> Command { - Command::new("Neon safekeeper") - .about("Store WAL stream to local file system and push it to WAL receivers") - .version(GIT_VERSION) - .arg( - Arg::new("datadir") - .short('D') - .long("dir") - .value_parser(value_parser!(PathBuf)) - .help("Path to the safekeeper data directory"), - ) - .arg( - Arg::new("init") - .long("init") - .action(ArgAction::SetTrue) - .help("Initialize safekeeper with ID"), - ) - .arg( - Arg::new("listen-pg") - .short('l') - .long("listen-pg") - .alias("listen") // for compatibility - .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")), - ) - .arg( - Arg::new("listen-http") - .long("listen-http") - .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")), - ) - // FIXME this argument is no longer needed since pageserver address is forwarded from compute. - // However because this argument is in use by console's e2e tests let's keep it for now and remove separately. - // So currently it is a noop. 
- .arg( - Arg::new("pageserver") - .short('p') - .long("pageserver"), - ) - .arg( - Arg::new("no-sync") - .short('n') - .long("no-sync") - .action(ArgAction::SetTrue) - .help("Do not wait for changes to be written safely to disk"), - ) - .arg( - Arg::new("dump-control-file") - .long("dump-control-file") - .help("Dump control file at path specified by this argument and exit"), - ) - .arg( - Arg::new("id").long("id").help("safekeeper node id: integer") - ).arg( - Arg::new("broker-endpoints") - .long("broker-endpoints") - .help("a comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'"), - ) - .arg( - Arg::new("broker-etcd-prefix") - .long("broker-etcd-prefix") - .help("a prefix to always use when polling/pusing data in etcd from this safekeeper"), - ) - .arg( - Arg::new("heartbeat-timeout") - .long("heartbeat-timeout") - .help(formatcp!("Peer is considered dead after not receiving heartbeats from it during this period (default {}s), passed as a human readable duration.", DEFAULT_HEARTBEAT_TIMEOUT.as_secs())) - ) - .arg( - Arg::new("wal-backup-threads").long("backup-threads").help(formatcp!("number of threads for wal backup (default {DEFAULT_WAL_BACKUP_RUNTIME_THREADS}")), - ).arg( - Arg::new("remote-storage") - .long("remote-storage") - .help("Remote storage configuration for WAL backup (offloading to s3) as TOML inline table, e.g. {\"max_concurrent_syncs\" = 17, \"max_sync_errors\": 13, \"bucket_name\": \"\", \"bucket_region\":\"\", \"concurrency_limit\": 119}.\nSafekeeper offloads WAL to [prefix_in_bucket/]//, mirroring structure on the file system.") - ) - .arg( - Arg::new("max-offloader-lag") - .long("max-offloader-lag") - .help(formatcp!("Safekeeper won't be elected for WAL offloading if it is lagging for more than this value (default {}MB) in bytes", DEFAULT_MAX_OFFLOADER_LAG_BYTES / (1 << 20))) - ) - .arg( - Arg::new("enable-wal-backup") - .long("enable-wal-backup") - .default_value("true") - .default_missing_value("true") - .help("Enable/disable WAL backup to s3. When disabled, safekeeper removes WAL ignoring WAL backup horizon."), - ) - .arg( - Arg::new("auth-validation-public-key-path") - .long("auth-validation-public-key-path") - .help("Path to an RSA .pem public key which is used to check JWT tokens") - ) - .arg( - Arg::new("log-format") - .long("log-format") - .help("Format for logging, either 'plain' or 'json'") - ) +// Parse RemoteStorage from TOML table. +fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { + // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse + let storage_conf_toml = format!("remote_storage = {storage_conf}"); + let parsed_toml = storage_conf_toml.parse::()?; // parse + let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again + RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { + // XXX: Don't print the original toml here, there might be some sensitive data + parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") + }) } #[test] fn verify_cli() { - cli().debug_assert(); + use clap::CommandFactory; + Args::command().debug_assert() } diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 76135241b9..92f35bf51f 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -1,15 +1,18 @@ -//! Communication with etcd, providing safekeeper peers and pageserver coordination. +//! 
Communication with the broker, providing safekeeper peers and pageserver coordination. +use anyhow::anyhow; +use anyhow::bail; use anyhow::Context; + use anyhow::Error; use anyhow::Result; -use etcd_broker::subscription_value::SkTimelineInfo; -use etcd_broker::LeaseKeepAliveStream; -use etcd_broker::LeaseKeeper; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::HashSet; +use storage_broker::parse_proto_ttid; +use storage_broker::proto::broker_service_client::BrokerServiceClient; +use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::Request; + use std::time::Duration; use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; @@ -17,15 +20,9 @@ use tracing::*; use crate::GlobalTimelines; use crate::SafeKeeperConf; -use etcd_broker::{ - subscription_key::{OperationKind, SkOperationKind, SubscriptionKey}, - Client, PutOptions, -}; -use utils::id::{NodeId, TenantTimelineId}; const RETRY_INTERVAL_MSEC: u64 = 1000; const PUSH_INTERVAL_MSEC: u64 = 1000; -const LEASE_TTL_SEC: i64 = 10; pub fn thread_main(conf: SafeKeeperConf) { let runtime = runtime::Builder::new_current_thread() @@ -34,158 +31,70 @@ pub fn thread_main(conf: SafeKeeperConf) { .unwrap(); let _enter = info_span!("broker").entered(); - info!("started, broker endpoints {:?}", conf.broker_endpoints); + info!("started, broker endpoint {:?}", conf.broker_endpoint); runtime.block_on(async { main_loop(conf).await; }); } -/// Key to per timeline per safekeeper data. -fn timeline_safekeeper_path( - broker_etcd_prefix: String, - ttid: TenantTimelineId, - sk_id: NodeId, -) -> String { - format!( - "{}/{sk_id}", - SubscriptionKey::sk_timeline_info(broker_etcd_prefix, ttid).watch_key() - ) -} - -async fn push_sk_info( - ttid: TenantTimelineId, - mut client: Client, - key: String, - sk_info: SkTimelineInfo, - mut lease: Lease, -) -> anyhow::Result<(TenantTimelineId, Lease)> { - let put_opts = PutOptions::new().with_lease(lease.id); - client - .put( - key.clone(), - serde_json::to_string(&sk_info)?, - Some(put_opts), - ) - .await - .with_context(|| format!("failed to push safekeeper info to {}", key))?; - - // revive the lease - lease - .keeper - .keep_alive() - .await - .context("failed to send LeaseKeepAliveRequest")?; - lease - .ka_stream - .message() - .await - .context("failed to receive LeaseKeepAliveResponse")?; - - Ok((ttid, lease)) -} - -struct Lease { - id: i64, - keeper: LeaseKeeper, - ka_stream: LeaseKeepAliveStream, -} - /// Push once in a while data about all active timelines to the broker. async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { - let mut client = Client::connect(&conf.broker_endpoints, None).await?; - let mut leases: HashMap = HashMap::new(); - + let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); - loop { - // Note: we lock runtime here and in timeline methods as GlobalTimelines - // is under plain mutex. That's ok, all this code is not performance - // sensitive and there is no risk of deadlock as we don't await while - // lock is held. - let mut active_tlis = GlobalTimelines::get_all(); - active_tlis.retain(|tli| tli.is_active()); - let active_tlis_set: HashSet = - active_tlis.iter().map(|tli| tli.ttid).collect(); - - // // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data. 
- for tli in &active_tlis { - if let Entry::Vacant(v) = leases.entry(tli.ttid) { - let lease = client.lease_grant(LEASE_TTL_SEC, None).await?; - let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?; - v.insert(Lease { - id: lease.id(), - keeper, - ka_stream, - }); - } - } - leases.retain(|ttid, _| active_tlis_set.contains(ttid)); - - // Push data concurrently to not suffer from latency, with many timelines it can be slow. - let handles = active_tlis - .iter() - .map(|tli| { + let outbound = async_stream::stream! { + loop { + // Note: we lock runtime here and in timeline methods as GlobalTimelines + // is under plain mutex. That's ok, all this code is not performance + // sensitive and there is no risk of deadlock as we don't await while + // lock is held. + let mut active_tlis = GlobalTimelines::get_all(); + active_tlis.retain(|tli| tli.is_active()); + for tli in &active_tlis { let sk_info = tli.get_safekeeper_info(&conf); - let key = - timeline_safekeeper_path(conf.broker_etcd_prefix.clone(), tli.ttid, conf.my_id); - let lease = leases.remove(&tli.ttid).unwrap(); - tokio::spawn(push_sk_info(tli.ttid, client.clone(), key, sk_info, lease)) - }) - .collect::>(); - for h in handles { - let (ttid, lease) = h.await??; - // It is ugly to pull leases from hash and then put it back, but - // otherwise we have to resort to long living per tli tasks (which - // would generate a lot of errors when etcd is down) as task wants to - // have 'static objects, we can't borrow to it. - leases.insert(ttid, lease); + yield sk_info; + } + sleep(push_interval).await; } - - sleep(push_interval).await; - } + }; + client + .publish_safekeeper_info(Request::new(outbound)) + .await?; + Ok(()) } /// Subscribe and fetch all the interesting data from the broker. async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { - let mut client = Client::connect(&conf.broker_endpoints, None).await?; + let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; - let mut subscription = etcd_broker::subscribe_for_values( - &mut client, - SubscriptionKey::all(conf.broker_etcd_prefix.clone()), - |full_key, value_str| { - if full_key.operation == OperationKind::Safekeeper(SkOperationKind::TimelineInfo) { - match serde_json::from_str::(value_str) { - Ok(new_info) => return Some(new_info), - Err(e) => { - error!("Failed to parse timeline info from value str '{value_str}': {e}") - } - } - } - None - }, - ) - .await - .context("failed to subscribe for safekeeper info")?; - loop { - match subscription.value_updates.recv().await { - Some(new_info) => { - // note: there are blocking operations below, but it's considered fine for now - if let Ok(tli) = GlobalTimelines::get(new_info.key.id) { - // Note that we also receive *our own* info. That's - // important, as it is used as an indication of live - // connection to the broker. - tli.record_safekeeper_info(&new_info.value, new_info.key.node_id) - .await? - } - } - None => { - // XXX it means we lost connection with etcd, error is consumed inside sub object - debug!("timeline updates sender closed, aborting the pull loop"); - return Ok(()); - } + // TODO: subscribe only to local timelines instead of all + let request = SubscribeSafekeeperInfoRequest { + subscription_key: Some(ProtoSubscriptionKey::All(())), + }; + + let mut stream = client + .subscribe_safekeeper_info(request) + .await + .context("subscribe_safekeper_info request failed")? + .into_inner(); + + while let Some(msg) = stream.message().await? 
{ + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // Note that we also receive *our own* info. That's + // important, as it is used as an indication of live + // connection to the broker. + + // note: there are blocking operations below, but it's considered fine for now + tli.record_safekeeper_info(&msg).await? } } + bail!("end of stream"); } async fn main_loop(conf: SafeKeeperConf) { diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 6be3f9abb2..ba5e453e41 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -231,7 +231,7 @@ mod test { let workdir = tempfile::tempdir().unwrap().into_path(); SafeKeeperConf { workdir, - ..Default::default() + ..SafeKeeperConf::dummy() } } @@ -239,7 +239,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, FileStorage::load_control_file_conf(conf, ttid)?, @@ -250,7 +250,7 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(&conf.timeline_dir(ttid)).expect("failed to create timeline dir"); + fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); let storage = FileStorage::create_new(ttid, conf, state.clone())?; Ok((storage, state)) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9343611959..a9a9eb3388 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -3,11 +3,14 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use anyhow::Context; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::SkTimelineInfo; use serde::Serialize; use serde::Serializer; use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::Arc; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::task::JoinError; use crate::safekeeper::ServerInfo; @@ -16,7 +19,6 @@ use crate::safekeeper::Term; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; -use etcd_broker::subscription_value::SkTimelineInfo; use utils::{ auth::JwtAuth, http::{ @@ -241,7 +243,22 @@ async fn record_safekeeper_info(mut request: Request) -> Result) -> Result, - pub backup_runtime_threads: usize, - pub wal_backup_enabled: bool, - pub my_id: NodeId, - pub broker_endpoints: Vec, - pub broker_etcd_prefix: String, - pub auth_validation_public_key_path: Option, + pub no_sync: bool, + pub broker_endpoint: Uri, + pub broker_keepalive_interval: Duration, pub heartbeat_timeout: Duration, + pub remote_storage: Option, pub max_offloader_lag_bytes: u64, - pub log_format: LogFormat, + pub backup_runtime_threads: Option, + pub wal_backup_enabled: bool, + pub auth_validation_public_key_path: Option, } impl SafeKeeperConf { @@ -81,26 +71,25 @@ impl SafeKeeperConf { } } -impl Default for SafeKeeperConf { - fn default() -> Self { +impl SafeKeeperConf { + #[cfg(test)] + fn dummy() -> Self { SafeKeeperConf { - // Always set to './'. 
We will chdir into the directory specified on the - // command line, so that when the server is running, all paths are relative - // to that. workdir: PathBuf::from("./"), no_sync: false, listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), remote_storage: None, my_id: NodeId(0), - broker_endpoints: Vec::new(), - broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(), - backup_runtime_threads: DEFAULT_WAL_BACKUP_RUNTIME_THREADS, + broker_endpoint: storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint"), + broker_keepalive_interval: Duration::from_secs(5), + backup_runtime_threads: None, wal_backup_enabled: true, auth_validation_public_key_path: None, - heartbeat_timeout: DEFAULT_HEARTBEAT_TIMEOUT, - max_offloader_lag_bytes: DEFAULT_MAX_OFFLOADER_LAG_BYTES, - log_format: LogFormat::Plain, + heartbeat_timeout: Duration::new(5, 0), + max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index d4d3d37737..b21770686c 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -425,7 +425,7 @@ impl Collector for TimelineCollector { .set(tli.num_computes as i64); self.acceptor_term .with_label_values(labels) - .set(tli.persisted_state.acceptor_state.term as u64); + .set(tli.persisted_state.acceptor_state.term); self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 6577e8c4d6..be7f071abb 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -52,7 +52,7 @@ impl<'pg> ReceiveWalConn<'pg> { /// Receive WAL from wal_proposer pub fn run(&mut self, spg: &mut SafekeeperPostgresHandler) -> Result<()> { - let _enter = info_span!("WAL acceptor", timeline = %spg.timeline_id.unwrap()).entered(); + let _enter = info_span!("WAL acceptor", ttid = %spg.ttid).entered(); // Notify the libpq client that it's allowed to send `CopyData` messages self.pg_backend @@ -69,7 +69,7 @@ impl<'pg> ReceiveWalConn<'pg> { let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( - "start handshake with wal proposer {} sysid {} timeline {}", + "start handshake with walproposer {} sysid {} timeline {}", self.peer_addr, greeting.system_id, greeting.tli, ); let server_info = ServerInfo { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 7dfa6f636e..fa973a3ede 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,13 +4,13 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use etcd_broker::subscription_value::SkTimelineInfo; use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; +use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -182,7 +182,7 @@ pub struct SafeKeeperState { /// All WAL segments next to one containing local_start_lsn are /// filled with data from the beginning. pub local_start_lsn: Lsn, - /// Part of WAL acknowledged by quorum and available locally. Always points + /// Part of WAL acknowledged by quorum *and available locally*. Always points /// to record boundary. pub commit_lsn: Lsn, /// LSN that points to the end of the last backed up segment. 
Useful to @@ -501,10 +501,6 @@ impl AcceptorProposerMessage { /// - messages from compute (proposers) and provides replies /// - messages from broker peers pub struct SafeKeeper { - /// Maximum commit_lsn between all nodes, can be ahead of local flush_lsn. - /// Note: be careful to set only if we are sure our WAL (term history) matches - /// committed one. - pub global_commit_lsn: Lsn, /// LSN since the proposer safekeeper currently talking to appends WAL; /// determines epoch switch point. pub epoch_start_lsn: Lsn, @@ -537,7 +533,6 @@ where } Ok(SafeKeeper { - global_commit_lsn: state.commit_lsn, epoch_start_lsn: Lsn(0), inmem: SafekeeperMemState { commit_lsn: state.commit_lsn, @@ -639,10 +634,12 @@ where // system_id will be updated on mismatch if self.state.server.system_id != msg.system_id { - warn!( - "unexpected system ID arrived, got {}, expected {}", - msg.system_id, self.state.server.system_id - ); + if self.state.server.system_id != 0 { + warn!( + "unexpected system ID arrived, got {}, expected {}", + msg.system_id, self.state.server.system_id + ); + } let mut state = self.state.clone(); state.server.system_id = msg.system_id; @@ -653,8 +650,9 @@ where } info!( - "processed greeting from proposer {:?}, sending term {:?}", - msg.proposer_id, self.state.acceptor_state.term + "processed greeting from walproposer {}, sending term {:?}", + msg.proposer_id.map(|b| format!("{:X}", b)).join(""), + self.state.acceptor_state.term ); Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting { term: self.state.acceptor_state.term, @@ -727,6 +725,24 @@ where return Ok(None); } + // This might happen in a rare race when another (old) connection from + // the same walproposer writes + flushes WAL after this connection + // already sent flush_lsn in VoteRequest. It is generally safe to + // proceed, but to prevent commit_lsn surprisingly going down we should + // either refuse the session (simpler) or skip the part we already have + // from the stream (can be implemented). + if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", + msg.term, self.flush_lsn(), msg.start_streaming_at) + } + // Otherwise this shouldn't happen. + assert!( + msg.start_streaming_at >= self.inmem.commit_lsn, + "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", + msg.start_streaming_at, + self.inmem.commit_lsn + ); + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -759,7 +775,6 @@ where // NB: on new clusters, this happens at the same time as // timeline_start_lsn initialization, it is taken outside to provide // upgrade. - self.global_commit_lsn = max(self.global_commit_lsn, state.timeline_start_lsn); self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. @@ -778,10 +793,21 @@ where Ok(None) } - /// Advance commit_lsn taking into account what we have locally - fn update_commit_lsn(&mut self) -> Result<()> { - let commit_lsn = min(self.global_commit_lsn, self.flush_lsn()); - assert!(commit_lsn >= self.inmem.commit_lsn); + /// Advance commit_lsn taking into account what we have locally. + /// + /// Note: it is assumed that 'WAL we have is from the right term' check has + /// already been done outside. 
+ fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { + // Both peers and walproposer communicate this value, we might already + // have a fresher (higher) version. + candidate = max(candidate, self.inmem.commit_lsn); + let commit_lsn = min(candidate, self.flush_lsn()); + assert!( + commit_lsn >= self.inmem.commit_lsn, + "commit_lsn monotonicity violated: old={} new={}", + self.inmem.commit_lsn, + commit_lsn + ); self.inmem.commit_lsn = commit_lsn; @@ -847,14 +873,11 @@ where self.wal_store.flush_wal()?; } - // Update global_commit_lsn + // Update commit_lsn. if msg.h.commit_lsn != Lsn(0) { - // We also obtain commit lsn from peers, so value arrived here might be stale (less) - self.global_commit_lsn = max(self.global_commit_lsn, msg.h.commit_lsn); + self.update_commit_lsn(msg.h.commit_lsn)?; } - self.inmem.peer_horizon_lsn = msg.h.truncate_lsn; - self.update_commit_lsn()?; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only @@ -886,49 +909,43 @@ where /// Flush WAL to disk. Return AppendResponse with latest LSNs. fn handle_flush(&mut self) -> Result> { self.wal_store.flush_wal()?; - - // commit_lsn can be updated because we have new flushed data locally. - self.update_commit_lsn()?; - Ok(Some(AcceptorProposerMessage::AppendResponse( self.append_response(), ))) } /// Update timeline state with peer safekeeper data. - pub fn record_safekeeper_info(&mut self, sk_info: &SkTimelineInfo) -> Result<()> { + pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let mut sync_control_file = false; - if let (Some(commit_lsn), Some(last_log_term)) = (sk_info.commit_lsn, sk_info.last_log_term) - { + + if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. 
- if last_log_term == self.get_epoch() { - self.global_commit_lsn = max(commit_lsn, self.global_commit_lsn); - self.update_commit_lsn()?; + if sk_info.last_log_term == self.get_epoch() { + self.update_commit_lsn(Lsn(sk_info.commit_lsn))?; } } - if let Some(backup_lsn) = sk_info.backup_lsn { - let new_backup_lsn = max(backup_lsn, self.inmem.backup_lsn); - sync_control_file |= - self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; - self.inmem.backup_lsn = new_backup_lsn; - } - if let Some(remote_consistent_lsn) = sk_info.remote_consistent_lsn { - let new_remote_consistent_lsn = - max(remote_consistent_lsn, self.inmem.remote_consistent_lsn); - sync_control_file |= self.state.remote_consistent_lsn - + (self.state.server.wal_seg_size as u64) - < new_remote_consistent_lsn; - self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; - } - if let Some(peer_horizon_lsn) = sk_info.peer_horizon_lsn { - let new_peer_horizon_lsn = max(peer_horizon_lsn, self.inmem.peer_horizon_lsn); - sync_control_file |= self.state.peer_horizon_lsn - + (self.state.server.wal_seg_size as u64) - < new_peer_horizon_lsn; - self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; - } + + let new_backup_lsn = max(Lsn(sk_info.backup_lsn), self.inmem.backup_lsn); + sync_control_file |= + self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; + self.inmem.backup_lsn = new_backup_lsn; + + let new_remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + self.inmem.remote_consistent_lsn, + ); + sync_control_file |= self.state.remote_consistent_lsn + + (self.state.server.wal_seg_size as u64) + < new_remote_consistent_lsn; + self.inmem.remote_consistent_lsn = new_remote_consistent_lsn; + + let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); + sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) + < new_peer_horizon_lsn; + self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; + if sync_control_file { self.persist_control_file(self.state.clone())?; } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 576a02c686..a054b8fe14 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -161,7 +161,7 @@ impl ReplicationConn { pgb: &mut PostgresBackend, mut start_pos: Lsn, ) -> Result<()> { - let _enter = info_span!("WAL sender", timeline = %spg.timeline_id.unwrap()).entered(); + let _enter = info_span!("WAL sender", ttid = %spg.ttid).entered(); let tli = GlobalTimelines::get(spg.ttid)?; @@ -226,6 +226,7 @@ impl ReplicationConn { let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn); let mut wal_reader = WalReader::new( + spg.conf.workdir.clone(), spg.conf.timeline_dir(&tli.ttid), &persisted_state, start_pos, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 132a926203..038c32afe0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,7 +2,6 @@ //! to glue together SafeKeeper and all other background services. 
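The rewritten `update_commit_lsn`/`record_safekeeper_info` logic above boils down to one rule: take the freshest commit LSN reported by peers or the walproposer, but never let `commit_lsn` exceed locally flushed WAL and never let it move backwards. A minimal, self-contained sketch of that rule, with plain `u64` standing in for the crate's `Lsn` newtype (the struct and field names here are illustrative, not the safekeeper's actual types):

```rust
/// Sketch of the commit_lsn advancement rule described above: merge the
/// candidate with what we already know, cap it by locally durable WAL,
/// and assert monotonicity.
struct MemState {
    commit_lsn: u64, // highest LSN known to be committed by a quorum
    flush_lsn: u64,  // highest LSN durably written to local disk
}

impl MemState {
    fn update_commit_lsn(&mut self, candidate: u64) {
        // A peer or the walproposer may report a stale value; keep the max.
        let candidate = candidate.max(self.commit_lsn);
        // Only WAL that is durable locally may be considered committed here.
        let new_commit_lsn = candidate.min(self.flush_lsn);
        assert!(
            new_commit_lsn >= self.commit_lsn,
            "commit_lsn monotonicity violated: old={} new={}",
            self.commit_lsn,
            new_commit_lsn
        );
        self.commit_lsn = new_commit_lsn;
    }
}

fn main() {
    let mut state = MemState { commit_lsn: 100, flush_lsn: 150 };
    state.update_commit_lsn(200); // capped by flush_lsn
    assert_eq!(state.commit_lsn, 150);
    state.update_commit_lsn(120); // stale candidate, no regression
    assert_eq!(state.commit_lsn, 150);
}
```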
use anyhow::{bail, Result}; -use etcd_broker::subscription_value::SkTimelineInfo; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; use pq_proto::ReplicationFeedback; @@ -18,6 +17,9 @@ use utils::{ lsn::Lsn, }; +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; + use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, SafekeeperMemState, ServerInfo, Term, @@ -47,13 +49,13 @@ pub struct PeerInfo { } impl PeerInfo { - fn from_sk_info(sk_id: NodeId, sk_info: &SkTimelineInfo, ts: Instant) -> PeerInfo { + fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { PeerInfo { - sk_id, - _last_log_term: sk_info.last_log_term.unwrap_or(0), - _flush_lsn: sk_info.flush_lsn.unwrap_or(Lsn::INVALID), - commit_lsn: sk_info.commit_lsn.unwrap_or(Lsn::INVALID), - local_start_lsn: sk_info.local_start_lsn.unwrap_or(Lsn::INVALID), + sk_id: NodeId(sk_info.safekeeper_id), + _last_log_term: sk_info.last_log_term, + _flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), ts, } } @@ -308,21 +310,31 @@ impl SharedState { pos } - fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { - SkTimelineInfo { - last_log_term: Some(self.sk.get_epoch()), - flush_lsn: Some(self.sk.wal_store.flush_lsn()), + fn get_safekeeper_info( + &self, + ttid: &TenantTimelineId, + conf: &SafeKeeperConf, + ) -> SafekeeperTimelineInfo { + SafekeeperTimelineInfo { + safekeeper_id: conf.my_id.0, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: ttid.tenant_id.as_ref().to_owned(), + timeline_id: ttid.timeline_id.as_ref().to_owned(), + }), + last_log_term: self.sk.get_epoch(), + flush_lsn: self.sk.wal_store.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: Some(self.sk.inmem.commit_lsn), + commit_lsn: self.sk.inmem.commit_lsn.0, // TODO: rework feedbacks to avoid max here - remote_consistent_lsn: Some(max( + remote_consistent_lsn: max( self.get_replicas_state().remote_consistent_lsn, self.sk.inmem.remote_consistent_lsn, - )), - peer_horizon_lsn: Some(self.sk.inmem.peer_horizon_lsn), - safekeeper_connstr: Some(conf.listen_pg_addr.clone()), - backup_lsn: Some(self.sk.inmem.backup_lsn), - local_start_lsn: Some(self.sk.state.local_start_lsn), + ) + .0, + peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0, + safekeeper_connstr: conf.listen_pg_addr.clone(), + backup_lsn: self.sk.inmem.backup_lsn.0, + local_start_lsn: self.sk.state.local_start_lsn.0, } } } @@ -682,23 +694,19 @@ impl Timeline { } /// Get safekeeper info for broadcasting to broker and other peers. - pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo { + pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { let shared_state = self.write_shared_state(); - shared_state.get_safekeeper_info(conf) + shared_state.get_safekeeper_info(&self.ttid, conf) } /// Update timeline state with peer safekeeper data. 
- pub async fn record_safekeeper_info( - &self, - sk_info: &SkTimelineInfo, - sk_id: NodeId, - ) -> Result<()> { + pub async fn record_safekeeper_info(&self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { let is_wal_backup_action_pending: bool; let commit_lsn: Lsn; { let mut shared_state = self.write_shared_state(); shared_state.sk.record_safekeeper_info(sk_info)?; - let peer_info = PeerInfo::from_sk_info(sk_id, sk_info, Instant::now()); + let peer_info = PeerInfo::from_sk_info(sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); is_wal_backup_action_pending = shared_state.update_status(self.ttid); commit_lsn = shared_state.sk.inmem.commit_lsn; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index a5d373a1da..fd5f010b3d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -20,14 +20,21 @@ use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, - conf: SafeKeeperConf, + conf: Option, } impl GlobalTimelinesState { + /// Get configuration, which must be set once during init. + fn get_conf(&self) -> &SafeKeeperConf { + self.conf + .as_ref() + .expect("GlobalTimelinesState conf is not initialized") + } + /// Get dependencies for a timeline constructor. fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { ( - self.conf.clone(), + self.get_conf().clone(), self.wal_backup_launcher_tx.as_ref().unwrap().clone(), ) } @@ -55,7 +62,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), wal_backup_launcher_tx: None, - conf: SafeKeeperConf::default(), + conf: None, }) }); @@ -71,12 +78,12 @@ impl GlobalTimelines { let mut state = TIMELINES_STATE.lock().unwrap(); assert!(state.wal_backup_launcher_tx.is_none()); state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); - state.conf = conf; + state.conf = Some(conf); // Iterate through all directories and load tenants for all directories // named as a valid tenant_id. let mut tenant_count = 0; - let tenants_dir = state.conf.workdir.clone(); + let tenants_dir = state.get_conf().workdir.clone(); for tenants_dir_entry in std::fs::read_dir(&tenants_dir) .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))? { @@ -111,7 +118,7 @@ impl GlobalTimelines { state: &mut MutexGuard, tenant_id: TenantId, ) -> Result<()> { - let timelines_dir = state.conf.tenant_dir(&tenant_id); + let timelines_dir = state.get_conf().tenant_dir(&tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))? { @@ -122,7 +129,7 @@ impl GlobalTimelines { { let ttid = TenantTimelineId::new(tenant_id, timeline_id); match Timeline::load_timeline( - state.conf.clone(), + state.get_conf().clone(), ttid, state.wal_backup_launcher_tx.as_ref().unwrap().clone(), ) { @@ -281,7 +288,11 @@ impl GlobalTimelines { } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. - let dir_path = TIMELINES_STATE.lock().unwrap().conf.timeline_dir(ttid); + let dir_path = TIMELINES_STATE + .lock() + .unwrap() + .get_conf() + .timeline_dir(ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -327,7 +338,13 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. 
In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?; + delete_dir( + TIMELINES_STATE + .lock() + .unwrap() + .get_conf() + .tenant_dir(tenant_id), + )?; let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); if !tlis_after_delete.is_empty() { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 0a43d6085c..fc971ca753 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -13,7 +13,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::runtime::Builder; @@ -37,8 +37,11 @@ pub fn wal_backup_launcher_thread_main( conf: SafeKeeperConf, wal_backup_launcher_rx: Receiver, ) { - let rt = Builder::new_multi_thread() - .worker_threads(conf.backup_runtime_threads) + let mut builder = Builder::new_multi_thread(); + if let Some(num_threads) = conf.backup_runtime_threads { + builder.worker_threads(num_threads); + } + let rt = builder .enable_all() .build() .expect("failed to create wal backup runtime"); @@ -151,7 +154,7 @@ async fn update_task( let timeline_dir = conf.timeline_dir(&ttid); let handle = tokio::spawn( - backup_task_main(ttid, timeline_dir, shutdown_rx) + backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx) .instrument(info_span!("WAL backup task", ttid = %ttid)), ); @@ -182,10 +185,10 @@ async fn wal_backup_launcher_main_loop( let conf_ = conf.clone(); REMOTE_STORAGE.get_or_init(|| { - conf_.remote_storage.as_ref().map(|c| { - GenericRemoteStorage::from_config(conf_.workdir, c) - .expect("failed to create remote storage") - }) + conf_ + .remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) }); // Presense in this map means launcher is aware s3 offloading is needed for @@ -234,6 +237,7 @@ async fn wal_backup_launcher_main_loop( struct WalBackupTask { timeline: Arc, timeline_dir: PathBuf, + workspace_dir: PathBuf, wal_seg_size: usize, commit_lsn_watch_rx: watch::Receiver, } @@ -242,6 +246,7 @@ struct WalBackupTask { async fn backup_task_main( ttid: TenantTimelineId, timeline_dir: PathBuf, + workspace_dir: PathBuf, mut shutdown_rx: Receiver<()>, ) { info!("started"); @@ -257,6 +262,7 @@ async fn backup_task_main( commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), timeline: tli, timeline_dir, + workspace_dir, }; // task is spinned up only when wal_seg_size already initialized @@ -321,6 +327,7 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, + &self.workspace_dir, ) .await { @@ -339,9 +346,7 @@ impl WalBackupTask { backup_lsn, commit_lsn, e ); - if retry_attempt < u32::MAX { - retry_attempt += 1; - } + retry_attempt = retry_attempt.saturating_add(1); } } } @@ -353,11 +358,12 @@ pub async fn backup_lsn_range( end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Path, + workspace_dir: &Path, ) -> Result { let mut res = start_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); for s in &segments { - backup_single_segment(s, timeline_dir) + backup_single_segment(s, timeline_dir, workspace_dir) .await .with_context(|| format!("offloading segno {}", s.seg_no))?; @@ -372,11 +378,24 @@ pub async fn backup_lsn_range( Ok(res) } -async fn backup_single_segment(seg: &Segment, 
timeline_dir: &Path) -> Result<()> { - let segment_file_name = seg.file_path(timeline_dir)?; +async fn backup_single_segment( + seg: &Segment, + timeline_dir: &Path, + workspace_dir: &Path, +) -> Result<()> { + let segment_file_path = seg.file_path(timeline_dir)?; + let remote_segment_path = segment_file_path + .strip_prefix(workspace_dir) + .context("Failed to strip workspace dir prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", + ) + })?; - backup_object(&segment_file_name, seg.size()).await?; - debug!("Backup of {} done", segment_file_name.display()); + backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?; + debug!("Backup of {} done", segment_file_path.display()); Ok(()) } @@ -426,7 +445,7 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { static REMOTE_STORAGE: OnceCell> = OnceCell::new(); -async fn backup_object(source_file: &Path, size: usize) -> Result<()> { +async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize) -> Result<()> { let storage = REMOTE_STORAGE .get() .expect("failed to get remote storage") @@ -441,12 +460,12 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> { })?); storage - .upload_storage_object(Box::new(file), size, source_file) + .upload_storage_object(Box::new(file), size, target_file) .await } pub async fn read_object( - file_path: PathBuf, + file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { let storage = REMOTE_STORAGE @@ -455,19 +474,13 @@ pub async fn read_object( .as_ref() .context("No remote storage configured")?; - info!( - "segment download about to start for local path {} at offset {}", - file_path.display(), - offset - ); + info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let download = storage - .download_storage_object(Some((offset, None)), &file_path) + .download_storage_object(Some((offset, None)), file_path) .await .with_context(|| { - format!( - "Failed to open WAL segment download stream for local path {}", - file_path.display() - ) + format!("Failed to open WAL segment download stream for remote path {file_path:?}") })?; Ok(download.download_stream) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index bc5e2d7b24..41457868fe 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -8,6 +8,7 @@ //! Note that last file has `.partial` suffix, that's different from postgres. use anyhow::{bail, Context, Result}; +use remote_storage::RemotePath; use std::io::{self, Seek, SeekFrom}; use std::pin::Pin; @@ -222,7 +223,7 @@ impl PhysicalStorage { // Rename partial file to completed file let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_partial_path, &wal_file_path)?; + fs::rename(wal_file_partial_path, wal_file_path)?; } else { // otherwise, file can be reused later self.file = Some(file); @@ -248,7 +249,7 @@ impl PhysicalStorage { while !buf.is_empty() { // Extract WAL location for this block - let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.write_lsn.segment_offset(self.wal_seg_size); let segno = self.write_lsn.segment_number(self.wal_seg_size); // If crossing a WAL boundary, only write up until we reach wal segment size. 
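The `wal_backup.rs` changes above derive the remote object name by stripping the workspace directory prefix from the local segment path, so the bucket layout mirrors the tenant/timeline layout on disk. A rough standalone illustration using only `std::path`; the real code additionally wraps the relative path in `remote_storage::RemotePath` and attaches error context, and the concrete paths below are made up for the example:

```rust
use std::path::{Path, PathBuf};

/// Map a local WAL segment path to its remote counterpart by stripping the
/// workspace prefix, mirroring what backup_single_segment() does above.
/// Error handling is simplified to an Option for the sketch.
fn remote_segment_path(segment_file_path: &Path, workspace_dir: &Path) -> Option<PathBuf> {
    segment_file_path
        .strip_prefix(workspace_dir)
        .ok()
        .map(Path::to_path_buf)
}

fn main() {
    // Hypothetical layout: <workdir>/<tenant_id>/<timeline_id>/<segment>.
    let workspace = Path::new("/data/safekeeper");
    let segment = Path::new("/data/safekeeper/tenant_a/timeline_b/000000010000000000000001");
    assert_eq!(
        remote_segment_path(segment, workspace),
        Some(PathBuf::from("tenant_a/timeline_b/000000010000000000000001"))
    );
}
```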
@@ -365,7 +366,7 @@ impl Storage for PhysicalStorage { self.fdatasync_file(&mut unflushed_file)?; } - let xlogoff = end_pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = end_pos.segment_offset(self.wal_seg_size); let segno = end_pos.segment_number(self.wal_seg_size); // Remove all segments after the given LSN. @@ -382,7 +383,7 @@ impl Storage for PhysicalStorage { // Make segment partial once again let (wal_file_path, wal_file_partial_path) = wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?; - fs::rename(&wal_file_path, &wal_file_partial_path)?; + fs::rename(wal_file_path, wal_file_partial_path)?; } // Update LSNs @@ -415,7 +416,7 @@ fn remove_segments_from_disk( let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; - for entry in fs::read_dir(&timeline_dir)? { + for entry in fs::read_dir(timeline_dir)? { let entry = entry?; let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); @@ -445,6 +446,7 @@ fn remove_segments_from_disk( } pub struct WalReader { + workdir: PathBuf, timeline_dir: PathBuf, wal_seg_size: usize, pos: Lsn, @@ -459,6 +461,7 @@ pub struct WalReader { impl WalReader { pub fn new( + workdir: PathBuf, timeline_dir: PathBuf, state: &SafeKeeperState, start_pos: Lsn, @@ -478,6 +481,7 @@ impl WalReader { } Ok(Self { + workdir, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -495,7 +499,7 @@ impl WalReader { // How much to read and send in message? We cannot cross the WAL file // boundary, and we don't want send more than provided buffer. - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let send_size = min(buf.len(), self.wal_seg_size - xlogoff); // Read some data from the file. @@ -514,7 +518,7 @@ impl WalReader { /// Open WAL segment at the current position of the reader. async fn open_segment(&self) -> Result>> { - let xlogoff = self.pos.segment_offset(self.wal_seg_size) as usize; + let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); let wal_file_path = self.timeline_dir.join(wal_file_name); @@ -545,7 +549,17 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - return read_object(wal_file_path, xlogoff as u64).await; + let remote_wal_file_path = wal_file_path + .strip_prefix(&self.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .with_context(|| { + format!( + "Failed to resolve remote part of path {:?} for base {:?}", + wal_file_path, self.workdir, + ) + })?; + return read_object(&remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 1734038661..d83a74ae14 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -318,14 +318,8 @@ def remote_consistent_lsn( detail = pageserver_http_client.timeline_detail(tenant, timeline) lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. 
- return 0 - else: - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) + assert isinstance(lsn_str, str) + return lsn_from_hex(lsn_str) def wait_for_upload( @@ -448,15 +442,15 @@ def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - conn = psycopg2.connect(pageserver_connstr) - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - prev_lsn = res[0] - last_lsn = res[1] - conn.close() + with closing(psycopg2.connect(pageserver_connstr)) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" + cur.execute(cmd) + res = cur.fetchone() + assert res is not None + prev_lsn = res[0] + last_lsn = res[1] return last_lsn, prev_lsn diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 843fc53f36..7aa33a5234 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -7,9 +7,11 @@ edition = "2021" bench = [] [dependencies] +anyhow = "1.0" async-stream = "0.3" bytes = "1.0" clap = { version = "4.0", features = ["derive"] } +const_format = "0.2.21" futures = "0.3" futures-core = "0.3" futures-util = "0.3" @@ -19,7 +21,7 @@ hyper = {version = "0.14.14", features = ["full"]} once_cell = "1.13.0" parking_lot = "0.12" prost = "0.11" -tonic = "0.8" +tonic = {version = "0.8", features = ["tls", "tls-roots"]} tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] } tokio-stream = "0.1" tracing = "0.1.27" diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 0a72adc948..f3544a7cb8 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -6,8 +6,8 @@ use clap::Parser; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; -use storage_broker::BrokerClientChannel; -use storage_broker::DEFAULT_LISTEN_ADDR; + +use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; use tonic::Request; @@ -88,9 +88,7 @@ fn tli_from_u64(i: u64) -> Vec { async fn subscribe(client: Option, counter: Arc, i: u64) { let mut client = match client { Some(c) => c, - None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { @@ -114,9 +112,7 @@ async fn subscribe(client: Option, counter: Arc, async fn publish(client: Option, n_keys: u64) { let mut client = match client { Some(c) => c, - None => BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(), + None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; let mut counter: u64 = 0; @@ -156,9 +152,7 @@ async fn main() -> Result<(), Box> { } let h = tokio::spawn(progress_reporter(counters.clone())); - let c = BrokerClientChannel::connect_lazy(format!("http://{}", DEFAULT_LISTEN_ADDR)) - .await - .unwrap(); + let c = storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(); for i in 0..args.num_subs { let c = Some(c.clone()); @@ -166,7 +160,7 @@ async fn main() -> Result<(), Box> { } for _i in 0..args.num_pubs { let c = None; - tokio::spawn(publish(c, 
args.num_subs as u64)); + tokio::spawn(publish(c, args.num_subs)); } h.await?; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 04f93a1ebb..6d80e96bf1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -39,15 +39,18 @@ use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; -use storage_broker::{parse_proto_ttid, EitherBody, DEFAULT_LISTEN_ADDR}; +use storage_broker::{ + parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, +}; use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; +use utils::sentry_init::{init_sentry, release_name}; project_git_version!(GIT_VERSION); -const DEFAULT_CHAN_SIZE: usize = 128; -const DEFAULT_HTTP2_KEEPALIVE_INTERVAL: &str = "5000ms"; +const DEFAULT_CHAN_SIZE: usize = 32; +const DEFAULT_ALL_KEYS_CHAN_SIZE: usize = 16384; #[derive(Parser, Debug)] #[command(version = GIT_VERSION, about = "Broker for neon storage nodes communication", long_about = None)] @@ -55,11 +58,14 @@ struct Args { /// Endpoint to listen on. #[arg(short, long, default_value = DEFAULT_LISTEN_ADDR)] listen_addr: SocketAddr, - /// Size of the queue to the subscriber. + /// Size of the queue to the per timeline subscriber. #[arg(long, default_value_t = DEFAULT_CHAN_SIZE)] - chan_size: usize, + timeline_chan_size: usize, + /// Size of the queue to the all keys subscriber. + #[arg(long, default_value_t = DEFAULT_ALL_KEYS_CHAN_SIZE)] + all_keys_chan_size: usize, /// HTTP/2 keepalive interval. - #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HTTP2_KEEPALIVE_INTERVAL)] + #[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_KEEPALIVE_INTERVAL)] http2_keepalive_interval: Duration, /// Format for logging, either 'plain' or 'json'. 
#[arg(long, default_value = "plain")] @@ -107,7 +113,7 @@ struct SharedState { } impl SharedState { - pub fn new(chan_size: usize) -> Self { + pub fn new(all_keys_chan_size: usize) -> Self { SharedState { next_pub_id: 0, num_pubs: 0, @@ -115,7 +121,7 @@ impl SharedState { num_subs_to_timelines: 0, chans_to_timeline_subs: HashMap::new(), num_subs_to_all: 0, - chan_to_all_subs: broadcast::channel(chan_size).0, + chan_to_all_subs: broadcast::channel(all_keys_chan_size).0, } } @@ -138,7 +144,7 @@ impl SharedState { pub fn register_subscriber( &mut self, sub_key: SubscriptionKey, - chan_size: usize, + timeline_chan_size: usize, ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; @@ -157,7 +163,7 @@ impl SharedState { self.chans_to_timeline_subs .entry(ttid) .or_insert(ChanToTimelineSub { - chan: broadcast::channel(chan_size).0, + chan: broadcast::channel(timeline_chan_size).0, num_subscribers: 0, }); chan_to_timeline_sub.num_subscribers += 1; @@ -199,7 +205,7 @@ impl SharedState { #[derive(Clone)] struct Registry { shared_state: Arc>, - chan_size: usize, + timeline_chan_size: usize, } impl Registry { @@ -231,7 +237,7 @@ impl Registry { let (sub_id, sub_rx) = self .shared_state .write() - .register_subscriber(sub_key, self.chan_size); + .register_subscriber(sub_key, self.timeline_chan_size); info!( "subscription started id={}, key={:?}, addr={:?}", sub_id, sub_key, remote_addr @@ -368,7 +374,8 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - warn!("dropped {} messages, channel is full", missed_msgs); + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); missed_msgs = 0; } } @@ -417,14 +424,17 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(release_name!(), &[]); + let args = Args::parse(); logging::init(LogFormat::from_config(&args.log_format)?)?; info!("version: {GIT_VERSION}"); let registry = Registry { - shared_state: Arc::new(RwLock::new(SharedState::new(args.chan_size))), - chan_size: args.chan_size, + shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), + timeline_chan_size: args.timeline_chan_size, }; let storage_broker_impl = Broker { registry: registry.clone(), @@ -518,7 +528,7 @@ mod tests { async fn test_registry() { let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(16))), - chan_size: 16, + timeline_chan_size: 16, }; // subscribe to timeline 2 diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index f25acdfcb3..8441aaf625 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,7 +1,9 @@ use hyper::body::HttpBody; use std::pin::Pin; use std::task::{Context, Poll}; +use std::time::Duration; use tonic::codegen::StdError; +use tonic::transport::{ClientTlsConfig, Endpoint}; use tonic::{transport::Channel, Code, Status}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; @@ -11,6 +13,10 @@ use proto::{ // Code generated by protobuf. pub mod proto { + // Tonic does derives as `#[derive(Clone, PartialEq, ::prost::Message)]` + // we don't use these types for anything but broker data transmission, + // so it's ok to ignore this one. 
+ #![allow(clippy::derive_partial_eq_without_eq)] tonic::include_proto!("storage_broker"); } @@ -20,12 +26,41 @@ pub mod metrics; pub use tonic::Request; pub use tonic::Streaming; -pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; +pub use hyper::Uri; -// NeonBrokerClient charged with tonic provided Channel transport; helps to +pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051"; +pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}"); + +pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms"; + +// BrokerServiceClient charged with tonic provided Channel transport; helps to // avoid depending on tonic directly in user crates. pub type BrokerClientChannel = BrokerServiceClient<Channel>; +// Create a connection object configured to use TLS if the scheme is https:// +// and plain text otherwise. The connection is lazy; only endpoint sanity is +// validated here. +pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel> +where + U: std::convert::TryInto<Uri>, + U::Error: std::error::Error + Send + Sync + 'static, +{ + let uri: Uri = endpoint.try_into()?; + let mut tonic_endpoint: Endpoint = uri.into(); + // If the scheme is https, start an encrypted connection; do plain text + // otherwise. + if let Some("https") = tonic_endpoint.uri().scheme_str() { + let tls = ClientTlsConfig::new(); + tonic_endpoint = tonic_endpoint.tls_config(tls)?; + } + tonic_endpoint = tonic_endpoint + .http2_keep_alive_interval(keepalive_interval) + .keep_alive_while_idle(true); + // keep_alive_timeout is 20s by default on both client and server side + let channel = tonic_endpoint.connect_lazy(); + Ok(BrokerClientChannel::new(channel)) +} + impl BrokerClientChannel { /// Create a new client to the given endpoint, but don't actually connect until the first request. pub async fn connect_lazy(dst: D) -> Result diff --git a/test_runner/README.md b/test_runner/README.md index e066ac3235..877498bae7 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -13,8 +13,6 @@ Prerequisites: below to run from other directories. - The neon git repo, including the postgres submodule (for some tests, e.g. `pg_regress`) -- Some tests (involving storage nodes coordination) require etcd installed. Follow - [`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it. ### Test Organization @@ -78,9 +76,15 @@ Format is 2-digit major version number, i.e. `DEFAULT_PG_VERSION="14"` should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. `NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as -`--pageserver-config-override=${value}` parameter values when neon_local cli is invoked `RUST_LOG`: logging configuration to pass into Neon CLI +Useful parameters and commands: + +`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli + +`--preserve-database-files` to preserve pageserver (layer) and safekeeper (segment) timeline files on disk +after running a test suite. Such files can be large, so they are removed by default; keeping them can be useful for debugging or for creating svg images of layer file contents.
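As a rough illustration of the `--preserve-database-files` behaviour described above (a hedged sketch only, not the fixture code itself; the helper name `cleanup_repo_dir` and the directory layout are assumptions, while the filename pattern mirrors the `SMALL_DB_FILE_NAME_REGEX` added further down in this diff): small config/metadata files under the test's `repo` directory are kept, and everything else is treated as large database data and removed unless the flag is set.

```python
import re
from pathlib import Path

# Pattern for small files worth keeping (mirrors SMALL_DB_FILE_NAME_REGEX below).
SMALL_FILE_RE = re.compile(r"config|metadata|.+\.(?:toml|pid|json|sql)")


def cleanup_repo_dir(repo_dir: Path, preserve_database_files: bool) -> None:
    """Remove large layer/segment files after a test run unless preservation was requested."""
    if preserve_database_files:
        return
    # Children sort after their parents, so iterating in reverse removes files first
    # and then any directories that became empty.
    for entry in sorted(repo_dir.glob("**/*"), reverse=True):
        if entry.is_file() and not SMALL_FILE_RE.fullmatch(entry.name):
            entry.unlink()
        elif entry.is_dir() and not any(entry.iterdir()):
            entry.rmdir()
```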
+ Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them: `./scripts/pytest -s --log-cli-level=INFO ...` (Note many tests capture subprocess outputs separately, so this may not diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 27fb0a60b2..b1489b7ab1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -11,7 +11,7 @@ from datetime import datetime from pathlib import Path # Type-related stuff -from typing import Callable, ClassVar, Iterator, Optional +from typing import Callable, ClassVar, Dict, Iterator, Optional import pytest from _pytest.config import Config @@ -135,23 +135,26 @@ class PgBenchRunResult: @dataclasses.dataclass class PgBenchInitResult: - REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg] - r"done in (\d+\.\d+) s " - r"\(" - r"(?:drop tables (\d+\.\d+) s)?(?:, )?" - r"(?:create tables (\d+\.\d+) s)?(?:, )?" - r"(?:client-side generate (\d+\.\d+) s)?(?:, )?" - r"(?:vacuum (\d+\.\d+) s)?(?:, )?" - r"(?:primary keys (\d+\.\d+) s)?(?:, )?" - r"\)\." - ) + # Taken from https://github.com/postgres/postgres/blob/REL_15_1/src/bin/pgbench/pgbench.c#L5144-L5171 + EXTRACTORS: ClassVar[Dict[str, re.Pattern]] = { # type: ignore[type-arg] + "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"), + "create_tables": re.compile(r"create tables (\d+\.\d+) s"), + "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"), + "server_side_generate": re.compile(r"server-side generate (\d+\.\d+) s"), + "vacuum": re.compile(r"vacuum (\d+\.\d+) s"), + "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"), + "foreign_keys": re.compile(r"foreign keys (\d+\.\d+) s"), + "total": re.compile(r"done in (\d+\.\d+) s"), # Total time printed by pgbench + } - total: float + total: Optional[float] drop_tables: Optional[float] create_tables: Optional[float] client_side_generate: Optional[float] + server_side_generate: Optional[float] vacuum: Optional[float] primary_keys: Optional[float] + foreign_keys: Optional[float] duration: float start_timestamp: int end_timestamp: int @@ -164,25 +167,35 @@ class PgBenchInitResult: start_timestamp: int, end_timestamp: int, ): - # Parses pgbench initialize output for default initialization steps (dtgvp) + # Parses pgbench initialize output # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s). 
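To make the extractor-based parsing introduced above concrete, here is a small standalone sketch (a subset of the extractor table, applied to the sample line from the comment; not the fixture code itself). Each step is matched independently, so steps that pgbench did not run, such as `server-side generate` or `foreign keys`, are simply absent from the result:

```python
import re

# Subset of the per-step extractors from PgBenchInitResult.EXTRACTORS above.
EXTRACTORS = {
    "drop_tables": re.compile(r"drop tables (\d+\.\d+) s"),
    "create_tables": re.compile(r"create tables (\d+\.\d+) s"),
    "client_side_generate": re.compile(r"client-side generate (\d+\.\d+) s"),
    "vacuum": re.compile(r"vacuum (\d+\.\d+) s"),
    "primary_keys": re.compile(r"primary keys (\d+\.\d+) s"),
    "total": re.compile(r"done in (\d+\.\d+) s"),
}

line = (
    "done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, "
    "client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s)."
)

timings = {}
# Split on parentheses and commas, then try every extractor on each fragment.
for item in re.split(r"\(|\)|,", line):
    for key, regex in EXTRACTORS.items():
        if (m := regex.match(item.strip())) is not None:
            timings[key] = float(m.group(1))

assert timings["total"] == 5.66
assert "server_side_generate" not in timings  # optional steps are simply missing
```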
last_line = stderr.splitlines()[-1] - if (m := cls.REGEX.match(last_line)) is not None: - total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [ - float(v) for v in m.groups() if v is not None - ] - else: + timings: Dict[str, Optional[float]] = {} + last_line_items = re.split(r"\(|\)|,", last_line) + for item in last_line_items: + for key, regex in cls.EXTRACTORS.items(): + if (m := regex.match(item.strip())) is not None: + if key in timings: + raise RuntimeError( + f"can't store pgbench results for repeated action `{key}`" + ) + + timings[key] = float(m.group(1)) + + if not timings or "total" not in timings: raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`") return cls( - total=total, - drop_tables=drop_tables, - create_tables=create_tables, - client_side_generate=client_side_generate, - vacuum=vacuum, - primary_keys=primary_keys, + total=timings["total"], + drop_tables=timings.get("drop_tables", 0.0), + create_tables=timings.get("create_tables", 0.0), + client_side_generate=timings.get("client_side_generate", 0.0), + server_side_generate=timings.get("server_side_generate", 0.0), + vacuum=timings.get("vacuum", 0.0), + primary_keys=timings.get("primary_keys", 0.0), + foreign_keys=timings.get("foreign_keys", 0.0), duration=duration, start_timestamp=start_timestamp, end_timestamp=end_timestamp, @@ -326,8 +339,10 @@ class NeonBenchmarker: "drop_tables", "create_tables", "client_side_generate", + "server_side_generate", "vacuum", "primary_keys", + "foreign_keys", ] for metric in metrics: if (value := getattr(result, metric)) is not None: diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 291f924379..be1f146735 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -115,6 +115,7 @@ class NeonCompare(PgCompare): return self._pg_bin def flush(self): + self.pageserver_http_client.timeline_checkpoint(self.env.initial_tenant, self.timeline) self.pageserver_http_client.timeline_gc(self.env.initial_tenant, self.timeline, 0) def compact(self): @@ -176,7 +177,7 @@ class VanillaCompare(PgCompare): self.cur = self.conn.cursor() @property - def pg(self) -> PgProtocol: + def pg(self) -> VanillaPostgres: return self._pg @property diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 86ab4425ed..9236137d19 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -39,9 +39,17 @@ def parse_metrics(text: str, name: str = "") -> Metrics: return metrics +PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( + "pageserver_remote_upload_queue_unfinished_tasks", + "pageserver_remote_operation_seconds_bucket", + "pageserver_remote_operation_seconds_count", + "pageserver_remote_operation_seconds_sum", + "pageserver_remote_physical_size", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", - "pageserver_current_physical_size", + "pageserver_resident_physical_size", "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", @@ -62,4 +70,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7fc2a7c24b..5b00ebdea7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -26,14 +26,23 @@ import asyncpg import backoff # type: ignore import boto3 import jwt +import prometheus_client import psycopg2 import pytest import requests from _pytest.config import Config +from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest from fixtures.log_helper import log from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture +from fixtures.utils import ( + ATTACHMENT_NAME_REGEX, + Fn, + allure_attach_from_dir, + get_self_dir, + subprocess_capture, +) +from prometheus_client.parser import text_string_to_metric_families # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -279,21 +288,19 @@ def port_distributor(worker_base_port: int) -> PortDistributor: return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def default_broker( - request: FixtureRequest, port_distributor: PortDistributor, top_output_dir: Path -) -> Iterator[Etcd]: + port_distributor: PortDistributor, + test_output_dir: Path, + neon_binpath: Path, +) -> Iterator[NeonBroker]: + # multiple pytest sessions could get launched in parallel, get them different ports/datadirs client_port = port_distributor.get_port() - # multiple pytest sessions could get launched in parallel, get them different datadirs - etcd_datadir = get_test_output_dir(request, top_output_dir) / f"etcd_datadir_{client_port}" - etcd_datadir.mkdir(exist_ok=True, parents=True) + broker_logfile = test_output_dir / "repo" / "storage_broker.log" - broker = Etcd( - datadir=str(etcd_datadir), port=client_port, peer_port=port_distributor.get_port() - ) + broker = NeonBroker(logfile=broker_logfile, port=client_port, neon_binpath=neon_binpath) yield broker broker.stop() - allure_attach_from_dir(etcd_datadir) @pytest.fixture(scope="session") @@ -570,7 +577,7 @@ class NeonEnvBuilder: self, repo_dir: Path, port_distributor: PortDistributor, - broker: Etcd, + broker: NeonBroker, run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, @@ -587,6 +594,7 @@ class NeonEnvBuilder: auth_enabled: bool = False, rust_log_override: Optional[str] = None, default_branch_name: str = DEFAULT_BRANCH_NAME, + preserve_database_files: bool = False, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -608,6 +616,7 @@ class NeonEnvBuilder: self.neon_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version + self.preserve_database_files = preserve_database_files def init(self) -> NeonEnv: # Cannot create more than one environment from one builder @@ -616,6 +625,7 @@ class NeonEnvBuilder: return self.env def start(self): + assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() def init_start(self) -> NeonEnv: @@ -715,6 +725,28 @@ class NeonEnvBuilder: prefix_in_bucket=self.remote_storage_prefix, ) + def cleanup_local_storage(self): + if self.preserve_database_files: + return + + directories_to_clean: List[Path] = [] + for 
test_entry in Path(self.repo_dir).glob("**/*"): + if test_entry.is_file(): + test_file = test_entry + if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name): + continue + if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name): + continue + log.debug(f"Removing large database {test_file} file") + test_file.unlink() + elif test_entry.is_dir(): + directories_to_clean.append(test_entry) + + for directory_to_clean in reversed(directories_to_clean): + if not os.listdir(directory_to_clean): + log.debug(f"Removing empty directory {directory_to_clean}") + directory_to_clean.rmdir() + def cleanup_remote_storage(self): # here wee check for true remote storage, no the local one # local cleanup is not needed after test because in ci all env will be destroyed anyway @@ -722,6 +754,11 @@ class NeonEnvBuilder: log.info("no remote storage was set up, skipping cleanup") return + # Making mypy happy with allowing only `S3Storage` further. + # `self.remote_storage_prefix` is coupled with `S3Storage` storage type, + # so this line effectively a no-op + assert isinstance(self.remote_storage, S3Storage) + if self.keep_remote_storage_contents: log.info("keep_remote_storage_contents skipping remote storage cleanup") return @@ -737,7 +774,8 @@ class NeonEnvBuilder: Prefix=self.remote_storage_prefix, ) - objects_to_delete = {"Objects": []} + # Using Any because DeleteTypeDef (from boto3-stubs) doesn't fit our case + objects_to_delete: Any = {"Objects": []} cnt = 0 for item in pages.search("Contents"): # weirdly when nothing is found it returns [None] @@ -752,16 +790,17 @@ class NeonEnvBuilder: Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete, ) - objects_to_delete = dict(Objects=[]) + objects_to_delete = {"Objects": []} cnt += 1 # flush rest if len(objects_to_delete["Objects"]): self.remote_storage_client.delete_objects( - Bucket=self.remote_storage.bucket_name, Delete=objects_to_delete + Bucket=self.remote_storage.bucket_name, + Delete=objects_to_delete, ) - log.info("deleted %s objects from remote storage", cnt) + log.info(f"deleted {cnt} objects from remote storage") def __enter__(self) -> "NeonEnvBuilder": return self @@ -780,7 +819,22 @@ class NeonEnvBuilder: sk.stop(immediate=True) self.env.pageserver.stop(immediate=True) - self.cleanup_remote_storage() + cleanup_error = None + try: + self.cleanup_remote_storage() + except Exception as e: + log.error(f"Error during remote storage cleanup: {e}") + cleanup_error = e + + try: + self.cleanup_local_storage() + except Exception as e: + log.error(f"Error during local storage cleanup: {e}") + if cleanup_error is not None: + cleanup_error = e + + if cleanup_error is not None: + raise cleanup_error self.env.pageserver.assert_no_errors() @@ -846,9 +900,8 @@ class NeonEnv: toml += textwrap.dedent( f""" - [etcd_broker] - broker_endpoints = ['{self.broker.client_url()}'] - etcd_binary_path = '{self.broker.binary_path}' + [broker] + listen_addr = '{self.broker.listen_addr()}' """ ) @@ -947,9 +1000,10 @@ class NeonEnv: @pytest.fixture(scope=shareable_scope) def _shared_simple_env( request: FixtureRequest, + pytestconfig: Config, port_distributor: PortDistributor, mock_s3_server: MockS3Server, - default_broker: Etcd, + default_broker: NeonBroker, run_id: uuid.UUID, top_output_dir: Path, neon_binpath: Path, @@ -963,7 +1017,7 @@ def _shared_simple_env( if os.environ.get("TEST_SHARED_FIXTURES") is None: # Create the environment in the per-test output directory - repo_dir = get_test_output_dir(request, top_output_dir) / "repo" + repo_dir = get_test_repo_dir(request, 
top_output_dir) else: # We're running shared fixtures. Share a single directory. repo_dir = top_output_dir / "shared_repo" @@ -978,6 +1032,7 @@ def _shared_simple_env( pg_distrib_dir=pg_distrib_dir, pg_version=pg_version, run_id=run_id, + preserve_database_files=pytestconfig.getoption("--preserve-database-files"), ) as builder: env = builder.init_start() @@ -1004,13 +1059,14 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]: @pytest.fixture(scope="function") def neon_env_builder( + pytestconfig: Config, test_output_dir: str, port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, pg_distrib_dir: Path, pg_version: str, - default_broker: Etcd, + default_broker: NeonBroker, run_id: uuid.UUID, ) -> Iterator[NeonEnvBuilder]: """ @@ -1039,6 +1095,7 @@ def neon_env_builder( pg_version=pg_version, broker=default_broker, run_id=run_id, + preserve_database_files=pytestconfig.getoption("--preserve-database-files"), ) as builder: yield builder @@ -1119,6 +1176,14 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach") self.verbose_error(res) + def tenant_load(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") + self.verbose_error(res) + + def tenant_ignore(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") + self.verbose_error(res) + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) @@ -1141,8 +1206,22 @@ class PageserverHttpClient(requests.Session): # there are no tests for those right now. return size - def timeline_list(self, tenant_id: TenantId) -> List[Dict[str, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline") + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -1176,13 +1255,13 @@ class PageserverHttpClient(requests.Session): tenant_id: TenantId, timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, - include_non_incremental_physical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: params["include-non-incremental-logical-size"] = "yes" - if include_non_incremental_physical_size: - params["include-non-incremental-physical-size"] = "yes" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "yes" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -1257,11 +1336,88 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_spawn_download_remote_layers( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> dict[str, Any]: + + res = self.post( + 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + def get_metrics(self) -> str: res = self.get(f"http://localhost:{self.port}/metrics") self.verbose_error(res) return res.text + def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str): + raw = self.get_metrics() + family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw)) + [metric] = [m for m in family if m.name == metric_name] + [sample] = [ + s + for s in metric.samples + if s.labels["tenant_id"] == str(tenant_id) + and s.labels["timeline_id"] == str(timeline_id) + ] + return sample.value + + def get_metric_value(self, name: str) -> Optional[str]: + metrics = self.get_metrics() + relevant = [line for line in metrics.splitlines() if line.startswith(name)] + if len(relevant) == 0: + log.info(f'could not find metric "{name}"') + return None + assert len(relevant) == 1 + return relevant[0].lstrip(name).strip() + @dataclass class PageserverPort: @@ -1559,7 +1715,12 @@ class NeonCli(AbstractNeonCli): pageserver_config_override=self.env.pageserver.config_override, ) - res = self.raw_cli(cmd) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance( + self.env.remote_storage, S3Storage + ): + s3_env_vars = self.env.remote_storage.access_env_vars() + res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -1682,6 +1843,12 @@ class NeonCli(AbstractNeonCli): return self.raw_cli(args, check_return_code=check_return_code) + def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["start"], check_return_code=check_return_code) + + def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["stop"], check_return_code=check_return_code) + class WalCraft(AbstractNeonCli): """ @@ -1735,7 +1902,7 @@ class NeonPageserver(PgProtocol): # All tests print these, when starting up or shutting down ".*wal receiver task finished with an error: walreceiver connection handling failure.*", ".*Shutdown task error: walreceiver connection handling failure.*", - 
".*Etcd client error: grpc request error: status: Unavailable.*", + ".*wal_connection_manager.*tcp connect error: Connection refused.*", ".*query handler for .* failed: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Broken pipe.*", ".*Connection aborted: error communicating with the server: Broken pipe.*", @@ -1743,6 +1910,7 @@ class NeonPageserver(PgProtocol): ".*Connection aborted: error communicating with the server: Connection reset by peer.*", ".*kill_and_wait_impl.*: wait successful.*", ".*end streaming to Some.*", + ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation # and streaming start ".*Failed to process query for timeline .*: state uninitialized, no data to read.*", @@ -1768,6 +1936,7 @@ class NeonPageserver(PgProtocol): ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() + ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ] def start( @@ -1824,7 +1993,6 @@ class NeonPageserver(PgProtocol): def assert_no_errors(self): logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") - error_or_warn = re.compile("ERROR|WARN") errors = [] while True: @@ -1842,6 +2010,28 @@ class NeonPageserver(PgProtocol): assert not errors + def log_contains(self, pattern: str) -> Optional[str]: + """Check that the pageserver log contains a line that matches the given regex""" + logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + while True: + line = logfile.readline() + if not line: + break + + if contains_re.search(line): + # found it! 
+ return line + + return None + def append_pageserver_param_overrides( params_to_update: List[str], @@ -2081,62 +2271,73 @@ class PSQL: class NeonProxy(PgProtocol): + link_auth_uri: str = "http://dummy-uri" + + class AuthBackend(abc.ABC): + """All auth backends must inherit from this class""" + + @property + def default_conn_url(self) -> Optional[str]: + return None + + @abc.abstractmethod + def extra_args(self) -> list[str]: + pass + + class Link(AuthBackend): + def extra_args(self) -> list[str]: + return [ + # Link auth backend params + *["--auth-backend", "link"], + *["--uri", NeonProxy.link_auth_uri], + ] + + @dataclass(frozen=True) + class Postgres(AuthBackend): + pg_conn_url: str + + @property + def default_conn_url(self) -> Optional[str]: + return self.pg_conn_url + + def extra_args(self) -> list[str]: + return [ + # Postgres auth backend params + *["--auth-backend", "postgres"], + *["--auth-endpoint", self.pg_conn_url], + ] + def __init__( self, + neon_binpath: Path, proxy_port: int, http_port: int, mgmt_port: int, - neon_binpath: Path, - auth_endpoint=None, + auth_backend: NeonProxy.AuthBackend, ): - super().__init__(dsn=auth_endpoint, port=proxy_port) - self.host = "127.0.0.1" + host = "127.0.0.1" + super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port) + + self.host = host self.http_port = http_port self.neon_binpath = neon_binpath self.proxy_port = proxy_port self.mgmt_port = mgmt_port - self.auth_endpoint = auth_endpoint + self.auth_backend = auth_backend self._popen: Optional[subprocess.Popen[bytes]] = None - self.link_auth_uri_prefix = "http://dummy-uri" - def start(self): - """ - Starts a proxy with option '--auth-backend postgres' and a postgres instance - already provided though '--auth-endpoint '." - """ + def start(self) -> NeonProxy: assert self._popen is None - assert self.auth_endpoint is not None - - # Start proxy args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], - *["--auth-backend", "postgres"], - *["--auth-endpoint", self.auth_endpoint], + *self.auth_backend.extra_args(), ] self._popen = subprocess.Popen(args) self._wait_until_ready() - - def start_with_link_auth(self): - """ - Starts a proxy with option '--auth-backend link' and a dummy authentication link '--uri dummy-auth-link'." 
- """ - assert self._popen is None - - # Start proxy - bin_proxy = str(self.neon_binpath / "proxy") - args = [bin_proxy] - args.extend(["--http", f"{self.host}:{self.http_port}"]) - args.extend(["--proxy", f"{self.host}:{self.proxy_port}"]) - args.extend(["--mgmt", f"{self.host}:{self.mgmt_port}"]) - args.extend(["--auth-backend", "link"]) - args.extend(["--uri", self.link_auth_uri_prefix]) - arg_str = " ".join(args) - log.info(f"starting proxy with command line ::: {arg_str}") - self._popen = subprocess.Popen(args, stdout=subprocess.PIPE) - self._wait_until_ready() + return self @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): @@ -2147,7 +2348,7 @@ class NeonProxy(PgProtocol): request_result.raise_for_status() return request_result.text - def __enter__(self) -> "NeonProxy": + def __enter__(self) -> NeonProxy: return self def __exit__( @@ -2165,11 +2366,19 @@ class NeonProxy(PgProtocol): @pytest.fixture(scope="function") def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]: """Neon proxy that routes through link auth.""" + http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() - with NeonProxy(proxy_port, http_port, neon_binpath=neon_binpath, mgmt_port=mgmt_port) as proxy: - proxy.start_with_link_auth() + + with NeonProxy( + neon_binpath=neon_binpath, + proxy_port=proxy_port, + http_port=http_port, + mgmt_port=mgmt_port, + auth_backend=NeonProxy.Link(), + ) as proxy: + proxy.start() yield proxy @@ -2193,11 +2402,11 @@ def static_proxy( http_port = port_distributor.get_port() with NeonProxy( + neon_binpath=neon_binpath, proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, - neon_binpath=neon_binpath, - auth_endpoint=auth_endpoint, + auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: proxy.start() yield proxy @@ -2643,51 +2852,36 @@ class SafekeeperHttpClient(requests.Session): @dataclass -class Etcd: - """An object managing etcd instance""" +class NeonBroker: + """An object managing storage_broker instance""" - datadir: str + logfile: Path port: int - peer_port: int - binary_path: Path = field(init=False) + neon_binpath: Path handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - def __post_init__(self): - self.binary_path = etcd_path() + def listen_addr(self): + return f"127.0.0.1:{self.port}" def client_url(self): - return f"http://127.0.0.1:{self.port}" + return f"http://{self.listen_addr()}" def check_status(self): - with requests.Session() as s: - s.mount("http://", requests.adapters.HTTPAdapter(max_retries=1)) # do not retry - s.get(f"{self.client_url()}/health").raise_for_status() + return True # TODO def try_start(self): if self.handle is not None: - log.debug(f"etcd is already running on port {self.port}") + log.debug(f"storage_broker is already running on port {self.port}") return - Path(self.datadir).mkdir(exist_ok=True) - - if not self.binary_path.is_file(): - raise RuntimeError(f"etcd broker binary '{self.binary_path}' is not a file") - - client_url = self.client_url() - log.info(f'Starting etcd to listen incoming connections at "{client_url}"') - with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file: + listen_addr = self.listen_addr() + log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') + with open(self.logfile, "wb") as logfile: args = [ - self.binary_path, - f"--data-dir={self.datadir}", - 
f"--listen-client-urls={client_url}", - f"--advertise-client-urls={client_url}", - f"--listen-peer-urls=http://127.0.0.1:{self.peer_port}", - # Set --quota-backend-bytes to keep the etcd virtual memory - # size smaller. Our test etcd clusters are very small. - # See https://github.com/etcd-io/etcd/issues/7910 - "--quota-backend-bytes=100000000", + str(self.neon_binpath / "storage_broker"), + f"--listen-addr={listen_addr}", ] - self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file) + self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) # wait for start started_at = time.time() @@ -2697,7 +2891,9 @@ class Etcd: except Exception as e: elapsed = time.time() - started_at if elapsed > 5: - raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}") + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" + ) time.sleep(0.5) else: break # success @@ -2718,6 +2914,24 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--preserve-database-files", + action="store_true", + default=False, + help="Preserve timeline files after the test suite is over", + ) + + +SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] + r"config|metadata|.+\.(?:toml|pid|json|sql)" +) + + # This is autouse, so the test output directory always gets created, even # if a test doesn't put anything there. It also solves a problem with the # neon_simple_env fixture: if TEST_SHARED_FIXTURES is not set, it @@ -2880,12 +3094,55 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def assert_no_in_progress_downloads_for_tenant( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, +def wait_until(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns successfully, without exception. Returns the + last return value from the function. + """ + last_exception = None + for i in range(number_of_iterations): + try: + res = func() + except Exception as e: + log.info("waiting for %s iteration %s failed", func, i + 1) + last_exception = e + time.sleep(interval) + continue + return res + raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. 
+ """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) + + +def assert_tenant_status( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str ): tenant_status = pageserver_http_client.tenant_status(tenant) - assert tenant_status["has_in_progress_downloads"] is False, tenant_status + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): + tenants = ps_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 + if len(matching) == 0: + return None + return matching[0] def remote_consistent_lsn( @@ -2893,14 +3150,15 @@ def remote_consistent_lsn( ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) - lsn_str = detail["remote_consistent_lsn"] - if lsn_str is None: + if detail["remote_consistent_lsn"] is None: # No remote information at all. This happens right after creating # a timeline, before any part of it has been uploaded to remote # storage yet. return Lsn(0) - assert isinstance(lsn_str, str) - return Lsn(lsn_str) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) def wait_for_upload( @@ -2913,6 +3171,7 @@ def wait_for_upload( for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) if current_lsn >= lsn: + log.info("wait finished") return log.info( "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( @@ -2927,6 +3186,27 @@ def wait_for_upload( ) +# Does not use `wait_until` for debugging purposes +def wait_until_tenant_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_state: str, + iterations: int, +) -> bool: + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} data: {tenant}") + if tenant["state"] == expected_state: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + + def last_record_lsn( pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -2981,3 +3261,34 @@ def fork_at_current_lsn( """ current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) + + +def wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + sk_commit_lsns = [ + sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn for sk in safekeepers + ] + lsn = max(sk_commit_lsns) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn) + return lsn + + +def wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: List[Safekeeper], + pageserver: NeonPageserver, +): + lsn = wait_for_sk_commit_lsn_to_arrive_at_pageserver_last_record_lsn( + tenant_id, timeline_id, safekeepers, pageserver + ) + ps_http = 
pageserver.http_client() + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, lsn) + return lsn diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 506fe6f9da..1fb9eb72e6 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,7 +1,6 @@ import contextlib import os import re -import shutil import subprocess import tarfile import time @@ -74,13 +73,6 @@ def print_gc_result(row: Dict[str, Any]): ) -def etcd_path() -> Path: - path_output = shutil.which("etcd") - if path_output is None: - raise RuntimeError("etcd not found in PATH") - return Path(path_output) - - def query_scalar(cur: cursor, query: str) -> Any: """ It is a convenience wrapper to avoid repetitions diff --git a/test_runner/performance/README.md b/test_runner/performance/README.md index 725612853a..a32ce87c33 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -16,6 +16,7 @@ Some handy pytest flags for local development: - `-s` shows test output - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) +- `--cleanup-test-ouput` cleans up after each test # What performance tests do we have and how we run them diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index bcd26013e5..f8e29cda69 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -42,7 +42,8 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) cur.execute("drop table t") cur.execute("set enable_seqscan_prefetch=on") - cur.execute("set seqscan_prefetch_buffers=100") + cur.execute("set effective_io_concurrency=32") + cur.execute("set maintenance_io_concurrency=32") cur.execute(f"create table t2(x integer) WITH (fillfactor={fillfactor})") diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 01b2097112..a91c78e867 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -1,5 +1,6 @@ from contextlib import closing from io import BufferedReader, RawIOBase +from typing import Optional from fixtures.compare_fixtures import PgCompare @@ -8,7 +9,7 @@ class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows self.rownum = 0 - self.linebuf = None + self.linebuf: Optional[bytes] = None self.ptr = 0 def readable(self): diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 84693325c0..8c3b6e57ff 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -2,8 +2,10 @@ from dataclasses import dataclass from typing import Dict, Tuple import pytest +from _pytest.mark import ParameterSet from fixtures.compare_fixtures import RemoteCompare from fixtures.log_helper import log +from fixtures.utils import get_self_dir @dataclass @@ -109,3 +111,36 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare): """ run_psql(remote_compare, query, times=3) + + +def tpch_queuies() -> Tuple[ParameterSet, ...]: + """ + A list of queries to run for the TPC-H benchmark. 
+ - querues in returning tuple are ordered by the query number + - pytest parameters id is adjusted to match the query id (the numbering starts from 1) + """ + queries_dir = get_self_dir().parent / "performance" / "tpc-h" / "queries" + assert queries_dir.exists(), f"TPC-H queries dir not found: {queries_dir}" + + return tuple( + pytest.param(LabelledQuery(f"Q{f.stem}", f.read_text()), id=f"query{f.stem}") + for f in sorted(queries_dir.glob("*.sql"), key=lambda f: int(f.stem)) + ) + + +@pytest.mark.parametrize("query", tpch_queuies()) +@pytest.mark.remote_cluster +def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare): + """ + TCP-H Benchmark + + The DB prepared manually in advance: + - schema: test_runner/performance/tpc-h/create-schema.sql + - indexes: test_runner/performance/tpc-h/create-indexes.sql + - data generated by `dbgen` program of the official TPC-H benchmark + - `VACUUM (FREEZE, PARALLEL 0);` + + For query generation `1669822882` is used as a seed to the RNG + """ + + run_psql(remote_compare, query, times=1) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 015cc40a72..50e5366c1e 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -15,7 +15,7 @@ from fixtures.utils import get_scale_for_db @enum.unique class PgBenchLoadType(enum.Enum): INIT = "init" - SIMPLE_UPDATE = "simple_update" + SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" @@ -94,7 +94,9 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P if workload_type == PgBenchLoadType.INIT: # Run initialize - init_pgbench(env, ["pgbench", f"-s{scale}", "-i", connstr], password=password) + init_pgbench( + env, ["pgbench", f"-s{scale}", "-i", "-I", "dtGvp", connstr], password=password + ) if workload_type == PgBenchLoadType.SIMPLE_UPDATE: # Run simple-update workload diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index a0a1dbd01d..bd84724405 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -22,15 +22,16 @@ from pytest_lazyfixture import lazy_fixture # type: ignore ], ) @pytest.mark.parametrize( - "env, scale", + "env,scale", [ - # Run on all envs. Use 50x larger table on remote cluster to make sure + # Run on all envs. Use 200x larger table on remote cluster to make sure # it doesn't fit in shared buffers, which are larger on remote than local. 
pytest.param(lazy_fixture("neon_compare"), 1, id="neon"), pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"), - pytest.param( - lazy_fixture("remote_compare"), 50, id="remote", marks=pytest.mark.remote_cluster - ), + # Reenable after switching per-test projects created via API + # pytest.param( + # lazy_fixture("remote_compare"), 200, id="remote", marks=pytest.mark.remote_cluster + # ), ], ) def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int): @@ -45,7 +46,7 @@ def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: in # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/performance/tpc-h/create-indexes.sql b/test_runner/performance/tpc-h/create-indexes.sql new file mode 100644 index 0000000000..590a9c1900 --- /dev/null +++ b/test_runner/performance/tpc-h/create-indexes.sql @@ -0,0 +1,43 @@ +-- Section 1.4.2.2 + +ALTER TABLE part ADD PRIMARY KEY (p_partkey); +ALTER TABLE supplier ADD PRIMARY KEY (s_suppkey); +ALTER TABLE partsupp ADD PRIMARY KEY (ps_partkey, ps_suppkey); +ALTER TABLE customer ADD PRIMARY KEY (c_custkey); +ALTER TABLE orders ADD PRIMARY KEY (o_orderkey); +ALTER TABLE lineitem ADD PRIMARY KEY (l_orderkey, l_linenumber); +ALTER TABLE nation ADD PRIMARY KEY (n_nationkey); +ALTER TABLE region ADD PRIMARY KEY (r_regionkey); + +-- Section 1.4.2.3 + +CREATE INDEX ON supplier USING btree (s_nationkey); +ALTER TABLE supplier ADD FOREIGN KEY (s_nationkey) REFERENCES nation (n_nationkey); + +/* IGNORE: implied by primary key */ +-- CREATE INDEX ON partsupp USING btree (ps_partkey); +CREATE INDEX ON partsupp USING btree (ps_suppkey); +ALTER TABLE partsupp ADD FOREIGN KEY (ps_partkey) REFERENCES part (p_partkey); +ALTER TABLE partsupp ADD FOREIGN KEY (ps_suppkey) REFERENCES supplier (s_suppkey); + +CREATE INDEX ON customer USING btree (c_nationkey); +ALTER TABLE customer ADD FOREIGN KEY (c_nationkey) REFERENCES nation (n_nationkey); + +CREATE INDEX ON orders USING btree (o_custkey); +ALTER TABLE orders ADD FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey); + +/* IGNORE: implied by primary key */ +-- CREATE INDEX ON lineitem USING btree (l_orderkey); +CREATE INDEX ON lineitem USING btree (l_partkey, l_suppkey); +CREATE INDEX ON lineitem USING btree (l_suppkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey) REFERENCES part (p_partkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_suppkey) REFERENCES supplier (s_suppkey); +ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey); + +CREATE INDEX ON nation USING btree (n_regionkey); +ALTER TABLE nation ADD FOREIGN KEY (n_regionkey) REFERENCES region (r_regionkey); + +-- Section 1.4.2.4 + +ALTER TABLE lineitem ADD CHECK (l_shipdate <= l_receiptdate); diff --git a/test_runner/performance/tpc-h/create-schema.sql b/test_runner/performance/tpc-h/create-schema.sql new file mode 100644 index 0000000000..4293951aa1 --- /dev/null +++ b/test_runner/performance/tpc-h/create-schema.sql @@ -0,0 +1,69 @@ +-- Sccsid: @(#)dss.ddl 2.1.8.1 +CREATE TABLE NATION ( N_NATIONKEY INTEGER NOT NULL, + N_NAME CHAR(25) NOT NULL, + N_REGIONKEY INTEGER NOT NULL, + N_COMMENT VARCHAR(152)); 
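The `test_tpch` docstring above notes that the database is prepared manually in advance. As a loose sketch of that preparation (the connection string, file locations, and the external `dbgen` load step are placeholders, not part of this diff), it amounts to applying the schema and index files around the data load and then freezing:

```python
from pathlib import Path

import psycopg2

TPCH_DIR = Path("test_runner/performance/tpc-h")  # assumed repo-relative location
CONNSTR = "postgresql://user:password@host/tpch"  # placeholder connection string


def prepare_tpch_db() -> None:
    """Apply the TPC-H schema and indexes around an externally run dbgen data load."""
    conn = psycopg2.connect(CONNSTR)
    conn.autocommit = True  # VACUUM cannot run inside a transaction block
    with conn.cursor() as cur:
        cur.execute((TPCH_DIR / "create-schema.sql").read_text())
        # ... load the dbgen output into the tables here (e.g. with COPY) ...
        cur.execute((TPCH_DIR / "create-indexes.sql").read_text())
        cur.execute("VACUUM (FREEZE, PARALLEL 0);")
    conn.close()
```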
+ +CREATE TABLE REGION ( R_REGIONKEY INTEGER NOT NULL, + R_NAME CHAR(25) NOT NULL, + R_COMMENT VARCHAR(152)); + +CREATE TABLE PART ( P_PARTKEY INTEGER NOT NULL, + P_NAME VARCHAR(55) NOT NULL, + P_MFGR CHAR(25) NOT NULL, + P_BRAND CHAR(10) NOT NULL, + P_TYPE VARCHAR(25) NOT NULL, + P_SIZE INTEGER NOT NULL, + P_CONTAINER CHAR(10) NOT NULL, + P_RETAILPRICE DECIMAL(15,2) NOT NULL, + P_COMMENT VARCHAR(23) NOT NULL ); + +CREATE TABLE SUPPLIER ( S_SUPPKEY INTEGER NOT NULL, + S_NAME CHAR(25) NOT NULL, + S_ADDRESS VARCHAR(40) NOT NULL, + S_NATIONKEY INTEGER NOT NULL, + S_PHONE CHAR(15) NOT NULL, + S_ACCTBAL DECIMAL(15,2) NOT NULL, + S_COMMENT VARCHAR(101) NOT NULL); + +CREATE TABLE PARTSUPP ( PS_PARTKEY INTEGER NOT NULL, + PS_SUPPKEY INTEGER NOT NULL, + PS_AVAILQTY INTEGER NOT NULL, + PS_SUPPLYCOST DECIMAL(15,2) NOT NULL, + PS_COMMENT VARCHAR(199) NOT NULL ); + +CREATE TABLE CUSTOMER ( C_CUSTKEY INTEGER NOT NULL, + C_NAME VARCHAR(25) NOT NULL, + C_ADDRESS VARCHAR(40) NOT NULL, + C_NATIONKEY INTEGER NOT NULL, + C_PHONE CHAR(15) NOT NULL, + C_ACCTBAL DECIMAL(15,2) NOT NULL, + C_MKTSEGMENT CHAR(10) NOT NULL, + C_COMMENT VARCHAR(117) NOT NULL); + +CREATE TABLE ORDERS ( O_ORDERKEY INTEGER NOT NULL, + O_CUSTKEY INTEGER NOT NULL, + O_ORDERSTATUS CHAR(1) NOT NULL, + O_TOTALPRICE DECIMAL(15,2) NOT NULL, + O_ORDERDATE DATE NOT NULL, + O_ORDERPRIORITY CHAR(15) NOT NULL, + O_CLERK CHAR(15) NOT NULL, + O_SHIPPRIORITY INTEGER NOT NULL, + O_COMMENT VARCHAR(79) NOT NULL); + +CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL, + L_PARTKEY INTEGER NOT NULL, + L_SUPPKEY INTEGER NOT NULL, + L_LINENUMBER INTEGER NOT NULL, + L_QUANTITY DECIMAL(15,2) NOT NULL, + L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, + L_DISCOUNT DECIMAL(15,2) NOT NULL, + L_TAX DECIMAL(15,2) NOT NULL, + L_RETURNFLAG CHAR(1) NOT NULL, + L_LINESTATUS CHAR(1) NOT NULL, + L_SHIPDATE DATE NOT NULL, + L_COMMITDATE DATE NOT NULL, + L_RECEIPTDATE DATE NOT NULL, + L_SHIPINSTRUCT CHAR(25) NOT NULL, + L_SHIPMODE CHAR(10) NOT NULL, + L_COMMENT VARCHAR(44) NOT NULL); diff --git a/test_runner/performance/tpc-h/queries/1.sql b/test_runner/performance/tpc-h/queries/1.sql new file mode 100644 index 0000000000..2e1967fec8 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/1.sql @@ -0,0 +1,27 @@ +-- $ID$ +-- TPC-H/TPC-R Pricing Summary Report Query (Q1) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '89' day +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; diff --git a/test_runner/performance/tpc-h/queries/10.sql b/test_runner/performance/tpc-h/queries/10.sql new file mode 100644 index 0000000000..0569e2ed86 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/10.sql @@ -0,0 +1,38 @@ +-- $ID$ +-- TPC-H/TPC-R Returned Item Reporting Query (Q10) +-- Functional Query Definition +-- Approved February 1998 + + +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date 
'1993-08-01' + and o_orderdate < date '1993-08-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20; diff --git a/test_runner/performance/tpc-h/queries/11.sql b/test_runner/performance/tpc-h/queries/11.sql new file mode 100644 index 0000000000..f7500c260e --- /dev/null +++ b/test_runner/performance/tpc-h/queries/11.sql @@ -0,0 +1,34 @@ +-- $ID$ +-- TPC-H/TPC-R Important Stock Identification Query (Q11) +-- Functional Query Definition +-- Approved February 1998 + + +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'INDONESIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'INDONESIA' + ) +order by + value desc +; diff --git a/test_runner/performance/tpc-h/queries/12.sql b/test_runner/performance/tpc-h/queries/12.sql new file mode 100644 index 0000000000..bd879321c8 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/12.sql @@ -0,0 +1,35 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('REG AIR', 'AIR') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode +; diff --git a/test_runner/performance/tpc-h/queries/13.sql b/test_runner/performance/tpc-h/queries/13.sql new file mode 100644 index 0000000000..554b2bec92 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/13.sql @@ -0,0 +1,27 @@ +-- $ID$ +-- TPC-H/TPC-R Customer Distribution Query (Q13) +-- Functional Query Definition +-- Approved February 1998 + + +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%accounts%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc +; diff --git a/test_runner/performance/tpc-h/queries/14.sql b/test_runner/performance/tpc-h/queries/14.sql new file mode 100644 index 0000000000..794a5656f5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/14.sql @@ -0,0 +1,20 @@ +-- $ID$ +-- TPC-H/TPC-R Promotion Effect Query (Q14) +-- Functional Query Definition +-- Approved February 1998 + + +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-07-01' + and l_shipdate < date '1995-07-01' + interval '1' month +; diff --git a/test_runner/performance/tpc-h/queries/15.sql b/test_runner/performance/tpc-h/queries/15.sql new 
file mode 100644 index 0000000000..5d618c9906 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/15.sql @@ -0,0 +1,40 @@ +-- $ID$ +-- TPC-H/TPC-R Top Supplier Query (Q15) +-- Functional Query Definition +-- Approved February 1998 + +create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1995-01-01' + and l_shipdate < date '1995-01-01' + interval '3' month + group by + l_suppkey; + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; + +drop view revenue0 +; diff --git a/test_runner/performance/tpc-h/queries/16.sql b/test_runner/performance/tpc-h/queries/16.sql new file mode 100644 index 0000000000..f525d55d5d --- /dev/null +++ b/test_runner/performance/tpc-h/queries/16.sql @@ -0,0 +1,37 @@ +-- $ID$ +-- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) +-- Functional Query Definition +-- Approved February 1998 + + +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#43' + and p_type not like 'PROMO POLISHED%' + and p_size in (35, 5, 42, 13, 11, 40, 50, 47) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size +; diff --git a/test_runner/performance/tpc-h/queries/17.sql b/test_runner/performance/tpc-h/queries/17.sql new file mode 100644 index 0000000000..7d736cd3b5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/17.sql @@ -0,0 +1,25 @@ + +-- $ID$ +-- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) +-- Functional Query Definition +-- Approved February 1998 + + +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#35' + and p_container = 'JUMBO JAR' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ) +; diff --git a/test_runner/performance/tpc-h/queries/18.sql b/test_runner/performance/tpc-h/queries/18.sql new file mode 100644 index 0000000000..13f7ce7306 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/18.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Large Volume Customer Query (Q18) +-- Function Query Definition +-- Approved February 1998 + + +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 315 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100; diff --git a/test_runner/performance/tpc-h/queries/19.sql b/test_runner/performance/tpc-h/queries/19.sql new file mode 100644 index 0000000000..43a64bde6f --- /dev/null +++ b/test_runner/performance/tpc-h/queries/19.sql @@ -0,0 +1,42 @@ +-- $ID$ +-- TPC-H/TPC-R Discounted Revenue Query (Q19) +-- Functional Query Definition +-- Approved February 1998 + + +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 
'Brand#41' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#14' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 22 and l_quantity <= 22 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) +; diff --git a/test_runner/performance/tpc-h/queries/2.sql b/test_runner/performance/tpc-h/queries/2.sql new file mode 100644 index 0000000000..2e8164b65a --- /dev/null +++ b/test_runner/performance/tpc-h/queries/2.sql @@ -0,0 +1,50 @@ +-- $ID$ +-- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) +-- Functional Query Definition +-- Approved February 1998 + + +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 39 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100; diff --git a/test_runner/performance/tpc-h/queries/20.sql b/test_runner/performance/tpc-h/queries/20.sql new file mode 100644 index 0000000000..7e587783c5 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/20.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Potential Part Promotion Query (Q20) +-- Function Query Definition +-- Approved February 1998 + + +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'bisque%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1997-01-01' + and l_shipdate < date '1997-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'ETHIOPIA' +order by + s_name +; diff --git a/test_runner/performance/tpc-h/queries/21.sql b/test_runner/performance/tpc-h/queries/21.sql new file mode 100644 index 0000000000..9a0a88236e --- /dev/null +++ b/test_runner/performance/tpc-h/queries/21.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) +-- Functional Query Definition +-- Approved February 1998 + + +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + 
l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100; diff --git a/test_runner/performance/tpc-h/queries/22.sql b/test_runner/performance/tpc-h/queries/22.sql new file mode 100644 index 0000000000..965239f194 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/22.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R Global Sales Opportunity Query (Q22) +-- Functional Query Definition +-- Approved February 1998 + + +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('15', '14', '29', '34', '33', '19', '13') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('15', '14', '29', '34', '33', '19', '13') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode +; diff --git a/test_runner/performance/tpc-h/queries/3.sql b/test_runner/performance/tpc-h/queries/3.sql new file mode 100644 index 0000000000..bbb8f7371a --- /dev/null +++ b/test_runner/performance/tpc-h/queries/3.sql @@ -0,0 +1,29 @@ +-- $ID$ +-- TPC-H/TPC-R Shipping Priority Query (Q3) +-- Functional Query Definition +-- Approved February 1998 + + +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'AUTOMOBILE' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-26' + and l_shipdate > date '1995-03-26' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10; diff --git a/test_runner/performance/tpc-h/queries/4.sql b/test_runner/performance/tpc-h/queries/4.sql new file mode 100644 index 0000000000..098b203414 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/4.sql @@ -0,0 +1,28 @@ +-- $ID$ +-- TPC-H/TPC-R Order Priority Checking Query (Q4) +-- Functional Query Definition +-- Approved February 1998 + + +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1996-12-01' + and o_orderdate < date '1996-12-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority +; diff --git a/test_runner/performance/tpc-h/queries/5.sql b/test_runner/performance/tpc-h/queries/5.sql new file mode 100644 index 0000000000..393e17987f --- /dev/null +++ b/test_runner/performance/tpc-h/queries/5.sql @@ -0,0 +1,31 @@ +-- $ID$ +-- TPC-H/TPC-R Local Supplier Volume Query (Q5) +-- Functional Query Definition +-- Approved February 1998 + + +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1996-01-01' + and o_orderdate < date '1996-01-01' + interval '1' year +group by + n_name +order by + revenue desc +; diff --git 
a/test_runner/performance/tpc-h/queries/6.sql b/test_runner/performance/tpc-h/queries/6.sql new file mode 100644 index 0000000000..90ebcd4782 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/6.sql @@ -0,0 +1,16 @@ +-- $ID$ +-- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) +-- Functional Query Definition +-- Approved February 1998 + + +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '1' year + and l_discount between 0.02 - 0.01 and 0.02 + 0.01 + and l_quantity < 24 +; diff --git a/test_runner/performance/tpc-h/queries/7.sql b/test_runner/performance/tpc-h/queries/7.sql new file mode 100644 index 0000000000..8a34724b38 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/7.sql @@ -0,0 +1,46 @@ +-- $ID$ +-- TPC-H/TPC-R Volume Shipping Query (Q7) +-- Functional Query Definition +-- Approved February 1998 + + +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'ALGERIA' and n2.n_name = 'CANADA') + or (n1.n_name = 'CANADA' and n2.n_name = 'ALGERIA') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year +; diff --git a/test_runner/performance/tpc-h/queries/8.sql b/test_runner/performance/tpc-h/queries/8.sql new file mode 100644 index 0000000000..f8259c960b --- /dev/null +++ b/test_runner/performance/tpc-h/queries/8.sql @@ -0,0 +1,44 @@ +-- $ID$ +-- TPC-H/TPC-R National Market Share Query (Q8) +-- Functional Query Definition +-- Approved February 1998 + + +select + o_year, + sum(case + when nation = 'CANADA' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'SMALL POLISHED BRASS' + ) as all_nations +group by + o_year +order by + o_year +; diff --git a/test_runner/performance/tpc-h/queries/9.sql b/test_runner/performance/tpc-h/queries/9.sql new file mode 100644 index 0000000000..d2e2df9f00 --- /dev/null +++ b/test_runner/performance/tpc-h/queries/9.sql @@ -0,0 +1,39 @@ +-- $ID$ +-- TPC-H/TPC-R Product Type Profit Measure Query (Q9) +-- Functional Query Definition +-- Approved February 1998 + + +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = 
l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%firebrick%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc +; diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index dfbf956568..cc807b7ff3 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -84,6 +84,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv): # Set the GC horizon so that lsn1 is inside the horizon, which means # we can create a new branch starting from lsn1. + pageserver_http_client.timeline_checkpoint(tenant, timeline_main) pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( @@ -156,6 +157,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): # branch creation task but the individual timeline GC iteration happens *after* # the branch creation task. pageserver_http_client.configure_failpoints(("before-timeline-gc", "sleep(2000)")) + pageserver_http_client.timeline_checkpoint(tenant, b0) def do_gc(): pageserver_http_client.timeline_gc(tenant, b0, 0) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a841e3ced2..d19f6a7d39 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -109,6 +109,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # check that we cannot create branch based on garbage collected data with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 71964f622f..05d5788028 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -15,7 +15,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*Failed to load delta layer.*", + ".*Failed to reconstruct the page.*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -87,9 +87,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}" ) - # Second timeline has no ancestors, only the metadata file and no layer files. - # That is checked explicitly in the pageserver, and causes the tenant to be marked - # as broken. + # Second timeline has no ancestors, only the metadata file and no layer files locally, + # and we don't have the remote storage enabled. It is loaded into memory, but getting + # the basebackup from it will fail. with pytest.raises( Exception, match=f"Tenant {tenant2} will not become active. Current state: Broken" ) as err: @@ -97,8 +97,9 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): log.info(f"As expected, compute startup failed for timeline with missing layers: {err}") # Third timeline will also fail during basebackup, because the layer file is corrupt. + # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. 
# (We don't check layer file contents on startup, when loading the timeline) - with pytest.raises(Exception, match="Failed to load delta layer") as err: + with pytest.raises(Exception, match="Failed to reconstruct the page") as err: pg3.start() log.info( f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 6b3324b7a7..332e2f2519 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -47,6 +47,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o neon_env_builder.pg_version = "14" neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() + neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start() pg = env.postgres.create_start("main") @@ -97,17 +98,19 @@ def test_backward_compatibility( ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - ) - breaking_changes_allowed = ( os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + try: + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + neon_binpath=neon_binpath, + port_distributor=port_distributor, + ) + check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", neon_binpath, @@ -155,18 +158,21 @@ def test_forward_compatibility( compatibility_snapshot_dir = ( test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14" ) - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - pg_distrib_dir=compatibility_postgres_distrib_dir, - ) breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) + try: + # Copy the snapshot to current directory, and prepare for the test + prepare_snapshot( + from_dir=compatibility_snapshot_dir, + to_dir=test_output_dir / "compatibility_snapshot", + port_distributor=port_distributor, + neon_binpath=compatibility_neon_bin, + pg_distrib_dir=compatibility_postgres_distrib_dir, + ) + check_neon_works( test_output_dir / "compatibility_snapshot" / "repo", compatibility_neon_bin, @@ -194,6 +200,7 @@ def prepare_snapshot( from_dir: Path, to_dir: Path, port_distributor: PortDistributor, + neon_binpath: Path, pg_distrib_dir: Optional[Path] = None, ): assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" @@ -227,9 +234,14 @@ def prepare_snapshot( pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( pageserver_config["listen_pg_addr"] ) - pageserver_config["broker_endpoints"] = [ - port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"] - ] + # since storage_broker these are overriden by neon_local during pageserver + # start; remove both to prevent 
unknown options during etcd -> + # storage_broker migration. TODO: remove once broker is released + pageserver_config.pop("broker_endpoint", None) + pageserver_config.pop("broker_endpoints", None) + etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"] + if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": + pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version if pg_distrib_dir: pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) @@ -239,10 +251,22 @@ def prepare_snapshot( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["etcd_broker"]["broker_endpoints"] = [ - port_distributor.replace_with_new_port(ep) - for ep in snapshot_config["etcd_broker"]["broker_endpoints"] - ] + + # Provide up/downgrade etcd <-> storage_broker to make forward/backward + # compatibility test happy. TODO: leave only the new part once broker is released. + if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": + # old etcd version + snapshot_config["etcd_broker"] = { + "etcd_binary_path": shutil.which("etcd"), + "broker_endpoints": etcd_broker_endpoints, + } + snapshot_config.pop("broker", None) + else: + # new storage_broker version + broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" + snapshot_config["broker"] = {"listen_addr": broker_listen_addr} + snapshot_config.pop("etcd_broker", None) + snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_http_addr"] ) @@ -277,6 +301,12 @@ def prepare_snapshot( ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" +# get git SHA of neon binary +def get_neon_version(neon_binpath: Path): + out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8") + return out.split("git:", 1)[1].rstrip() + + def check_neon_works( repo_dir: Path, neon_binpath: Path, @@ -302,6 +332,7 @@ def check_neon_works( config.initial_tenant = snapshot_config["default_tenant_id"] config.neon_binpath = neon_binpath config.pg_distrib_dir = pg_distrib_dir + config.preserve_database_files = True cli = NeonCli(config) cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index 1851aeed55..f973bd8e60 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -1,4 +1,5 @@ import os +from pathlib import Path from subprocess import TimeoutExpired from fixtures.log_helper import log @@ -192,21 +193,63 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): timeout=10, ) except TimeoutExpired as exc: - ctl_logs = exc.stderr.decode("utf-8") - log.info("compute_ctl output:\n" + ctl_logs) + ctl_logs = (exc.stderr or b"").decode("utf-8") + log.info(f"compute_ctl output:\n{ctl_logs}") - start = "starting safekeepers syncing" - end = "safekeepers synced at LSN" - start_pos = ctl_logs.index(start) - assert start_pos != -1 - end_pos = ctl_logs.index(end, start_pos) - assert end_pos != -1 - sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] - log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) + with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): + start = "starting safekeepers syncing" + end = "safekeepers synced at LSN" + start_pos = ctl_logs.index(start) + assert start_pos != -1 + end_pos =
ctl_logs.index(end, start_pos) + assert end_pos != -1 + sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] + log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) - # assert that --sync-safekeepers logs are present in the output - assert "connecting with node" in sync_safekeepers_logs - assert "connected with node" in sync_safekeepers_logs - assert "proposer connected to quorum (2)" in sync_safekeepers_logs - assert "got votes from majority (2)" in sync_safekeepers_logs - assert "sending elected msg to node" in sync_safekeepers_logs + # assert that --sync-safekeepers logs are present in the output + assert "connecting with node" in sync_safekeepers_logs + assert "connected with node" in sync_safekeepers_logs + assert "proposer connected to quorum (2)" in sync_safekeepers_logs + assert "got votes from majority (2)" in sync_safekeepers_logs + assert "sending elected msg to node" in sync_safekeepers_logs + + +class ExternalProcessManager: + """ + Context manager that kills a process with a pid file on exit. + """ + + def __init__(self, pid_file: Path): + self.path = pid_file + self.pid_file = open(pid_file, "r") + self.pid = int(self.pid_file.readline().strip()) + + def __enter__(self): + return self + + def leave_alive(self): + self.pid_file.close() + + def __exit__(self, _type, _value, _traceback): + import signal + import time + + if self.pid_file.closed: + return + + with self.pid_file: + try: + os.kill(self.pid, signal.SIGTERM) + except OSError as e: + if not self.path.is_file(): + return + log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") + return + + for _ in range(20): + if not self.path.is_file(): + return + time.sleep(0.2) + + log.info(f"Process failed to stop after SIGTERM: {self.pid}") + os.kill(self.pid, signal.SIGKILL) diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 332bef225f..5f052bf81a 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,9 +2,17 @@ import asyncio import concurrent.futures import random +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres -from fixtures.types import TimelineId +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + Postgres, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId from fixtures.utils import query_scalar # Test configuration @@ -35,11 +43,13 @@ async def gc(env: NeonEnv, timeline: TimelineId): loop = asyncio.get_running_loop() + + def do_gc(): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) + pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) + with concurrent.futures.ThreadPoolExecutor() as pool: while updates_performed < updates_to_perform: - await loop.run_in_executor( - pool, lambda: pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) - ) + await loop.run_in_executor(pool, do_gc) # At the same time, run UPDATEs and GC @@ -87,3 +97,81 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): r = cur.fetchone() assert r is not None assert r == (num_rows, updates_to_perform) + + +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind): + + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_gc_index_upload", + ) + + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_gc_index_upload", "main") + pg = env.postgres.create_start("test_gc_index_upload") + + pageserver_http = env.pageserver.http_client() + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # Sanity check that the metric works + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + before = get_num_remote_ops("index", "upload") + assert before > 0 + + # Run many cycles of GC. Then check that the number of index files + # uploads didn't grow much. In particular we don't want to re-upload the + # index file on every GC iteration, when it has no work to do. + # + # On each iteration, we use a slightly smaller GC horizon, so that the GC + # at least needs to check if it has work to do. + for i in range(100): + cur.execute("INSERT INTO foo VALUES (0, 0, 'foo')") + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000 - i * 32) + num_index_uploads = get_num_remote_ops("index", "upload") + + # Also make sure that a no-op compaction doesn't upload the index + # file unnecessarily. 
+ pageserver_http.timeline_compact(tenant_id, timeline_id) + + log.info(f"{num_index_uploads} index uploads after GC iteration {i}") + + after = num_index_uploads + log.info(f"{after-before} new index uploads during test") + assert after - before < 5 diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1a99d13a0b..0388e24e98 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -53,10 +53,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build unpacked_base = os.path.join(basebackup_dir, "unpacked-base") corrupt_base_tar = os.path.join(unpacked_base, "corrupt-base.tar") os.mkdir(unpacked_base, 0o750) - subprocess_capture(str(test_output_dir), ["tar", "-xf", base_tar, "-C", unpacked_base]) + subprocess_capture(test_output_dir, ["tar", "-xf", base_tar, "-C", unpacked_base]) os.remove(os.path.join(unpacked_base, "global/pg_control")) subprocess_capture( - str(test_output_dir), + test_output_dir, ["tar", "-cf", "corrupt-base.tar"] + os.listdir(unpacked_base), cwd=unpacked_base, ) @@ -306,6 +306,7 @@ def _import( # Check that gc works pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant, timeline) pageserver_http.timeline_gc(tenant, timeline, 0) return tar_output_file diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py new file mode 100644 index 0000000000..ac9f163801 --- /dev/null +++ b/test_runner/regress/test_metric_collection.py @@ -0,0 +1,157 @@ +import pytest +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PortDistributor, + RemoteStorageKind, + wait_for_last_flush_lsn, +) +from fixtures.types import TenantId, TimelineId +from fixtures.utils import query_scalar +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + +num_metrics_received = 0 +remote_uploaded = 0 +first_request = True + + +# +# verify that metrics look minilally sane +# +def metrics_handler(request: Request) -> Response: + if request.json is None: + return Response(status=400) + + events = request.json["events"] + log.info("received events:") + log.info(events) + + checks = { + "written_size": lambda value: value > 0, + "resident_size": lambda value: value >= 0, + # >= 0 check here is to avoid race condition when we receive metrics before + # remote_uploaded is updated + "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0, + # logical size may lag behind the actual size, so allow 0 here + "timeline_logical_size": lambda value: value >= 0, + } + + events_received = 0 + for event in events: + check = checks.get(event["metric"]) + # calm down mypy + if check is not None: + assert check(event["value"]), f"{event['metric']} isn't valid" + events_received += 1 + + global first_request + # check that all checks were sent + # but only on the first request, because we don't send non-changed metrics + if first_request: + # we may receive more metrics than we check, + # because there are two timelines + # and we may receive per-timeline metrics from both + # if the test was slow enough for these metrics to be collected + # -1 because that is ok to not receive 
timeline_logical_size + assert events_received >= len(checks) - 1 + first_request = False + + global num_metrics_received + num_metrics_received += 1 + return Response(status=200) + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS] +) +def test_metric_collection( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, + remote_storage_kind: RemoteStorageKind, +): + (host, port) = httpserver_listen_address + metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" + + # Disable time-based pitr, we will use the manual GC calls + # to trigger remote storage operations in a controlled way + neon_env_builder.pageserver_config_override = ( + f""" + metric_collection_interval="60s" + metric_collection_endpoint="{metric_collection_endpoint}" + """ + + "tenant_config={pitr_interval = '0 sec'}" + ) + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_metric_collection", + ) + + log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") + + # mock http server that returns OK for the metrics + httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler( + metrics_handler + ) + + # spin up neon, after http server is ready + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_metric_collection") + pg = env.postgres.create_start("test_metric_collection") + + pg_conn = pg.connect() + cur = pg_conn.cursor() + + tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id")) + timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) + + cur.execute("CREATE TABLE foo (id int, counter int, t text)") + cur.execute( + """ + INSERT INTO foo + SELECT g, 0, 'long string to consume some space' || g + FROM generate_series(1, 100000) g + """ + ) + + # Helper function that gets the number of given kind of remote ops from the metrics + def get_num_remote_ops(file_kind: str, op_kind: str) -> int: + ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver") + total = 0.0 + for sample in ps_metrics.query_all( + name="pageserver_remote_operation_seconds_count", + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ): + total += sample[2] + return int(total) + + # upload some data to remote storage + if remote_storage_kind == RemoteStorageKind.LOCAL_FS: + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http = env.pageserver.http_client() + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + pageserver_http.timeline_gc(tenant_id, timeline_id, 10000) + global remote_uploaded + remote_uploaded = get_num_remote_ops("index", "upload") + assert remote_uploaded > 0 + + # check that all requests are served + httpserver.check() + global num_metrics_received + assert num_metrics_received > 0, "no metrics were received" diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py new file mode 100644 index 0000000000..6c7cdb6f7f --- /dev/null +++ b/test_runner/regress/test_neon_local_cli.py @@ -0,0 +1,10 @@ +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Test that neon cli is able to start and stop all processes with the user defaults. 
+# def test_neon_cli_basics(neon_simple_env: NeonEnv): +def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init() + + env.neon_cli.start() + env.neon_cli.stop() diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 3e387bb6cc..9885a811e1 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -45,7 +45,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # will cause GetPage requests. cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) @@ -59,6 +59,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Make a lot of updates on a single row, generating a lot of WAL. Trigger # garbage collections so that the page server will remove old page versions. for i in range(10): + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) print_gc_result(gc_result) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py new file mode 100644 index 0000000000..352ae4b95c --- /dev/null +++ b/test_runner/regress/test_ondemand_download.py @@ -0,0 +1,437 @@ +# It's possible to run any regular test with the local fs remote storage via +# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... + +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + available_remote_storages, + wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, + wait_for_upload, + wait_until, +) +from fixtures.types import Lsn +from fixtures.utils import query_scalar + + +def get_num_downloaded_layers(client, tenant_id, timeline_id): + value = client.get_metric_value( + f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}' + ) + if value is None: + return 0 + return int(value) + + +# +# If you have a large relation, check that the pageserver downloads parts of it as +# require by queries. 
+# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_large_rel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_large_rel", + ) + + ##### First start, insert secret data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # disable background GC + "gc_period": "10 m", + "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + "compaction_threshold": "3", + "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + # We want to make sure that the data is large enough that the keyspace is partitioned. + num_rows = 1000000 + + with pg.cursor() as cur: + # data loading may take a while, so increase statement timeout + cur.execute("SET statement_timeout='300s'") + cur.execute( + f"""CREATE TABLE tbl AS SELECT g as id, 'long string to consume some space' || g + from generate_series(1,{num_rows}) g""" + ) + cur.execute("CREATE INDEX ON tbl (id)") + cur.execute("VACUUM tbl") + + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data + pg.stop() + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start() + + pg.start() + before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + + # Probe in the middle of the table. There's a high chance that the beginning + # and end of the table was stored together in the same layer files with data + # from other tables, and with the entry that stores the size of the + # relation, so they are likely already downloaded. But the middle of the + # table should not have been needed by anything yet. + with pg.cursor() as cur: + assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") + assert after_downloads > before_downloads + + +# +# If you have a relation with a long history of updates,the pageserver downloads the layer +# files containing the history as needed by timetravel queries. 
+# +@pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) +def test_ondemand_download_timetravel( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_timetravel", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do explicit checkpoint. + "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + lsns = [] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + lsns.append((0, current_lsn)) + + for checkpoint_number in range(1, 20): + with pg.cursor() as cur: + cur.execute(f"UPDATE testtab SET checkpoint_number = {checkpoint_number}") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + lsns.append((checkpoint_number, current_lsn)) + + # wait until pageserver receives that data + wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + client.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + wait_for_upload(client, tenant_id, timeline_id, current_lsn) + log.info("uploads have finished") + + ##### Stop the first pageserver instance, erase all its data + env.postgres.stop_all() + + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + ##### Second 
start, restore the data and ensure it's the same + env.pageserver.start() + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + # current_physical_size reports sum of layer file sizes, regardless of local or remote + assert filled_current_physical == get_api_current_physical_size() + + num_layers_downloaded = [0] + physical_size = [get_resident_physical_size()] + for (checkpoint_number, lsn) in lsns: + pg_old = env.postgres.create_start( + branch_name="main", node_name=f"test_old_lsn_{checkpoint_number}", lsn=lsn + ) + with pg_old.cursor() as cur: + # assert query_scalar(cur, f"select count(*) from testtab where checkpoint_number={checkpoint_number}") == 100000 + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number<>{checkpoint_number}", + ) + == 0 + ) + assert ( + query_scalar( + cur, + f"select count(*) from testtab where checkpoint_number={checkpoint_number}", + ) + == table_len + ) + + after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + num_layers_downloaded.append(after_downloads) + log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") + + # Check that on each query, we need to download at least one more layer file. However in + # practice, thanks to compaction and the fact that some requests need to download + # more history, some points-in-time are covered by earlier downloads already. But + # in broad strokes, as we query more points-in-time, more layers need to be downloaded. + # + # Do a fuzzy check on that, by checking that after each point-in-time, we have downloaded + # more files than we had three iterations ago. + log.info(f"layers downloaded after checkpoint {checkpoint_number}: {after_downloads}") + if len(num_layers_downloaded) > 4: + assert after_downloads > num_layers_downloaded[len(num_layers_downloaded) - 4] + + # Likewise, assert that the physical_size metric grows as layers are downloaded + physical_size.append(get_resident_physical_size()) + log.info(f"physical_size[-1]={physical_size[-1]}") + if len(physical_size) > 4: + assert physical_size[-1] > physical_size[len(physical_size) - 4] + + # current_physical_size reports sum of layer file sizes, regardless of local or remote + assert filled_current_physical == get_api_current_physical_size() + + +# +# Ensure that the `download_remote_layers` API works +# +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_download_remote_layers_api( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_download_remote_layers_api", + ) + + ##### First start, insert data and upload it to the remote storage + env = neon_env_builder.init_start() + + # Override defaults, to create more layers + tenant, _ = env.neon_cli.create_tenant( + conf={ + # Disable background GC & compaction + # We don't want GC, that would break the assertion about num downloads. + # We don't want background compaction, we force a compaction every time we do explicit checkpoint. 
+ "gc_period": "0s", + "compaction_period": "0s", + # small checkpoint distance to create more delta layer files + "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB + "compaction_threshold": "1", + "image_creation_threshold": "1", + "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB + } + ) + env.initial_tenant = tenant + + pg = env.postgres.create_start("main") + + client = env.pageserver.http_client() + + tenant_id = pg.safe_psql("show neon.tenant_id")[0][0] + timeline_id = pg.safe_psql("show neon.timeline_id")[0][0] + + table_len = 10000 + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE testtab(id serial primary key, checkpoint_number int, data text); + INSERT INTO testtab (checkpoint_number, data) SELECT 0, 'data' FROM generate_series(1, {table_len}); + """ + ) + + env.postgres.stop_all() + + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver + ) + + def get_api_current_physical_size(): + d = client.timeline_detail(tenant_id, timeline_id) + return d["current_physical_size"] + + def get_resident_physical_size(): + return client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + filled_current_physical = get_api_current_physical_size() + log.info(filled_current_physical) + filled_size = get_resident_physical_size() + log.info(filled_size) + assert filled_current_physical == filled_size, "we don't yet do layer eviction" + + env.pageserver.stop() + + # remove all the layer files + # XXX only delete some of the layer files, to show that it really just downloads all the layers + for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"): + log.info(f"unlinking layer {layer}") + layer.unlink() + + # Shut down safekeepers before starting the pageserver. + # If we don't, the tenant's walreceiver handler will trigger + # the logical size computation task, and that downloads layers, + # which makes our assertions on size fail.
+ for sk in env.safekeepers: + sk.stop(immediate=True) + + ##### Second start, restore the data and ensure it's the same + env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) + env.pageserver.allowed_errors.extend( + [ + f".*download_all_remote_layers.*{tenant_id}.*{timeline_id}.*layer download failed.*remote-storage-download-pre-rename failpoint", + f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size", + ] + ) + + wait_until(10, 0.2, lambda: assert_tenant_status(client, tenant_id, "Active")) + + ###### Phase 1: exercise download error code path + assert ( + filled_current_physical == get_api_current_physical_size() + ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + post_unlink_size = get_resident_physical_size() + log.info(post_unlink_size) + assert ( + post_unlink_size < filled_size + ), "we just deleted layers and didn't cause anything to re-download them yet" + assert filled_size - post_unlink_size > 5 * ( + 1024**2 + ), "we may be downloading some layers as part of tenant activation" + + # issue downloads that we know will fail + info = client.timeline_download_remote_layers( + tenant_id, timeline_id, errors_ok=True, at_least_one_download=False + ) + log.info(f"info={info}") + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] == 0 + assert ( + info["failed_download_count"] > 0 + ) # can't assert == total_layer_count because attach + tenant status downloads some layers + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + assert get_api_current_physical_size() == filled_current_physical + assert ( + get_resident_physical_size() == post_unlink_size + ), "didn't download anything new due to failpoint" + # would be nice to assert that the layers in the layer map are still RemoteLayer + + ##### Retry, this time without failpoints + client.configure_failpoints(("remote-storage-download-pre-rename", "off")) + info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False) + log.info(f"info={info}") + + assert info["state"] == "Completed" + assert info["total_layer_count"] > 0 + assert info["successful_download_count"] > 0 + assert info["failed_download_count"] == 0 + assert ( + info["total_layer_count"] + == info["successful_download_count"] + info["failed_download_count"] + ) + + refilled_size = get_resident_physical_size() + log.info(refilled_size) + + assert filled_size == refilled_size, "we redownloaded all the layers" + assert get_api_current_physical_size() == filled_current_physical + + for sk in env.safekeepers: + sk.start() + + # ensure that all the data is back + pg_old = env.postgres.create_start(branch_name="main") + with pg_old.cursor() as cur: + assert query_scalar(cur, "select count(*) from testtab") == table_len diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index e48815906b..6388e979e5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -32,7 +32,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as 
tbl_size from pg_settings where name = 'shared_buffers' """ ) @@ -115,7 +115,7 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index d8b7256577..fe4fbc0927 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -52,6 +52,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # run GC with env.pageserver.http_client() as pageserver_http: + pageserver_http.timeline_checkpoint(env.initial_tenant, timeline) pageserver_http.timeline_compact(env.initial_tenant, timeline) # perform aggressive GC. Data still should be kept because of the PITR setting. gc_result = pageserver_http.timeline_gc(env.initial_tenant, timeline, 0) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index e868d6b616..e13ba51f4b 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -28,61 +28,63 @@ def test_password_hack(static_proxy: NeonProxy): static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) -def get_session_id(uri_prefix, uri_line): - assert uri_prefix in uri_line - - url_parts = urlparse(uri_line) - psql_session_id = url_parts.path[1:] - assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars" - - return psql_session_id - - -async def find_auth_link(link_auth_uri_prefix, proc): - for _ in range(100): - line = (await proc.stderr.readline()).decode("utf-8").strip() - log.info(f"psql line: {line}") - if link_auth_uri_prefix in line: - log.info(f"SUCCESS, found auth url: {line}") - return line - - -async def activate_link_auth(local_vanilla_pg, link_proxy, psql_session_id): - pg_user = "proxy" - - log.info("creating a new user for link auth test") - local_vanilla_pg.start() - local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser") - - db_info = json.dumps( - { - "session_id": psql_session_id, - "result": { - "Success": { - "host": local_vanilla_pg.default_options["host"], - "port": local_vanilla_pg.default_options["port"], - "dbname": local_vanilla_pg.default_options["dbname"], - "user": pg_user, - "project": "irrelevant", - } - }, - } - ) - - log.info("sending session activation message") - psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) - out = (await psql.stdout.read()).decode("utf-8").strip() - assert out == "ok" - - @pytest.mark.asyncio async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy): + def get_session_id(uri_prefix, uri_line): + assert uri_prefix in uri_line + + url_parts = urlparse(uri_line) + psql_session_id = url_parts.path[1:] + assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars" + + return psql_session_id + + async def find_auth_link(link_auth_uri, proc): + for _ in range(100): + line = (await proc.stderr.readline()).decode("utf-8").strip() + log.info(f"psql line: {line}") + if link_auth_uri in line: + log.info(f"SUCCESS, found auth url: {line}") + return line + + async def activate_link_auth(local_vanilla_pg, link_proxy, psql_session_id): + pg_user = "proxy" + + log.info("creating a new user for link auth test") + 
local_vanilla_pg.start() + local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser") + + db_info = json.dumps( + { + "session_id": psql_session_id, + "result": { + "Success": { + "host": local_vanilla_pg.default_options["host"], + "port": local_vanilla_pg.default_options["port"], + "dbname": local_vanilla_pg.default_options["dbname"], + "user": pg_user, + "aux": { + "project_id": "project", + "endpoint_id": "endpoint", + "branch_id": "branch", + }, + } + }, + } + ) + + log.info("sending session activation message") + psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info) + assert psql.stdout is not None + out = (await psql.stdout.read()).decode("utf-8").strip() + assert out == "ok" + psql = await PSQL(host=link_proxy.host, port=link_proxy.proxy_port).run("select 42") - uri_prefix = link_proxy.link_auth_uri_prefix - link = await find_auth_link(uri_prefix, psql) + base_uri = link_proxy.link_auth_uri + link = await find_auth_link(base_uri, psql) - psql_session_id = get_session_id(uri_prefix, link) + psql_session_id = get_session_id(base_uri, link) await activate_link_auth(vanilla_pg, link_proxy, psql_session_id) assert psql.stdout is not None @@ -97,3 +99,61 @@ def test_proxy_options(static_proxy: NeonProxy): cur.execute("SHOW proxytest.option") value = cur.fetchall()[0][0] assert value == "value" + + +def test_auth_errors(static_proxy: NeonProxy): + # User does not exist + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + static_proxy.safe_psql( + "create role pinocchio with login password 'magic'", options="project=irrelevant" + ) + + # User exists, but password is missing + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + # User exists, but password is wrong + with pytest.raises(psycopg2.Error) as exprinfo: + static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant") + text = str(exprinfo.value).strip() + assert text.endswith("password authentication failed for user 'pinocchio'") + + # Finally, check that the user can connect + with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"): + pass + + +def test_forward_params_to_client(static_proxy: NeonProxy): + # A subset of parameters (GUCs) which postgres + # sends to the client during connection setup. + # Unfortunately, `GUC_REPORT` can't be queried. + # Proxy *should* forward them, otherwise client library + # might misbehave (e.g. parse timestamps incorrectly). + reported_params_subset = [ + "client_encoding", + "integer_datetimes", + "is_superuser", + "server_encoding", + "server_version", + "session_authorization", + "standard_conforming_strings", + ] + + query = """ + select name, setting + from pg_catalog.pg_settings + where name = any(%s) + """ + + with static_proxy.connect(options="project=irrelevant") as conn: + with conn.cursor() as cur: + cur.execute(query, (reported_params_subset,)) + for name, value in cur.fetchall(): + # Check that proxy has forwarded this parameter. 
+ assert conn.get_parameter_status(name) == value diff --git a/test_runner/performance/test_read_trace.py b/test_runner/regress/test_read_trace.py similarity index 60% rename from test_runner/performance/test_read_trace.py rename to test_runner/regress/test_read_trace.py index a5bd0b8de6..1b00b272c2 100644 --- a/test_runner/performance/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,10 +1,14 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import query_scalar # This test demonstrates how to collect a read trace. It's useful until # it gets replaced by a test that actually does stuff with the trace. +# +# Additionally, tests that pageserver is able to create tenants with custom configs. def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 1 env = neon_env_builder.init_start() @@ -23,6 +27,12 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder): cur.execute("create table t (i integer);") cur.execute(f"insert into t values (generate_series(1,{10000}));") cur.execute("select count(*) from t;") + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + # wait until pageserver receives that data + pageserver_http = env.pageserver.http_client() + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) # Stop pg so we drop the connection and flush the traces pg.stop() diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 550ad43fc9..32c25b2e8c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -14,11 +14,11 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, available_remote_storages, wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import print_gc_result, query_scalar, wait_until @@ -55,10 +55,15 @@ def test_remote_storage_backup_and_restore( test_name="test_remote_storage_backup_and_restore", ) - data_id = 1 - data_secret = "very secret secret" + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. + neon_env_builder.pageserver_config_override = "test_remote_failures=1" - ##### First start, insert secret data and upload it to the remote storage + data_id = 1 + data = "just some data" + + ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() # FIXME: Is this expected? @@ -71,8 +76,11 @@ def test_remote_storage_backup_and_restore( # FIXME retry downloads without throwing errors env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") # we have a bunch of pytest.raises for these below - env.pageserver.allowed_errors.append(".*tenant already exists.*") - env.pageserver.allowed_errors.append(".*attach is already in progress.*") + env.pageserver.allowed_errors.append(".*tenant .*? 
already exists, state:.*") + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) + env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -84,22 +92,12 @@ def test_remote_storage_backup_and_restore( checkpoint_numbers = range(1, 3) - # On the first iteration, exercise retry code path by making the uploads - # fail for the first 3 times - action = "3*return->off" - pageserver_http.configure_failpoints( - [ - ("before-upload-layer", action), - ("before-upload-index", action), - ] - ) - for checkpoint_number in checkpoint_numbers: with pg.cursor() as cur: cur.execute( f""" - CREATE TABLE t{checkpoint_number}(id int primary key, secret text); - INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data_secret}|{checkpoint_number}'); + CREATE TABLE t{checkpoint_number}(id int primary key, data text); + INSERT INTO t{checkpoint_number} VALUES ({data_id}, '{data}|{checkpoint_number}'); """ ) current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -115,6 +113,14 @@ def test_remote_storage_backup_and_restore( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the first pageserver instance, erase all its data env.postgres.stop_all() env.pageserver.stop() @@ -126,38 +132,53 @@ def test_remote_storage_backup_and_restore( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - # Introduce failpoint in download - pageserver_http.configure_failpoints(("remote-storage-download-pre-rename", "return")) - + # Introduce failpoint in list remote timelines code path to make tenant_attach fail. + # This is before the failures injected by test_remote_failures, so it's a permanent error. + pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) + env.pageserver.allowed_errors.append( + ".*error attaching tenant: storage-sync-list-remote-timelines", + ) + # Attach it. This HTTP request will succeed and launch a + # background task to load the tenant. In that background task, + # listing the remote timelines will fail because of the failpoint, + # and the tenant will be marked as Broken. client.tenant_attach(tenant_id) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15) - # is there a better way to assert that failpoint triggered? - time.sleep(10) - - # assert cannot attach timeline that is scheduled for download - # FIXME implement layer download retries - with pytest.raises(Exception, match="tenant already exists, current state: Broken"): + # Ensure that even though the tenant is broken, we can't attach it again. + with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): client.tenant_attach(tenant_id) - tenant_status = client.tenant_status(tenant_id) - log.info("Tenant status with active failpoint: %s", tenant_status) - # FIXME implement layer download retries - # assert tenant_status["has_in_progress_downloads"] is True - - # trigger temporary download files removal + # Restart again, this implicitly clears the failpoint. 
+ # test_remote_failures=1 remains active, though, as it's in the pageserver config. + # This means that any of the remote client operations after restart will exercise the + # retry code path. + # + # The initiated attach operation should survive the restart, and continue from where it was. env.pageserver.stop() + layer_download_failed_regex = ( + r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" + ) + assert not env.pageserver.log_contains( + layer_download_failed_regex + ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # ensure that an initiated attach operation survives pageserver restart - with pytest.raises( - Exception, match=r".*(tenant already exists|attach is already in progress).*" - ): + # Ensure that the pageserver remembers that the tenant was attaching, by + # trying to attach it again. It should fail. + with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): client.tenant_attach(tenant_id) - log.info("waiting for timeline redownload") + log.info("waiting for tenant to become active. this should be quick with on-demand download") + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + wait_until( - number_of_iterations=20, + number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=tenant_active, ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -165,16 +186,19 @@ def test_remote_storage_backup_and_restore( assert ( Lsn(detail["last_record_lsn"]) >= current_lsn ), "current db Lsn should should not be less than the one stored on remote storage" - assert not detail["awaits_download"] + log.info("select some data, this will cause layers to be downloaded") pg = env.postgres.create_start("main") with pg.cursor() as cur: for checkpoint_number in checkpoint_numbers: assert ( - query_scalar(cur, f"SELECT secret FROM t{checkpoint_number} WHERE id = {data_id};") - == f"{data_secret}|{checkpoint_number}" + query_scalar(cur, f"SELECT data FROM t{checkpoint_number} WHERE id = {data_id};") + == f"{data}|{checkpoint_number}" ) + log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + assert env.pageserver.log_contains(layer_download_failed_regex) + # Exercises the upload queue retry code paths. 
# - Use failpoints to cause all storage ops to fail @@ -191,7 +215,7 @@ def test_remote_storage_upload_queue_retries( neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_remote_storage_upload_queue_retries", ) env = neon_env_builder.init_start() @@ -334,7 +358,6 @@ def test_remote_storage_upload_queue_retries( def tenant_active(): all_states = client.tenant_list() [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["has_in_progress_downloads"] is False assert tenant["state"] == "Active" wait_until(30, 1, tenant_active) @@ -353,7 +376,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ): neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, - test_name="test_remote_storage_backup_and_restore", + test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", ) env = neon_env_builder.init_start() @@ -384,7 +407,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( metrics, re.MULTILINE, ) - assert matches + if matches is None: + return None return int(matches[1]) pg = env.postgres.create_start("main", tenant_id=tenant_id) @@ -436,8 +460,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert not timeline_path.exists() - # timeline deletion should kill ongoing uploads - assert get_queued_count(file_kind="index", op_kind="upload") == 0 + # timeline deletion should kill ongoing uploads, so, the metric will be gone + assert get_queued_count(file_kind="index", op_kind="upload") is None # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 46a945a58b..6d621fbb77 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -133,3 +133,28 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 2592000, }.items() ) + + # update the config with very short config and make sure no trailing chars are left from previous config + env.neon_cli.config_tenant( + tenant_id=tenant, + conf={ + "pitr_interval": "1 min", + }, + ) + + # restart the pageserver and ensure that the config is still correct + env.pageserver.stop() + env.pageserver.start() + + with closing(env.pageserver.connect()) as psconn: + with psconn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as pscur: + pscur.execute(f"show {tenant}") + res = pscur.fetchone() + log.info(f"after restart res: {res}") + assert all( + i in res.items() + for i in { + "compaction_period": 20, + "pitr_interval": 60, + }.items() + ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index bafddc7721..6963a57542 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,10 +7,12 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PageserverApiException, PageserverHttpClient, + Postgres, RemoteStorageKind, available_remote_storages, wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar @@ -22,6 +24,7 @@ def do_gc_target( """Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211""" try: log.info("sending gc http request") + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) 
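+        # The checkpoint above flushes the in-memory layer to disk, so the
+        # aggressive GC request below has up-to-date layer files to work on.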
         pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
     except Exception as e:
         log.error("do_gc failed: %s", e)
@@ -29,6 +32,58 @@ def do_gc_target(
         log.info("gc http thread returning")


+# Basic detach and re-attach test
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_reattach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_reattach",
+    )
+
+    # Exercise retry code path by making all uploads and downloads fail for the
+    # first time. The retries print INFO-messages to the log; we will check
+    # that they are present after the test.
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create a new tenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    pg = env.postgres.create_start("main", tenant_id=tenant_id)
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE t(key int primary key, value text)")
+        cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    # Wait for all the data to be processed by the pageserver and uploaded to remote storage
+    wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
+
+    # Check that we had to retry the uploads
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadLayer.*, will retry.*"
+    )
+    assert env.pageserver.log_contains(
+        ".*failed to perform remote task UploadMetadata.*, will retry.*"
+    )
+
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    with pg.cursor() as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM t") == 100000
+
+    # Check that we had to retry the downloads
+    assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*")
+    assert env.pageserver.log_contains(".*download.*failed, will retry.*")
+
+
 def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
@@ -167,3 +222,316 @@ def test_detach_while_attaching(

     with pg.cursor() as cur:
         cur.execute("SELECT COUNT(*) FROM foo")
+
+
+# Tests that the combination of `ignore` and `load` operations is able to remove and restore the tenant in pageserver's memory.
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * verify that ignored tenant files are generally unchanged, only an ignore mark has appeared
+# * verify the ignored tenant is gone from pageserver's memory
+# * restart the pageserver and verify that the ignored tenant is still not loaded
+# * `load` the same tenant
+# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
+def test_ignored_tenant_reattach(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_storage_backup_and_restore",
+    )
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    ignored_tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id)
+    tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_before_ignore.sort()
+    timelines_before_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+
+    # ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
+    pageserver_http.tenant_ignore(ignored_tenant_id)
+
+    files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
+    new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
+    disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
+    assert (
+        len(disappeared_files) == 0
+    ), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
+    assert (
+        len(new_files) == 1
+    ), f"Only tenant ignore file should appear on disk but got: {new_files}"
+
+    tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
+    assert len(tenants_after_ignore) + 1 == len(
+        tenants_before_ignore
+    ), "Only ignored tenant should be missing"
+
+    # restart the pageserver to ensure we don't load the ignored tenant
+    env.pageserver.stop()
+    env.pageserver.start()
+    tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_restart.sort()
+    assert (
+        tenants_after_restart == tenants_after_ignore
+    ), "Ignored tenant should not be reloaded after pageserver restart"
+
+    # now, load it from the local files and expect it to work
+    pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
+    wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
+
+    tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
+    tenants_after_attach.sort()
+    assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
+
+    timelines_after_ignore = [
+        timeline["timeline_id"]
+        for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
+    ]
+    assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
+
+
+# Tests that it's possible to `load` tenants with missing layers and get them restored:
+# * writes some data into tenant's timeline
+# * ensures it's synced with the remote storage
+# * `ignore` the tenant
+# * removes all 
timeline's local layers +# * `load` the same tenant +# * ensure that it's status is `Active` +# * check that timeline data is restored +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignored_tenant_download_missing_layers( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignored_tenant_download_and_attach", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + data_id = 1 + data_secret = "very secret secret" + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + + tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_before_ignore.sort() + timelines_before_ignore = [ + timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) + ] + + # ignore the tenant and remove its layers + pageserver_http.tenant_ignore(tenant_id) + tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + layers_removed = False + for dir_entry in tenant_timeline_dir.iterdir(): + if dir_entry.name.startswith("00000"): + # Looks like a layer file. Remove it + dir_entry.unlink() + layers_removed = True + assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}" + + # now, load it from the local files and expect it to work due to remote storage restoration + pageserver_http.tenant_load(tenant_id=tenant_id) + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + + tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_after_attach.sort() + assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back" + + timelines_after_ignore = [ + timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id) + ] + assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back" + + pg.stop() + pg.start() + ensure_test_data(data_id, data_secret, pg) + + +# Tests that it's possible to `load` broken tenants: +# * `ignore` a tenant +# * removes its `metadata` file locally +# * `load` the same tenant +# * ensure that it's status is `Broken` +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignored_tenant_stays_broken_without_metadata( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignored_tenant_stays_broken_without_metadata", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + # ignore the tenant and remove its metadata + pageserver_http.tenant_ignore(tenant_id) + tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + metadata_removed = False + for dir_entry in tenant_timeline_dir.iterdir(): + if dir_entry.name == "metadata": + # Looks like a layer file. 
Remove it + dir_entry.unlink() + metadata_removed = True + assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" + + env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*") + + # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory + pageserver_http.tenant_load(tenant_id=tenant_id) + wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 5) + + +# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally +# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_load_attach_negatives( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_load_attach_negatives", + ) + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + + env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") + with pytest.raises( + expected_exception=PageserverApiException, + match=f"tenant {tenant_id} already exists, state: Active", + ): + pageserver_http.tenant_load(tenant_id) + + with pytest.raises( + expected_exception=PageserverApiException, + match=f"tenant {tenant_id} already exists, state: Active", + ): + pageserver_http.tenant_attach(tenant_id) + + pageserver_http.tenant_ignore(tenant_id) + + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + ): + pageserver_http.tenant_attach(tenant_id) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ignore_while_attaching( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ignore_while_attaching", + ) + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + pageserver_http = env.pageserver.http_client() + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + data_id = 1 + data_secret = "very secret secret" + insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg) + + tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + + # Detach it + pageserver_http.tenant_detach(tenant_id) + # And re-attach, but stop attach task_mgr task from completing + pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.tenant_attach(tenant_id) + # Run ignore on the task, thereby cancelling the attach. + # XXX This should take priority over attach, i.e., it should cancel the attach task. + # But neither the failpoint, nor the proper storage_sync download functions, + # are sensitive to task_mgr::shutdown. + # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . + # So, for now, effectively, this ignore here will block until attach task completes. 
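(Editor's aside) The failpoint strings used just above and in the remote-storage tests earlier in this diff follow a small action grammar: `"return"` fires on every hit, `"return(5000)"` passes a value to the pageserver-side hook, `"off"` clears the failpoint, `"pause"` blocks at the failpoint until it is turned off, and `"3*return->off"` fails the first three hits and then switches off. A minimal sketch of how the tests drive them, using only calls that already appear in these files; the test name itself is made up for illustration:

```python
from fixtures.neon_fixtures import NeonEnvBuilder


def test_failpoint_toggling_sketch(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

    # a single failpoint, triggered on every hit
    client.configure_failpoints(("remote-storage-download-pre-rename", "return"))

    # several failpoints at once; fail the first 3 hits, then behave normally
    client.configure_failpoints(
        [
            ("before-upload-layer", "3*return->off"),
            ("before-upload-index", "3*return->off"),
        ]
    )

    # clear a failpoint again
    client.configure_failpoints(("remote-storage-download-pre-rename", "off"))
```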
+ pageserver_http.tenant_ignore(tenant_id) + + # Cannot attach it due to some local files existing + env.pageserver.allowed_errors.append( + ".*Cannot attach tenant .*?, local tenant directory already exists.*" + ) + with pytest.raises( + expected_exception=PageserverApiException, + match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists", + ): + pageserver_http.tenant_attach(tenant_id) + + tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" + assert len(tenants_after_ignore) + 1 == len( + tenants_before_ignore + ), "Only ignored tenant should be missing" + + # But can load it from local files, that will restore attach. + pageserver_http.tenant_load(tenant_id) + + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + + pg.stop() + pg.start() + ensure_test_data(data_id, data_secret, pg) + + +def insert_test_data( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + data_id: int, + data: str, + pg: Postgres, +): + with pg.cursor() as cur: + cur.execute( + f""" + CREATE TABLE test(id int primary key, secret text); + INSERT INTO test VALUES ({data_id}, '{data}'); + """ + ) + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # wait until pageserver receives that data + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + + # run checkpoint manually to be sure that data landed in remote storage + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # wait until pageserver successfully uploaded a checkpoint to remote storage + log.info("waiting for to be ignored tenant data checkpoint upload") + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + +def ensure_test_data(data_id: int, data: str, pg: Postgres): + with pg.cursor() as cur: + assert ( + query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data + ), "Should have timeline data back" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index c4b3b28f34..1b58937e2a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,18 +7,21 @@ from typing import Any, Dict, Optional, Tuple import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - Etcd, + NeonBroker, NeonEnv, NeonEnvBuilder, PageserverHttpClient, PortDistributor, Postgres, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, + tenant_exists, wait_for_last_record_lsn, wait_for_upload, + wait_until, + wait_while, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar, start_in_background, subprocess_capture, wait_until +from fixtures.utils import query_scalar, start_in_background, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): @@ -32,7 +35,7 @@ def new_pageserver_service( remote_storage_mock_path: Path, pg_port: int, http_port: int, - broker: Optional[Etcd], + broker: Optional[NeonBroker], pg_distrib_dir: Path, ): """ @@ -53,7 +56,7 @@ def new_pageserver_service( ] if broker is not None: cmd.append( - f"-c broker_endpoints=['{broker.client_url()}']", + f"-c broker_endpoint='{broker.client_url()}'", ) pageserver_client = PageserverHttpClient( port=http_port, @@ -406,17 +409,13 @@ def test_tenant_relocation( # call to attach timeline to new pageserver 
new_pageserver_http.tenant_attach(tenant_id) - # check that it shows that download is in progress + # wait for tenant to finish attaching tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id) - assert tenant_status.get("has_in_progress_downloads"), tenant_status - - # wait until tenant is downloaded + assert tenant_status["state"] in ["Attaching", "Active"] wait_until( number_of_iterations=10, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant( - new_pageserver_http, tenant_id - ), + func=lambda: assert_tenant_status(new_pageserver_http, tenant_id, "Active"), ) check_timeline_attached( @@ -459,9 +458,15 @@ def test_tenant_relocation( # detach tenant from old pageserver before we check # that all the data is there to be sure that old pageserver - # is no longer involved, and if it is, we will see the errors + # is no longer involved, and if it is, we will see the error pageserver_http.tenant_detach(tenant_id) + # Wait a little, so that the detach operation has time to finish. + wait_while( + number_of_iterations=100, + interval=1, + func=lambda: tenant_exists(pageserver_http, tenant_id), + ) post_migration_check(pg_main, 500500, old_local_path_main) post_migration_check(pg_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index ddae1a67ff..4eba4ce942 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -20,44 +20,48 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): matching = [t for t in all_states if TenantId(t["id"]) == tenant] return get_only_element(matching)["state"] - def get_metric_value(name): - metrics = client.get_metrics() - relevant = [line for line in metrics.splitlines() if line.startswith(name)] - if len(relevant) == 0: - return 0 - line = get_only_element(relevant) - value = line.lstrip(name).strip() - return int(value) - def delete_all_timelines(tenant: TenantId): timelines = [TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)] for t in timelines: client.timeline_delete(tenant, t) + def assert_active(tenant): + assert get_state(tenant) == "Active" + # Create tenant, start compute tenant, _ = env.neon_cli.create_tenant() env.neon_cli.create_timeline(name, tenant_id=tenant) pg = env.postgres.create_start(name, tenant_id=tenant) + assert ( + get_state(tenant) == "Active" + ), "Pageserver should activate a tenant and start background jobs if timelines are loaded" # Stop compute pg.stop() - # Delete all timelines on all tenants + # Delete all timelines on all tenants. + # + # FIXME: we used to check that the background jobs are stopped when all timelines + # are removed, but we don't stop them anymore. Not sure if this test still makes sense + # or we should just remove it. 
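(Editor's aside) Nearly every asynchronous check in these tests goes through the `wait_until(number_of_iterations, interval, func)` helper from the test fixtures, as in the `wait_until(10, 0.2, lambda: assert_active(tenant_id))` call in this hunk. For readers unfamiliar with it, here is a minimal sketch of the poll-and-retry pattern it implements; this is an illustration only, not the actual fixtures implementation:

```python
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def wait_until_sketch(number_of_iterations: int, interval: float, func: Callable[[], T]) -> T:
    """Keep calling `func` until it stops raising, sleeping `interval` seconds
    between attempts; give up after `number_of_iterations` tries."""
    last_exception: Optional[Exception] = None
    for _ in range(number_of_iterations):
        try:
            return func()
        except Exception as e:
            last_exception = e
            time.sleep(interval)
    raise RuntimeError(f"timed out waiting for condition: {last_exception}")
```

Callers pass assertion-style functions such as `assert_active` or `assert_tasks_finish`, so an ordinary `AssertionError` simply means "not yet" and triggers another iteration.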
for tenant_info in client.tenant_list(): tenant_id = TenantId(tenant_info["id"]) delete_all_timelines(tenant_id) + wait_until(10, 0.2, lambda: assert_active(tenant_id)) # Assert that all tasks finish quickly after tenant is detached - assert get_metric_value('pageserver_tenant_task_events{event="start"}') > 0 + task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + assert task_starts is not None + assert int(task_starts) > 0 client.tenant_detach(tenant) client.tenant_detach(env.initial_tenant) def assert_tasks_finish(): - tasks_started = get_metric_value('pageserver_tenant_task_events{event="start"}') - tasks_ended = get_metric_value('pageserver_tenant_task_events{event="stop"}') - tasks_panicked = get_metric_value('pageserver_tenant_task_events{event="panic"}') + tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}') + tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}') + tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}') log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}") assert tasks_started == tasks_ended - assert tasks_panicked == 0 + assert tasks_panicked is None or int(tasks_panicked) == 0 wait_until(10, 0.2, assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0b20afefc3..9477ae3c25 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -7,7 +7,11 @@ from typing import List import pytest from fixtures.log_helper import log -from fixtures.metrics import PAGESERVER_PER_TENANT_METRICS, parse_metrics +from fixtures.metrics import ( + PAGESERVER_PER_TENANT_METRICS, + PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, + parse_metrics, +) from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -157,9 +161,21 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): ) -def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "remote_storage_kind", + # exercise both the code paths where remote_storage=None and remote_storage=Some(...) 
+ [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3], +) +def test_pageserver_metrics_removed_after_detach( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): """Tests that when a tenant is detached, the tenant specific metrics are not left behind""" + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_pageserver_metrics_removed_after_detach", + ) + neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -192,7 +208,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde for tenant in [tenant_1, tenant_2]: pre_detach_samples = set([x.name for x in get_ps_metric_samples_for_tenant(tenant)]) - assert pre_detach_samples == set(PAGESERVER_PER_TENANT_METRICS) + expected = set(PAGESERVER_PER_TENANT_METRICS) + if remote_storage_kind == RemoteStorageKind.NOOP: + # if there's no remote storage configured, we don't expose the remote timeline client metrics + expected -= set(PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS) + assert pre_detach_samples == expected env.pageserver.http_client().tenant_detach(tenant) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 76639e4055..6da6a4d446 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -21,9 +21,10 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_no_in_progress_downloads_for_tenant, + assert_tenant_status, available_remote_storages, wait_for_last_record_lsn, + wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -120,6 +121,11 @@ def test_tenants_attached_after_download( data_id = 1 data_secret = "very secret secret" + # Exercise retry code path by making all uploads and downloads fail for the + # first time. The retries print INFO-messages to the log; we will check + # that they are present after the test. 
+ neon_env_builder.pageserver_config_override = "test_remote_failures=1" + ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() @@ -158,26 +164,19 @@ def test_tenants_attached_after_download( wait_for_upload(client, tenant_id, timeline_id, current_lsn) log.info(f"upload of checkpoint {checkpoint_number} is done") + # Check that we had to retry the uploads + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadLayer.*, will retry.*" + ) + assert env.pageserver.log_contains( + ".*failed to perform remote task UploadMetadata.*, will retry.*" + ) + ##### Stop the pageserver, erase its layer file to force it being downloaded from S3 env.postgres.stop_all() - sk_commit_lsns = [ - sk.http_client().timeline_status(tenant_id, timeline_id).commit_lsn - for sk in env.safekeepers - ] - log.info("wait for pageserver to process all the WAL") - wait_for_last_record_lsn(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("wait for it to reach remote storage") - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(client, tenant_id, timeline_id, max(sk_commit_lsns)) - log.info("latest safekeeper_commit_lsn reached remote storage") - - detail_before = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert ( - detail_before["current_physical_size_non_incremental"] - == detail_before["current_physical_size"] + wait_for_sk_commit_lsn_to_reach_remote_storage( + tenant_id, timeline_id, env.safekeepers, env.pageserver ) env.pageserver.stop() @@ -193,13 +192,16 @@ def test_tenants_attached_after_download( assert local_layer_deleted, f"Found no local layer files to delete in directory {timeline_dir}" ##### Start the pageserver, forcing it to download the layer file and load the timeline into memory + # FIXME: just starting the pageserver no longer downloads the + # layer files. Do we want to force download, or maybe run some + # queries, or is it enough that it starts up without layer files? 
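(Editor's aside) On the FIXME above: one way to force the downloads after the restart, instead of relying on incidental reads, would be the `timeline_download_remote_layers` endpoint that the remote-layer download test earlier in this diff already exercises. A hedged sketch, assuming the same fixture types shown in these files; the helper name is mine and not part of this patch:

```python
from fixtures.neon_fixtures import PageserverHttpClient
from fixtures.types import TenantId, TimelineId


def force_layer_downloads(
    client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
) -> None:
    # Mirror the successful retry call from the remote-layer download test above:
    # every remote layer is expected to be (re)downloaded without errors.
    info = client.timeline_download_remote_layers(tenant_id, timeline_id, errors_ok=False)
    assert info["state"] == "Completed"
    assert info["failed_download_count"] == 0
```

Calling such a helper right after `env.pageserver.start()` below would also make the `"succeeded after 1 retries"` log assertion independent of which layers the subsequent queries happen to touch.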
env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -211,11 +213,8 @@ def test_tenants_attached_after_download( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" - # Check that the physical size matches after re-downloading - detail_after = client.timeline_detail( - tenant_id, timeline_id, include_non_incremental_physical_size=True - ) - assert detail_before["current_physical_size"] == detail_after["current_physical_size"] + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*download .* succeeded after 1 retries.*") @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) @@ -230,7 +229,7 @@ def test_tenant_upgrades_index_json_from_v0( "timeline_layers":[ "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9" ], - "missing_layers":[], + "missing_layers":["This should not fail as its not used anymore"], "disk_consistent_lsn":"0/16960E8", "metadata_bytes":[] }""" @@ -248,15 +247,6 @@ def test_tenant_upgrades_index_json_from_v0( # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*Failed to get local tenant state: Tenant .* not found in the local state.*" - ) - pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -271,7 +261,6 @@ def test_tenant_upgrades_index_json_from_v0( wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) - env.postgres.stop_all() env.pageserver.stop() @@ -284,7 +273,10 @@ def test_tenant_upgrades_index_json_from_v0( # keep the deserialized for later inspection orig_index_part = json.load(timeline_file) - v0_index_part = {key: orig_index_part[key] for key in v0_skeleton} + v0_index_part = { + key: orig_index_part[key] + for key in v0_skeleton.keys() - ["missing_layers"] # pgserver doesn't have it anymore + } timeline_file.seek(0) json.dump(v0_index_part, timeline_file) @@ -296,7 +288,7 @@ def test_tenant_upgrades_index_json_from_v0( wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(pageserver_http, tenant_id), + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), ) pg = env.postgres.create_start("main") @@ -316,7 +308,7 @@ def test_tenant_upgrades_index_json_from_v0( # make sure the file has been upgraded back to how it started index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["version"] == orig_index_part["version"] - assert index_part["missing_layers"] == orig_index_part["missing_layers"] + assert "missing_layers" not in index_part.keys() # expect one more layer because of the forced checkpoint assert len(index_part["timeline_layers"]) == len(orig_index_part["timeline_layers"]) + 1 @@ -352,6 
+344,80 @@ def test_tenant_upgrades_index_json_from_v0( # FIXME: test index_part.json getting downgraded from imaginary new version +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_tenant_ignores_backup_file( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + # getting a too eager compaction happening for this test would not play + # well with the strict assertions. + neon_env_builder.pageserver_config_override = "tenant_config.compaction_period='1h'" + + neon_env_builder.enable_remote_storage(remote_storage_kind, "test_tenant_ignores_backup_file") + + # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, + # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*got backup file on the remote storage, ignoring it.*") + + pageserver_http = env.pageserver.http_client() + pg = env.postgres.create_start("main") + + tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0]) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS VALUES (123, 'second column as text');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + # flush, wait until in remote storage + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + env.postgres.stop_all() + env.pageserver.stop() + + # change the remote file to have entry with .0.old suffix + timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id) + with open(timeline_path, "r+") as timeline_file: + # keep the deserialized for later inspection + orig_index_part = json.load(timeline_file) + backup_layer_name = orig_index_part["timeline_layers"][0] + ".0.old" + orig_index_part["timeline_layers"].append(backup_layer_name) + + timeline_file.seek(0) + json.dump(orig_index_part, timeline_file) + + env.pageserver.start() + pageserver_http = env.pageserver.http_client() + + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(pageserver_http, tenant_id, "Active"), + ) + + pg = env.postgres.create_start("main") + + with pg.cursor() as cur: + cur.execute("INSERT INTO t0 VALUES (234, 'test data');") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) + + # not needed anymore + env.postgres.stop_all() + env.pageserver.stop() + + # the .old file is gone from newly serialized index_part + new_index_part = local_fs_index_part(env, tenant_id, timeline_id) + backup_layers = filter(lambda x: x.endswith(".old"), new_index_part["timeline_layers"]) + assert len(list(backup_layers)) == 0 + + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_tenant_redownloads_truncated_file_on_startup( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind @@ -409,14 +475,15 @@ def test_tenant_redownloads_truncated_file_on_startup( index_part = local_fs_index_part(env, tenant_id, timeline_id) assert index_part["layer_metadata"][path.name]["file_size"] == expected_size - ##### Start 
the pageserver, forcing it to download the layer file and load the timeline into memory + ## Start the pageserver. It will notice that the file size doesn't match, and + ## rename away the local file. It will be re-downloaded when it's needed. env.pageserver.start() client = env.pageserver.http_client() wait_until( number_of_iterations=5, interval=1, - func=lambda: assert_no_in_progress_downloads_for_tenant(client, tenant_id), + func=lambda: assert_tenant_status(client, tenant_id, "Active"), ) restored_timelines = client.timeline_list(tenant_id) @@ -428,6 +495,10 @@ def test_tenant_redownloads_truncated_file_on_startup( timeline_id ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" + # Request non-incremental logical size. Calculating it needs the layer file that + # we corrupted, forcing it to be redownloaded. + client.timeline_detail(tenant_id, timeline_id, include_non_incremental_logical_size=True) + assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded" # the remote side of local_layer_truncated diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cef1f365cd..3b41cc5c90 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,22 +1,28 @@ import math +import queue import random import re +import threading import time from contextlib import closing from pathlib import Path import psycopg2.errors import psycopg2.extras +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PageserverApiException, PageserverHttpClient, PgBin, PortDistributor, Postgres, VanillaPostgres, + assert_tenant_status, wait_for_last_flush_lsn, + wait_until, ) from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size @@ -213,6 +219,89 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): ), "after the WAL is streamed, current_logical_size is expected to be calculated and to be equal its non-incremental value" +@pytest.mark.parametrize("deletion_method", ["tenant_detach", "timeline_delete"]) +def test_timeline_initial_logical_size_calculation_cancellation( + neon_env_builder: NeonEnvBuilder, deletion_method: str +): + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + # load in some data + pg = env.postgres.create_start("main", tenant_id=tenant_id) + pg.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", + ] + ) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pg.stop() + + # restart with failpoint inside initial size calculation task + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + def tenant_active(): + all_states = client.tenant_list() + [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] + assert tenant["state"] == "Active" + + wait_until(30, 1, tenant_active) + + # kick off initial size calculation task (the response we get here is the estimated size) + def assert_size_calculation_not_done(): + details = client.timeline_detail( + tenant_id, timeline_id, include_non_incremental_logical_size=True + ) + assert details["current_logical_size"] != details["current_logical_size_non_incremental"] + + assert_size_calculation_not_done() + # ensure we're really 
stuck + time.sleep(5) + assert_size_calculation_not_done() + + log.info( + f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" + ) + delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) + + def delete_timeline_thread_fn(): + try: + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + client.timeline_delete(tenant_id, timeline_id) + delete_timeline_success.put(True) + except PageserverApiException: + delete_timeline_success.put(False) + raise + + delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) + delete_timeline_thread.start() + # give it some time to settle in the state where it waits for size computation task + time.sleep(5) + if not delete_timeline_success.empty(): + assert ( + False + ), f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" + + log.info( + "resume the size calculation. The failpoint checks that the timeline directory still exists." + ) + client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) + client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + log.info("wait for delete timeline thread to finish and assert that it succeeded") + assert delete_timeline_success.get() + + # if the implementation is incorrect, the teardown would complain about an error log + # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + + def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env = neon_simple_env new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") @@ -233,7 +322,17 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv): env.pageserver.stop() env.pageserver.start() - assert_physical_size(env, env.initial_tenant, new_timeline_id) + # Wait for the tenant to be loaded + client = env.pageserver.http_client() + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(client, env.initial_tenant, "Active"), + ) + + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): @@ -254,7 +353,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -289,7 +390,9 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): @@ -326,10 +429,11 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id) 
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) - assert_physical_size(env, env.initial_tenant, new_timeline_id) + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id) + ) # The timeline logical and physical sizes are also exposed as prometheus metrics. @@ -362,7 +466,7 @@ def test_timeline_size_metrics( # get the metrics and parse the metric for the current timeline's physical size metrics = env.pageserver.http_client().get_metrics() matches = re.search( - f'^pageserver_current_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', + f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$', metrics, re.MULTILINE, ) @@ -421,11 +525,12 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): tenant, timeline = env.neon_cli.create_tenant() - def get_timeline_physical_size(timeline: TimelineId): - res = client.timeline_detail(tenant, timeline, include_non_incremental_physical_size=True) - return res["current_physical_size_non_incremental"] + def get_timeline_resident_physical_size(timeline: TimelineId): + sizes = get_physical_size_values(env, tenant, timeline) + assert_physical_size_invariants(sizes) + return sizes.prometheus_resident_physical - timeline_total_size = get_timeline_physical_size(timeline) + timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline) for i in range(10): n_rows = random.randint(100, 1000) @@ -442,22 +547,54 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, pg, tenant, timeline) pageserver_http.timeline_checkpoint(tenant, timeline) - timeline_total_size += get_timeline_physical_size(timeline) + timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) pg.stop() - tenant_physical_size = int(client.tenant_status(tenant_id=tenant)["current_physical_size"]) - assert tenant_physical_size == timeline_total_size + # ensure that tenant_status current_physical size reports sum of timeline current_physical_size + tenant_current_physical_size = int( + client.tenant_status(tenant_id=tenant)["current_physical_size"] + ) + assert tenant_current_physical_size == sum( + [tl["current_physical_size"] for tl in client.timeline_list(tenant_id=tenant)] + ) + # since we don't do layer eviction, current_physical_size is identical to resident physical size + assert timeline_total_resident_physical_size == tenant_current_physical_size -def assert_physical_size(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): - """Check the current physical size returned from timeline API - matches the total physical size of the timeline on disk""" +class TimelinePhysicalSizeValues: + api_current_physical: int + prometheus_resident_physical: int + python_timelinedir_layerfiles_physical: int + + +def get_physical_size_values( + env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId +) -> TimelinePhysicalSizeValues: + res = TimelinePhysicalSizeValues() + client = env.pageserver.http_client() - res = client.timeline_detail(tenant_id, timeline_id, include_non_incremental_physical_size=True) + + res.prometheus_resident_physical = client.get_timeline_metric( + tenant_id, timeline_id, "pageserver_resident_physical_size" + ) + + detail = client.timeline_detail( + tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True + ) + 
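+    # "current_physical_size" from the timeline detail API; assert_physical_size_invariants()
+    # checks it against the prometheus resident-size metric and the on-disk layer file sum.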
+    res.api_current_physical = detail["current_physical_size"]
+
     timeline_path = env.timeline_dir(tenant_id, timeline_id)
-    assert res["current_physical_size"] == res["current_physical_size_non_incremental"]
-    assert res["current_physical_size"] == get_timeline_dir_size(timeline_path)
+    res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
+
+    return res
+
+
+def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
+    # resident physical size is defined as the sum of layer file sizes in the timeline directory
+    assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
+    # we don't do layer eviction, so, all layers are resident
+    assert sizes.api_current_physical == sizes.prometheus_resident_physical
 
 
 # Timeline logical size initialization is an asynchronous background task that runs once,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 3945376e5e..77ec33f8b0 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -16,7 +16,7 @@ from typing import Any, List, Optional
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
-    Etcd,
+    NeonBroker,
     NeonEnv,
     NeonEnvBuilder,
     NeonPageserver,
@@ -520,7 +520,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     )
 
     # advance remote_consistent_lsn to trigger WAL trimming
-    # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push etcd updates
+    # this LSN should be less than commit_lsn, so timeline will be active=true in safekeepers, to push broker updates
     env.safekeepers[0].http_client().record_safekeeper_info(
         tenant_id, timeline_id, {"remote_consistent_lsn": str(offloaded_seg_end)}
     )
@@ -585,17 +585,23 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
         if elapsed > wait_lsn_timeout:
             raise RuntimeError("Timed out waiting for WAL redo")
 
-        pageserver_lsn = Lsn(
-            env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)["last_record_lsn"]
-        )
-        lag = last_lsn - pageserver_lsn
+        tenant_status = ps_cli.tenant_status(tenant_id)
+        if tenant_status["state"] == "Loading":
+            log.debug(f"Tenant {tenant_id} is still loading, retrying")
+        else:
+            pageserver_lsn = Lsn(
+                env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+                    "last_record_lsn"
+                ]
+            )
+            lag = last_lsn - pageserver_lsn
 
-        if time.time() > last_debug_print + 10 or lag <= 0:
-            last_debug_print = time.time()
-            log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
+            if time.time() > last_debug_print + 10 or lag <= 0:
+                last_debug_print = time.time()
+                log.info(f"Pageserver last_record_lsn={pageserver_lsn}; lag is {lag / 1024}kb")
 
-        if lag <= 0:
-            break
+            if lag <= 0:
+                break
 
         time.sleep(1)
 
@@ -812,10 +818,10 @@ class SafekeeperEnv:
     ):
         self.repo_dir = repo_dir
         self.port_distributor = port_distributor
-        self.broker = Etcd(
-            datadir=os.path.join(self.repo_dir, "etcd"),
+        self.broker = NeonBroker(
+            logfile=Path(self.repo_dir) / "storage_broker.log",
             port=self.port_distributor.get_port(),
-            peer_port=self.port_distributor.get_port(),
+            neon_binpath=neon_binpath,
         )
         self.pg_bin = pg_bin
         self.num_safekeepers = num_safekeepers
@@ -863,7 +869,7 @@ class SafekeeperEnv:
            str(safekeeper_dir),
            "--id",
            str(i),
-           "--broker-endpoints",
+           "--broker-endpoint",
            self.broker.client_url(),
         ]
         log.info(f'Running command "{" ".join(cmd)}"')
@@ -883,9 +889,12 @@ class SafekeeperEnv:
             raise Exception(f"Failed to start safekeeper as {cmd}, reason: {e}")
as {cmd}, reason: {e}") def get_safekeeper_connstrs(self): + assert self.safekeepers is not None, "safekeepers are not initialized" return ",".join([sk_proc.args[2] for sk_proc in self.safekeepers]) def create_postgres(self): + assert self.tenant_id is not None, "tenant_id is not initialized" + assert self.timeline_id is not None, "tenant_id is not initialized" pgdata_dir = os.path.join(self.repo_dir, "proposer_pgdata") pg = ProposerPostgres( pgdata_dir, diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index aaaa8893a5..24045e2eb7 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -65,7 +65,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # Verify that the table is larger than shared_buffers cur.execute( """ - select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize + select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' """ ) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index da50d99db5..c22aea6714 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit da50d99db54848f7a3e910f920aaad7dc6915d36 +Subproject commit c22aea67149a2fe71cab881be7a31fba305ddc21 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 780c3f8e35..114da43a49 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 780c3f8e3524c2e32a2e28884c7b647fcebf71d7 +Subproject commit 114da43a4967c068c958dacd6dedf65053c99148 diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 31e4426ac2..6c81756fe1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ publish = false ahash = { version = "0.7", features = ["std"] } anyhow = { version = "1", features = ["backtrace", "std"] } bytes = { version = "1", features = ["serde", "std"] } +chrono = { version = "0.4", default-features = false, features = ["clock", "iana-time-zone", "serde", "std", "winapi"] } clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] } either = { version = "1", features = ["use_std"] } @@ -32,21 +33,22 @@ nom = { version = "7", features = ["alloc", "std"] } num-bigint = { version = "0.4", features = ["std"] } num-integer = { version = "0.1", features = ["i128", "std"] } num-traits = { version = "0.2", features = ["i128", "libm", "std"] } -prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } -prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] } +prost = { version = "0.11", features = ["prost-derive", "std"] } rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", 
"unicode-segment"] } -reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] } +reqwest = { version = "0.11", features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] } scopeguard = { version = "1", features = ["use_std"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] } +serde_json = { version = "1", features = ["raw_value", "std"] } +socket2 = { version = "0.4", default-features = false, features = ["all"] } stable_deref_trait = { version = "1", features = ["alloc", "std"] } -time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] } tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] } tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] } tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] } tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] } tracing-core = { version = "0.1", features = ["once_cell", "std"] } +url = { version = "2", features = ["serde"] } [build-dependencies] ahash = { version = "0.7", features = ["std"] } @@ -59,8 +61,7 @@ libc = { version = "0.2", features = ["extra_traits", "std"] } log = { version = "0.4", default-features = false, features = ["serde", "std"] } memchr = { version = "2", features = ["std"] } nom = { version = "7", features = ["alloc", "std"] } -prost-93f6ce9d446188ac = { package = "prost", version = "0.10", features = ["prost-derive", "std"] } -prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["prost-derive", "std"] } +prost = { version = "0.11", features = ["prost-derive", "std"] } regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] } serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }